3 files changed, 156 insertions, 122 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 757be77a6c..b6f1323f21 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -79,6 +79,27 @@ void cgroup_free_mounts(CGMounts *mounts) {
 
 /* cgroup-util ******************************************************/
 
+/* Similar to cg_pid_get_path_internal(), but takes a full comma-separated list of cgroup-v1 mount options instead
+ * of one controller name (NULL opts: cgroup-v2, helper result passed through). v1: 0, -ENOMEM, or -EBADMSG. */
+static int cgfile_get_cgroup(FILE *cgfile, const char *opts, char **ret_cgroup) {
+        const char *scontroller, *state;
+        size_t controller_len;
+        if (!opts) { /* cgroup-v2 */
+                rewind(cgfile);
+                return cg_pid_get_path_internal(NULL, cgfile, ret_cgroup);
+        }
+        /* cgroup-v1: try each option as a controller name until cg_pid_get_path_internal() succeeds */
+        FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) {
+                _cleanup_free_ char *controller = strndup(scontroller, controller_len);
+                if (!controller)
+                        return -ENOMEM;
+                rewind(cgfile); /* the stream is reused, so restart from the top for every attempt */
+                if (cg_pid_get_path_internal(controller, cgfile, ret_cgroup) == 0)
+                        return 0;
+        }
+        return -EBADMSG; /* no option resolved; do not rely on the caller having pre-set *ret_cgroup to NULL */
+}
+
 static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         _cleanup_close_ int fd = -1;
         const char *fn;
@@ -519,8 +540,9 @@ int cgroup_decide_mounts(
 
 static int cgroup_mount_cg(
                 const char *mountpoint, const char *opts, CGMountType fstype,
-                bool use_cgns, bool use_userns) {
+                FILE *cgfile, bool use_userns) {
 
+        const bool use_cgns = cgfile == NULL;
         /* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user
          * to not have access to the things that would make us want to mount it RO.  Otherwise, we only give the
          * container RW access to its unified or name=systemd cgroup. */
@@ -528,19 +550,38 @@ static int cgroup_mount_cg(
 
         int r;
 
-        /* The superblock mount options of the mount point need to be
-         * identical to the hosts', and hence writable... */
         r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
-                          MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+                          MS_NOSUID|MS_NOEXEC|MS_NODEV| ((!rw||!use_cgns) ? MS_RDONLY : 0), opts);
         if (r < 0)
                 return r;
 
-        /* ... hence let's only make the bind mount read-only, not the superblock. */
-        if (!rw) {
-                r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
-                                  MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+        if (rw && !use_cgns) {
+                /* Emulate cgns: the hierarchy was mounted RO above, so bind-mount just our own subcgroup RW from a
+                 * temporary RW mount of the same hierarchy, then drop the temporary mount again. */
+                const char *rwmountpoint = strjoina(mountpoint, "."), *bind_src, *bind_dst;
+                _cleanup_free_ char *cgroup = NULL;
+                r = cgfile_get_cgroup(cgfile, fstype == CGMOUNT_CGROUP2 ? NULL : opts, &cgroup);
+                if (r < 0) {
+                        if (fstype == CGMOUNT_CGROUP2)
+                                return log_error_errno(r, "Failed to get child's cgroup v2 path: %m");
+                        else
+                                return log_error_errno(r, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy: %m", mountpoint);
+                }
+                bind_src = strjoina(rwmountpoint, cgroup); /* strjoina() is alloca()-based: keep it out of call arguments */
+                bind_dst = strjoina(mountpoint, cgroup);
+                (void) mkdir(rwmountpoint, 0755);
+                r = mount_verbose(LOG_ERR, "cgroup", rwmountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+                if (r < 0)
+                        return r;
+                r = mount_verbose(LOG_ERR, bind_src, bind_dst, NULL, MS_BIND, NULL);
+                if (r < 0)
+                        return r;
+                r = umount_verbose(rwmountpoint);
                 if (r < 0)
                         return r;
+                if (rmdir(rwmountpoint) < 0) /* rmdir() returns -1 and reports the cause in errno */
+                        return log_error_errno(errno, "Failed to rmdir temporary mountpoint %s: %m", rwmountpoint);
         }
 
         return 0;
@@ -548,10 +589,11 @@ static int cgroup_mount_cg(
 
 int cgroup_mount_mounts(
                 CGMounts m,
-                bool use_cgns,
+                FILE *cgfile,
                 uid_t uid_shift,
                 const char *selinux_apifs_context) {
 
+        const bool use_cgns = cgfile == NULL;
         const bool use_userns = uid_shift != UID_INVALID;
         const char *cgroup_root = "/sys/fs/cgroup";
 
@@ -604,7 +646,7 @@ int cgroup_mount_mounts(
                                 return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst);
                         }
                         (void) mkdir_p(dst, 0755);
-                        r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, use_cgns, use_userns);
+                        r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns);
                         if (r < 0)
                                 return r;
                         break;
@@ -621,63 +663,3 @@ int cgroup_mount_mounts(
 
         return 0;
 }
-
-/* mount_cgroups, mount_systemd_cgroup_writable *********************/
-
-static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
-        int r;
-
-        assert(root);
-        assert(own);
-
-        /* Make our own cgroup a (writable) bind mount */
-        r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
-        if (r < 0)
-                return r;
-
-        /* And then remount the systemd cgroup root read-only */
-        return mount_verbose(LOG_ERR, NULL, root, NULL,
-                             MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
-}
-
-int mount_systemd_cgroup_writable(
-                const char *dest,
-                CGroupUnified inner_cgver) {
-
-        _cleanup_free_ char *own_cgroup_path = NULL;
-        const char *root, *own;
-        int r;
-
-        assert(dest);
-
-        r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine our own cgroup path: %m");
-
-        /* If we are living in the top-level, then there's nothing to do... */
-        if (path_equal(own_cgroup_path, "/"))
-                return 0;
-
-        switch (inner_cgver) {
-        default:
-        case CGROUP_UNIFIED_UNKNOWN:
-                assert_not_reached("unknown inner_cgver");
-        case CGROUP_UNIFIED_ALL:
-                root = prefix_roota(dest, "/sys/fs/cgroup");
-                own = strjoina(root, own_cgroup_path);
-                break;
-        case CGROUP_UNIFIED_SYSTEMD233:
-        case CGROUP_UNIFIED_SYSTEMD232:
-                root = prefix_roota(dest, "/sys/fs/cgroup/unified");
-                own = strjoina(root, own_cgroup_path);
-                r = mount_systemd_cgroup_writable_one(root, own);
-                if (r < 0)
-                        return r;
-                _fallthrough_;
-        case CGROUP_UNIFIED_NONE:
-                root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
-                own = strjoina(root, own_cgroup_path);
-        }
-
-        return mount_systemd_cgroup_writable_one(root, own);
-}
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index b35ef666fc..e6cfa1be07 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -14,7 +14,5 @@ typedef struct CGMounts {
 
 int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift, bool keep_unit);
 int cgroup_decide_mounts(CGMounts *ret_mounts, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool use_cgns);
-int cgroup_mount_mounts(CGMounts mounts, bool use_cgns, uid_t uid_shift, const char *selinux_apifs_context);
+int cgroup_mount_mounts(CGMounts mounts, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context);
 void cgroup_free_mounts(CGMounts *mounts);
-
-int mount_systemd_cgroup_writable(const char *dest, CGroupUnified inner_cgver);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index f1afa3a9ca..f64faee25b 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -2567,15 +2567,15 @@ static int inner_child(
         assert(directory);
         assert(kmsg_socket >= 0);
 
-        if (arg_userns_mode != USER_NAMESPACE_NO) {
-                /* Tell the parent, that it now can write the UID map. */
-                (void) barrier_place(barrier); /* #1 */
+        /* Tell the parent that it can now configure our process; write
+         * the UID map (if use_userns), place us in the correct cgroup (if
+         * use_cgns), etc. */
+        (void) barrier_place(barrier); /* #1 */
 
-                /* Wait until the parent wrote the UID map */
-                if (!barrier_place_and_sync(barrier)) { /* #2 */
-                        log_error("Parent died too early");
-                        return -ESRCH;
-                }
+        /* Wait until the parent says that we are fully configured. */
+        if (!barrier_place_and_sync(barrier)) { /* #2 */
+                log_error("Parent died too early");
+                return -ESRCH;
         }
 
         r = reset_uid_gid();
@@ -2602,28 +2602,17 @@ static int inner_child(
         if (r < 0)
                 return r;
 
-        /* Wait until we are cgroup-ified, so that we
-         * can mount the right cgroup path writable */
-        if (!barrier_place_and_sync(barrier)) { /* #4 */
-                log_error("Parent died too early");
-                return -ESRCH;
-        }
-
         if (arg_use_cgns) {
                 r = unshare(CLONE_NEWCGROUP);
                 if (r < 0)
                         return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
                 r = cgroup_mount_mounts(cgmounts,
-                                        arg_use_cgns,
+                                        NULL,
                                         arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : 0,
                                         arg_selinux_apifs_context);
                 cgroup_free_mounts(&cgmounts);
                 if (r < 0)
                         return r;
-        } else {
-                r = mount_systemd_cgroup_writable("", arg_inner_cgver);
-                if (r < 0)
-                        return r;
         }
 
         r = setup_boot_id();
@@ -2724,7 +2713,7 @@ static int inner_child(
         /* Let the parent know that we are ready and
          * wait until the parent is ready with the
          * setup, too... */
-        if (!barrier_place_and_sync(barrier)) { /* #5 */
+        if (!barrier_place_and_sync(barrier)) { /* #4 */
                 log_error("Parent died too early");
                 return -ESRCH;
         }
@@ -2836,6 +2825,7 @@ static int outer_child(
                 int rtnl_socket,
                 int uid_shift_socket,
                 int inner_cgver_socket,
+                int cgroup_fd_socket,
                 FDSet *fds,
                 int netns_fd,
                 CGroupUnified outer_cgver) {
@@ -2853,6 +2843,7 @@ static int outer_child(
         assert(uuid_socket >= 0);
         assert(notify_socket >= 0);
         assert(kmsg_socket >= 0);
+        assert(cgroup_fd_socket >= 0); /* an fd: 0 is valid and -1 is truthy, so test the sign like the asserts above */
 
         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
                 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
@@ -3078,15 +3069,6 @@ static int outer_child(
         if (r < 0)
                 return r;
 
-        if (!arg_use_cgns) {
-                r = cgroup_mount_mounts(cgmounts,
-                                        arg_use_cgns,
-                                        arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift,
-                                        arg_selinux_apifs_context);
-                if (r < 0)
-                        return r;
-        }
-
         r = mount_move_root(directory);
         if (r < 0)
                 return log_error_errno(r, "Failed to move root directory: %m");
@@ -3109,6 +3091,7 @@ static int outer_child(
                 uuid_socket = safe_close(uuid_socket);
                 notify_socket = safe_close(notify_socket);
                 uid_shift_socket = safe_close(uid_shift_socket);
+                cgroup_fd_socket = safe_close(cgroup_fd_socket);
 
                 /* The inner child has all namespaces that are
                  * requested, so that we all are owned by the user if
@@ -3147,12 +3130,50 @@ static int outer_child(
         if (l < 0)
                 return log_error_errno(errno, "Failed to send notify fd: %m");
 
+        /* If !use_cgns, then we need to do this here because without cgns cgroups can't be mounted inside of a
+         * less privileged mountns (and using userns causes the mountns to be less privileged). */
+        if (!arg_use_cgns) {
+                /* If !use_cgns, then cgroup_mount_mounts() needs to look at /proc/pid/cgroup; but because we've
+                 * already chroot()ed (mount_move_root()), we don't have access to /proc.  So the parent opens the file
+                 * for us and then sends it to us. */
+                int cgfd;
+                _cleanup_fclose_ FILE *cgfile = NULL;
+
+                cgfd = receive_one_fd(cgroup_fd_socket, 0);
+                if (cgfd < 0)
+                        return log_error_errno(cgfd, "Failed to recv cgroup fd: %m");
+
+                cgfile = fdopen(cgfd, "re");
+                if (!cgfile) {
+                        r = -errno; /* remember why fdopen() failed before cleaning up */
+                        cgfd = safe_close(cgfd); /* no stream took ownership of the fd, so close it ourselves */
+                        return log_error_errno(r, "Failed to create a stream object for cgroup fd: %m");
+                }
+
+                r = cgroup_mount_mounts(cgmounts,
+                                        cgfile,
+                                        arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift,
+                                        arg_selinux_apifs_context);
+                if (r < 0)
+                        return r;
+        } else {
+                /* We're not doing anything else, but wait for a bit of data anyway; as a synchronization point, to
+                 * make sure that our SIGCHLD doesn't EINTR anything important. */
+                char c;
+                l = recv(cgroup_fd_socket, &c, sizeof(c), 0);
+                if (l < 0)
+                        return log_error_errno(errno, "Failed to recv synchronization byte: %m");
+                if (l != sizeof(c)) /* recv() does not set errno on a short read, so report a fixed error */
+                        return log_error_errno(EIO, "Short read while receiving synchronization byte: %m");
+        }
+
         pid_socket = safe_close(pid_socket);
         uuid_socket = safe_close(uuid_socket);
         notify_socket = safe_close(notify_socket);
         kmsg_socket = safe_close(kmsg_socket);
         rtnl_socket = safe_close(rtnl_socket);
         netns_fd = safe_close(netns_fd);
+        cgroup_fd_socket = safe_close(cgroup_fd_socket);
 
         return 0;
 }
@@ -3654,7 +3675,8 @@ static int run(int master,
                 uuid_socket_pair[2] = { -1, -1 },
                 notify_socket_pair[2] = { -1, -1 },
                 uid_shift_socket_pair[2] = { -1, -1 },
-                inner_cgver_socket_pair[2] = { -1, -1};
+                inner_cgver_socket_pair[2] = { -1, -1},
+                cgroup_fd_socket_pair[2] = {-1, -1 };
 
         _cleanup_close_ int notify_socket= -1;
         _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
@@ -3713,6 +3735,9 @@ static int run(int master,
                 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, inner_cgver_socket_pair) < 0)
                         return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
 
+        if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, cgroup_fd_socket_pair) < 0)
+                return log_error_errno(errno, "Failed to create cgroup socket pair: %m");
+
         /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
          * parent's blocking calls and give it a chance to call wait() and terminate. */
         r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
@@ -3756,6 +3781,7 @@ static int run(int master,
                 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
                 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
                 inner_cgver_socket_pair[0] = safe_close(inner_cgver_socket_pair[0]);
+                cgroup_fd_socket_pair[0] = safe_close(cgroup_fd_socket_pair[0]);
 
                 (void) reset_all_signal_handlers();
                 (void) reset_signal_mask();
@@ -3773,6 +3799,7 @@ static int run(int master,
                                 rtnl_socket_pair[1],
                                 uid_shift_socket_pair[1],
                                 inner_cgver_socket_pair[1],
+                                cgroup_fd_socket_pair[1],
                                 fds,
                                 netns_fd,
                                 outer_cgver);
@@ -3793,6 +3820,7 @@ static int run(int master,
         notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
         uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
         inner_cgver_socket_pair[1] = safe_close(inner_cgver_socket_pair[1]);
+        cgroup_fd_socket_pair[1] = safe_close(cgroup_fd_socket_pair[1]);
 
         if (arg_userns_mode != USER_NAMESPACE_NO) {
                 /* The child just let us know the UID shift it might have read from the image. */
@@ -3834,14 +3862,6 @@ static int run(int master,
                 }
         }
 
-        /* Wait for the outer child. */
-        r = wait_for_terminate_and_check("(sd-namespace)", *helper_pid, WAIT_LOG_ABNORMAL);
-        *helper_pid = 0;
-        if (r < 0)
-                return r;
-        if (r != EXIT_SUCCESS)
-                return -EIO;
-
         /* And now retrieve the PID of the inner child. */
         l = recv(pid_socket_pair[0], main_pid, sizeof *main_pid, 0);
         if (l < 0)
@@ -3868,23 +3888,22 @@ static int run(int master,
 
         log_debug("Init process invoked as PID "PID_FMT, *main_pid);
 
-        if (arg_userns_mode != USER_NAMESPACE_NO) {
-                if (!barrier_place_and_sync(&barrier)) { /* #1 */
-                        log_error("Child died too early.");
-                        return -ESRCH;
-                }
+        /* Wait until the child gives us the OK to configure it. */
+        if (!barrier_place_and_sync(&barrier)) { /* #1 */
+                log_error("Child died too early.");
+                return -ESRCH;
+        }
 
+        if (arg_userns_mode != USER_NAMESPACE_NO) {
                 r = setup_uid_map(*main_pid);
                 if (r < 0)
                         return r;
-
-                (void) barrier_place(&barrier); /* #2 */
         }
 
         if (arg_private_network) {
                 if (!arg_network_namespace_path) {
                         /* Wait until the child has unshared its network namespace. */
-                        if (!barrier_place_and_sync(&barrier)) { /* #3 */
+                        if (!barrier_place_and_sync(&barrier)) { /* #2; NOTE(review): confirm this still waits for the netns unshare */
                                 log_error("Child died too early");
                                 return -ESRCH;
                         }
@@ -3962,6 +3981,8 @@ static int run(int master,
         }
 
         if (arg_register) {
+                /* If the child is to be placed into a different cgroup,
+                 * this is what does it. */
                 r = register_machine(
                                 bus,
                                 arg_machine,
@@ -3997,11 +4018,44 @@ static int run(int master,
         if (r < 0)
                 return r;
 
+        if (!arg_use_cgns) {
+                /* Open the inner child's cgroup file on behalf of the outer child and pass the fd over the socket;
+                 * our own copy of the fd is closed automatically when this scope is left. */
+                _cleanup_close_ int fd = -1;
+                const char *fs = procfs_file_alloca(*main_pid, "cgroup");
+
+                fd = open(fs, O_RDONLY|O_CLOEXEC);
+                if (fd < 0)
+                        return log_error_errno(errno, "Failed to open cgroups of child: %m");
+                r = send_one_fd(cgroup_fd_socket_pair[0], fd, 0);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to send cgroup fd: %m");
+        } else {
+                /* we don't have any data to send, but sending something here is important because it acts as a
+                 * synchronization point.  Otherwise the outer child could exit earlier, and the resulting SIGCHLD
+                 * could interrupt something like register_machine() above.  We could use a barrier for this, but we
+                 * have a perfectly good socket here already. */
+                char c = '\0';
+                l = send(cgroup_fd_socket_pair[0], &c, sizeof(c), MSG_NOSIGNAL);
+                if (l < 0)
+                        return log_error_errno(errno, "Failed to send synchronization byte: %m");
+                if (l != sizeof(c)) /* send() does not set errno on a short write, so report a fixed error */
+                        return log_error_errno(EIO, "Short write while sending synchronization byte: %m");
+        }
+
+        /* Wait for the outer child. */
+        r = wait_for_terminate_and_check("(sd-namespace)", *helper_pid, WAIT_LOG_ABNORMAL);
+        *helper_pid = 0;
+        if (r < 0)
+                return r;
+        if (r != EXIT_SUCCESS)
+                return -EIO;
+
         /* Notify the child that the parent is ready with all
          * its setup (including cgroup-ification), and that
          * the child can now hand over control to the code to
          * run inside the container. */
-        (void) barrier_place(&barrier); /* #4 */
+        (void) barrier_place(&barrier); /* #3 */
 
         /* Block SIGCHLD here, before notifying child.
          * process_pty() will handle it with the other signals. */
@@ -4029,7 +4083,7 @@ static int run(int master,
                 return r;
 
         /* Let the child know that we are ready and wait that the child is completely ready now. */
-        if (!barrier_place_and_sync(&barrier)) { /* #5 */
+        if (!barrier_place_and_sync(&barrier)) { /* #4 */
                 log_error("Child died too early.");
                 return -ESRCH;
         }