diff options
-rw-r--r-- | src/nspawn/nspawn-cgroup.c | 122 | ||||
-rw-r--r-- | src/nspawn/nspawn-cgroup.h | 4 | ||||
-rw-r--r-- | src/nspawn/nspawn.c | 152 |
3 files changed, 156 insertions, 122 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index 757be77a6c..b6f1323f21 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -79,6 +79,27 @@ void cgroup_free_mounts(CGMounts *mounts) { /* cgroup-util ******************************************************/ +/* Similar to cg_pid_get_path_internal, but take a full list of mount options, rather than a single controller name. */ +static int cgfile_get_cgroup(FILE *cgfile, const char *opts, char **ret_cgroup) { + + if (!opts) { /* cgroup-v2 */ + rewind(cgfile); + return cg_pid_get_path_internal(NULL, cgfile, ret_cgroup); + } else { /* cgroup-v1 */ + const char *scontroller, *state; + size_t controller_len; + FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) { + _cleanup_free_ const char *controller = strndup(scontroller, controller_len); + rewind(cgfile); + if (cg_pid_get_path_internal(controller, cgfile, ret_cgroup) == 0) + break; + } + if (!*ret_cgroup) + return -EBADMSG; + return 0; + } +} + static int chown_cgroup_path(const char *path, uid_t uid_shift) { _cleanup_close_ int fd = -1; const char *fn; @@ -519,8 +540,9 @@ int cgroup_decide_mounts( static int cgroup_mount_cg( const char *mountpoint, const char *opts, CGMountType fstype, - bool use_cgns, bool use_userns) { + FILE *cgfile, bool use_userns) { + const bool use_cgns = cgfile == NULL; /* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user * to not have access to the things that would make us want to mount it RO. Otherwise, we only give the * container RW access to its unified or name=systemd cgroup. */ @@ -528,19 +550,38 @@ static int cgroup_mount_cg( int r; - /* The superblock mount options of the mount point need to be - * identical to the hosts', and hence writable... */ r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2", - MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); + MS_NOSUID|MS_NOEXEC|MS_NODEV| ((!rw||!use_cgns) ? MS_RDONLY : 0), opts); if (r < 0) return r; - /* ... hence let's only make the bind mount read-only, not the superblock. */ - if (!rw) { - r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL, - MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); + if (rw && !use_cgns) { + /* emulate cgns by mounting everything but our subcgroup RO */ + const char *rwmountpoint = strjoina(mountpoint, "."); + + _cleanup_free_ char *cgroup = NULL; + r = cgfile_get_cgroup(cgfile, fstype == CGMOUNT_CGROUP2 ? NULL : opts, &cgroup); + if (r < 0) { + if (fstype == CGMOUNT_CGROUP2) + return log_error_errno(r, "Failed to get child's cgroup v2 path"); + else + return log_error_errno(r, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint); + } + + (void) mkdir(rwmountpoint, 0755); + r = mount_verbose(LOG_ERR, "cgroup", rwmountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2", + MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); + if (r < 0) + return r; + r = mount_verbose(LOG_ERR, strjoina(rwmountpoint, cgroup), strjoina(mountpoint, cgroup), NULL, MS_BIND, NULL); + if (r < 0) + return r; + r = umount_verbose(rwmountpoint); if (r < 0) return r; + r = rmdir(rwmountpoint); + if (r < 0) + return log_error_errno(r, "Failed to rmdir temporary mountpoint %s: %m", rwmountpoint); } return 0; @@ -548,10 +589,11 @@ static int cgroup_mount_cg( int cgroup_mount_mounts( CGMounts m, - bool use_cgns, + FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context) { + const bool use_cgns = cgfile == NULL; const bool use_userns = uid_shift != UID_INVALID; const char *cgroup_root = "/sys/fs/cgroup"; @@ -604,7 +646,7 @@ int cgroup_mount_mounts( return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst); } (void) mkdir_p(dst, 0755); - r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, use_cgns, use_userns); + r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns); if (r < 0) return r; break; @@ -621,63 +663,3 @@ int cgroup_mount_mounts( return 0; } - -/* mount_cgroups, mount_systemd_cgroup_writable *********************/ - -static int mount_systemd_cgroup_writable_one(const char *root, const char *own) { - int r; - - assert(root); - assert(own); - - /* Make our own cgroup a (writable) bind mount */ - r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL); - if (r < 0) - return r; - - /* And then remount the systemd cgroup root read-only */ - return mount_verbose(LOG_ERR, NULL, root, NULL, - MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); -} - -int mount_systemd_cgroup_writable( - const char *dest, - CGroupUnified inner_cgver) { - - _cleanup_free_ char *own_cgroup_path = NULL; - const char *root, *own; - int r; - - assert(dest); - - r = cg_pid_get_path(NULL, 0, &own_cgroup_path); - if (r < 0) - return log_error_errno(r, "Failed to determine our own cgroup path: %m"); - - /* If we are living in the top-level, then there's nothing to do... */ - if (path_equal(own_cgroup_path, "/")) - return 0; - - switch (inner_cgver) { - default: - case CGROUP_UNIFIED_UNKNOWN: - assert_not_reached("unknown inner_cgver"); - case CGROUP_UNIFIED_ALL: - root = prefix_roota(dest, "/sys/fs/cgroup"); - own = strjoina(root, own_cgroup_path); - break; - case CGROUP_UNIFIED_SYSTEMD233: - case CGROUP_UNIFIED_SYSTEMD232: - root = prefix_roota(dest, "/sys/fs/cgroup/unified"); - own = strjoina(root, own_cgroup_path); - r = mount_systemd_cgroup_writable_one(root, own); - if (r < 0) - return r; - _fallthrough_; - case CGROUP_UNIFIED_NONE: - root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); - own = strjoina(root, own_cgroup_path); - } - - return mount_systemd_cgroup_writable_one(root, own); -} diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h index b35ef666fc..e6cfa1be07 100644 --- a/src/nspawn/nspawn-cgroup.h +++ b/src/nspawn/nspawn-cgroup.h @@ -14,7 +14,5 @@ typedef struct CGMounts { int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift, bool keep_unit); int cgroup_decide_mounts(CGMounts *ret_mounts, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool use_cgns); -int cgroup_mount_mounts(CGMounts mounts, bool use_cgns, uid_t uid_shift, const char *selinux_apifs_context); +int cgroup_mount_mounts(CGMounts mounts, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context); void cgroup_free_mounts(CGMounts *mounts); - -int mount_systemd_cgroup_writable(const char *dest, CGroupUnified inner_cgver); diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c index f1afa3a9ca..f64faee25b 100644 --- a/src/nspawn/nspawn.c +++ b/src/nspawn/nspawn.c @@ -2567,15 +2567,15 @@ static int inner_child( assert(directory); assert(kmsg_socket >= 0); - if (arg_userns_mode != USER_NAMESPACE_NO) { - /* Tell the parent, that it now can write the UID map. */ - (void) barrier_place(barrier); /* #1 */ + /* Tell the parent that it can now configure our process; write + * the UID map (if use_userns), place us in the correct cgroup (if + * use_cgns), et c. */ + (void) barrier_place(barrier); /* #1 */ - /* Wait until the parent wrote the UID map */ - if (!barrier_place_and_sync(barrier)) { /* #2 */ - log_error("Parent died too early"); - return -ESRCH; - } + /* Wait until the parent says that we are fully configured. */ + if (!barrier_place_and_sync(barrier)) { /* #2 */ + log_error("Parent died too early"); + return -ESRCH; } r = reset_uid_gid(); @@ -2602,28 +2602,17 @@ static int inner_child( if (r < 0) return r; - /* Wait until we are cgroup-ified, so that we - * can mount the right cgroup path writable */ - if (!barrier_place_and_sync(barrier)) { /* #4 */ - log_error("Parent died too early"); - return -ESRCH; - } - if (arg_use_cgns) { r = unshare(CLONE_NEWCGROUP); if (r < 0) return log_error_errno(errno, "Failed to unshare cgroup namespace: %m"); r = cgroup_mount_mounts(cgmounts, - arg_use_cgns, + NULL, arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : 0, arg_selinux_apifs_context); cgroup_free_mounts(&cgmounts); if (r < 0) return r; - } else { - r = mount_systemd_cgroup_writable("", arg_inner_cgver); - if (r < 0) - return r; } r = setup_boot_id(); @@ -2724,7 +2713,7 @@ static int inner_child( /* Let the parent know that we are ready and * wait until the parent is ready with the * setup, too... */ - if (!barrier_place_and_sync(barrier)) { /* #5 */ + if (!barrier_place_and_sync(barrier)) { /* #4 */ log_error("Parent died too early"); return -ESRCH; } @@ -2836,6 +2825,7 @@ static int outer_child( int rtnl_socket, int uid_shift_socket, int inner_cgver_socket, + int cgroup_fd_socket, FDSet *fds, int netns_fd, CGroupUnified outer_cgver) { @@ -2853,6 +2843,7 @@ static int outer_child( assert(uuid_socket >= 0); assert(notify_socket >= 0); assert(kmsg_socket >= 0); + assert(cgroup_fd_socket); if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0) return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m"); @@ -3078,15 +3069,6 @@ static int outer_child( if (r < 0) return r; - if (!arg_use_cgns) { - r = cgroup_mount_mounts(cgmounts, - arg_use_cgns, - arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift, - arg_selinux_apifs_context); - if (r < 0) - return r; - } - r = mount_move_root(directory); if (r < 0) return log_error_errno(r, "Failed to move root directory: %m"); @@ -3109,6 +3091,7 @@ static int outer_child( uuid_socket = safe_close(uuid_socket); notify_socket = safe_close(notify_socket); uid_shift_socket = safe_close(uid_shift_socket); + cgroup_fd_socket = safe_close(cgroup_fd_socket); /* The inner child has all namespaces that are * requested, so that we all are owned by the user if @@ -3147,12 +3130,50 @@ static int outer_child( if (l < 0) return log_error_errno(errno, "Failed to send notify fd: %m"); + /* If !use_cgns, then we need to do this here because without cgns cgroups can't be mounted inside of a + * less privileged mountns (and using userns causes the mountns to be less privileged). */ + if (!arg_use_cgns) { + /* If !use_cgns, then cgroup_mount_mounts() needs to look at /proc/pid/cgroup; but because we've + * already chroot()ed (mount_move_root()), we don't have access to /proc. So the parent opens the file + * for us and then sends it to us. */ + int cgfd; + _cleanup_fclose_ FILE *cgfile = NULL; + + cgfd = receive_one_fd(cgroup_fd_socket, 0); + if (cgfd < 0) + return log_error_errno(cgfd, "Failed to recv cgroup fd: %m"); + + cgfile = fdopen(cgfd, "re"); + if (!cgfile) { + r = -errno; /* in case safe_close() sets errno */ + cgfd = safe_close(cgfd); + return log_error_errno(r, "Failed to create a stream object for cgroup fd: %m"); + } + + r = cgroup_mount_mounts(cgmounts, + cgfile, + arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift, + arg_selinux_apifs_context); + if (r < 0) + return r; + } else { + /* We're not doing anything else, but wait for a bit of data anyway; as a synchronization point, to + * make sure that our SIGCHLD doesn't EINTR anything important. */ + char c; + l = recv(cgroup_fd_socket, &c, sizeof(c), 0); + if (l < 0) + return log_error_errno(errno, "Failed to recv synchronization byte: %m"); + if (l != sizeof(c)) + return log_error_errno(errno, "Short read while receiving synchronization byte: %m"); + } + pid_socket = safe_close(pid_socket); uuid_socket = safe_close(uuid_socket); notify_socket = safe_close(notify_socket); kmsg_socket = safe_close(kmsg_socket); rtnl_socket = safe_close(rtnl_socket); netns_fd = safe_close(netns_fd); + cgroup_fd_socket = safe_close(cgroup_fd_socket); return 0; } @@ -3654,7 +3675,8 @@ static int run(int master, uuid_socket_pair[2] = { -1, -1 }, notify_socket_pair[2] = { -1, -1 }, uid_shift_socket_pair[2] = { -1, -1 }, - inner_cgver_socket_pair[2] = { -1, -1}; + inner_cgver_socket_pair[2] = { -1, -1}, + cgroup_fd_socket_pair[2] = {-1, -1 }; _cleanup_close_ int notify_socket= -1; _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL; @@ -3713,6 +3735,9 @@ static int run(int master, if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, inner_cgver_socket_pair) < 0) return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m"); + if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, cgroup_fd_socket_pair) < 0) + return log_error_errno(errno, "Failed to create cgroup socket pair: %m"); + /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt * parent's blocking calls and give it a chance to call wait() and terminate. */ r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL); @@ -3756,6 +3781,7 @@ static int run(int master, notify_socket_pair[0] = safe_close(notify_socket_pair[0]); uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]); inner_cgver_socket_pair[0] = safe_close(inner_cgver_socket_pair[0]); + cgroup_fd_socket_pair[0] = safe_close(cgroup_fd_socket_pair[0]); (void) reset_all_signal_handlers(); (void) reset_signal_mask(); @@ -3773,6 +3799,7 @@ static int run(int master, rtnl_socket_pair[1], uid_shift_socket_pair[1], inner_cgver_socket_pair[1], + cgroup_fd_socket_pair[1], fds, netns_fd, outer_cgver); @@ -3793,6 +3820,7 @@ static int run(int master, notify_socket_pair[1] = safe_close(notify_socket_pair[1]); uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]); inner_cgver_socket_pair[1] = safe_close(inner_cgver_socket_pair[1]); + cgroup_fd_socket_pair[1] = safe_close(cgroup_fd_socket_pair[1]); if (arg_userns_mode != USER_NAMESPACE_NO) { /* The child just let us know the UID shift it might have read from the image. */ @@ -3834,14 +3862,6 @@ static int run(int master, } } - /* Wait for the outer child. */ - r = wait_for_terminate_and_check("(sd-namespace)", *helper_pid, WAIT_LOG_ABNORMAL); - *helper_pid = 0; - if (r < 0) - return r; - if (r != EXIT_SUCCESS) - return -EIO; - /* And now retrieve the PID of the inner child. */ l = recv(pid_socket_pair[0], main_pid, sizeof *main_pid, 0); if (l < 0) @@ -3868,23 +3888,22 @@ static int run(int master, log_debug("Init process invoked as PID "PID_FMT, *main_pid); - if (arg_userns_mode != USER_NAMESPACE_NO) { - if (!barrier_place_and_sync(&barrier)) { /* #1 */ - log_error("Child died too early."); - return -ESRCH; - } + /* Wait until the child gives us the OK to configure it. */ + if (!barrier_place_and_sync(&barrier)) { /* #1 */ + log_error("Child died too early."); + return -ESRCH; + } + if (arg_userns_mode != USER_NAMESPACE_NO) { r = setup_uid_map(*main_pid); if (r < 0) return r; - - (void) barrier_place(&barrier); /* #2 */ } if (arg_private_network) { if (!arg_network_namespace_path) { /* Wait until the child has unshared its network namespace. */ - if (!barrier_place_and_sync(&barrier)) { /* #3 */ + if (!barrier_place_and_sync(&barrier)) { /* #2*/ log_error("Child died too early"); return -ESRCH; } @@ -3962,6 +3981,8 @@ static int run(int master, } if (arg_register) { + /* If the child is to be placed into a different cgroup, + * this is what does it. */ r = register_machine( bus, arg_machine, @@ -3997,11 +4018,44 @@ static int run(int master, if (r < 0) return r; + if (!arg_use_cgns) { + const char *fs; + _cleanup_close_ int fd; + + fs = procfs_file_alloca(*main_pid, "cgroup"); + fd = open(fs, O_RDONLY|O_CLOEXEC); + if (fd < 0) + return log_error_errno(errno, "Failed to open cgroups of child: %m"); + + r = send_one_fd(cgroup_fd_socket_pair[0], fd, 0); + if (r < 0) + return log_error_errno(r, "Failed to send cgroup fd: %m"); + } else { + /* we don't have any data to send, but sending something here is important because it acts as a + * synchronization point. Otherwise the outer child could exit earlier, and the resulting SIGCHLD + * could interrupt something like register_machine() above. We could use a barrier for this, but we + * have a perfectly good socket here already. */ + char c = '\0'; + l = send(cgroup_fd_socket_pair[0], &c, sizeof(c), MSG_NOSIGNAL); + if (l < 0) + return log_error_errno(errno, "Failed to send synchronization byte: %m"); + if (l != sizeof(c)) + return log_error_errno(errno, "Short write while sending synchronization byte: %m"); + } + + /* Wait for the outer child. */ + r = wait_for_terminate_and_check("(sd-namespace)", *helper_pid, WAIT_LOG_ABNORMAL); + *helper_pid = 0; + if (r < 0) + return r; + if (r != EXIT_SUCCESS) + return -EIO; + /* Notify the child that the parent is ready with all * its setup (including cgroup-ification), and that * the child can now hand over control to the code to * run inside the container. */ - (void) barrier_place(&barrier); /* #4 */ + (void) barrier_place(&barrier); /* #3 */ /* Block SIGCHLD here, before notifying child. * process_pty() will handle it with the other signals. */ @@ -4029,7 +4083,7 @@ static int run(int master, return r; /* Let the child know that we are ready and wait that the child is completely ready now. */ - if (!barrier_place_and_sync(&barrier)) { /* #5 */ + if (!barrier_place_and_sync(&barrier)) { /* #4 */ log_error("Child died too early."); return -ESRCH; } |