summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/nspawn/nspawn-cgroup.c122
-rw-r--r--src/nspawn/nspawn-cgroup.h4
-rw-r--r--src/nspawn/nspawn.c152
3 files changed, 156 insertions, 122 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 757be77a6c..b6f1323f21 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -79,6 +79,27 @@ void cgroup_free_mounts(CGMounts *mounts) {
/* cgroup-util ******************************************************/
+/* Similar to cg_pid_get_path_internal, but take a full list of mount options, rather than a single controller name. */
+static int cgfile_get_cgroup(FILE *cgfile, const char *opts, char **ret_cgroup) {
+
+ if (!opts) { /* cgroup-v2 */
+ rewind(cgfile);
+ return cg_pid_get_path_internal(NULL, cgfile, ret_cgroup);
+ } else { /* cgroup-v1 */
+ const char *scontroller, *state;
+ size_t controller_len;
+ FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) {
+ _cleanup_free_ const char *controller = strndup(scontroller, controller_len);
+ rewind(cgfile);
+ if (cg_pid_get_path_internal(controller, cgfile, ret_cgroup) == 0)
+ break;
+ }
+ if (!*ret_cgroup)
+ return -EBADMSG;
+ return 0;
+ }
+}
+
static int chown_cgroup_path(const char *path, uid_t uid_shift) {
_cleanup_close_ int fd = -1;
const char *fn;
@@ -519,8 +540,9 @@ int cgroup_decide_mounts(
static int cgroup_mount_cg(
const char *mountpoint, const char *opts, CGMountType fstype,
- bool use_cgns, bool use_userns) {
+ FILE *cgfile, bool use_userns) {
+ const bool use_cgns = cgfile == NULL;
/* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user
* to not have access to the things that would make us want to mount it RO. Otherwise, we only give the
* container RW access to its unified or name=systemd cgroup. */
@@ -528,19 +550,38 @@ static int cgroup_mount_cg(
int r;
- /* The superblock mount options of the mount point need to be
- * identical to the hosts', and hence writable... */
r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
- MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+ MS_NOSUID|MS_NOEXEC|MS_NODEV| ((!rw||!use_cgns) ? MS_RDONLY : 0), opts);
if (r < 0)
return r;
- /* ... hence let's only make the bind mount read-only, not the superblock. */
- if (!rw) {
- r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
- MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ if (rw && !use_cgns) {
+ /* emulate cgns by mounting everything but our subcgroup RO */
+ const char *rwmountpoint = strjoina(mountpoint, ".");
+
+ _cleanup_free_ char *cgroup = NULL;
+ r = cgfile_get_cgroup(cgfile, fstype == CGMOUNT_CGROUP2 ? NULL : opts, &cgroup);
+ if (r < 0) {
+ if (fstype == CGMOUNT_CGROUP2)
+ return log_error_errno(r, "Failed to get child's cgroup v2 path");
+ else
+ return log_error_errno(r, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint);
+ }
+
+ (void) mkdir(rwmountpoint, 0755);
+ r = mount_verbose(LOG_ERR, "cgroup", rwmountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+ if (r < 0)
+ return r;
+ r = mount_verbose(LOG_ERR, strjoina(rwmountpoint, cgroup), strjoina(mountpoint, cgroup), NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
+ r = umount_verbose(rwmountpoint);
if (r < 0)
return r;
+ r = rmdir(rwmountpoint);
+ if (r < 0)
+ return log_error_errno(r, "Failed to rmdir temporary mountpoint %s: %m", rwmountpoint);
}
return 0;
@@ -548,10 +589,11 @@ static int cgroup_mount_cg(
int cgroup_mount_mounts(
CGMounts m,
- bool use_cgns,
+ FILE *cgfile,
uid_t uid_shift,
const char *selinux_apifs_context) {
+ const bool use_cgns = cgfile == NULL;
const bool use_userns = uid_shift != UID_INVALID;
const char *cgroup_root = "/sys/fs/cgroup";
@@ -604,7 +646,7 @@ int cgroup_mount_mounts(
return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst);
}
(void) mkdir_p(dst, 0755);
- r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, use_cgns, use_userns);
+ r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns);
if (r < 0)
return r;
break;
@@ -621,63 +663,3 @@ int cgroup_mount_mounts(
return 0;
}
-
-/* mount_cgroups, mount_systemd_cgroup_writable *********************/
-
-static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
- int r;
-
- assert(root);
- assert(own);
-
- /* Make our own cgroup a (writable) bind mount */
- r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
- if (r < 0)
- return r;
-
- /* And then remount the systemd cgroup root read-only */
- return mount_verbose(LOG_ERR, NULL, root, NULL,
- MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
-}
-
-int mount_systemd_cgroup_writable(
- const char *dest,
- CGroupUnified inner_cgver) {
-
- _cleanup_free_ char *own_cgroup_path = NULL;
- const char *root, *own;
- int r;
-
- assert(dest);
-
- r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
- if (r < 0)
- return log_error_errno(r, "Failed to determine our own cgroup path: %m");
-
- /* If we are living in the top-level, then there's nothing to do... */
- if (path_equal(own_cgroup_path, "/"))
- return 0;
-
- switch (inner_cgver) {
- default:
- case CGROUP_UNIFIED_UNKNOWN:
- assert_not_reached("unknown inner_cgver");
- case CGROUP_UNIFIED_ALL:
- root = prefix_roota(dest, "/sys/fs/cgroup");
- own = strjoina(root, own_cgroup_path);
- break;
- case CGROUP_UNIFIED_SYSTEMD233:
- case CGROUP_UNIFIED_SYSTEMD232:
- root = prefix_roota(dest, "/sys/fs/cgroup/unified");
- own = strjoina(root, own_cgroup_path);
- r = mount_systemd_cgroup_writable_one(root, own);
- if (r < 0)
- return r;
- _fallthrough_;
- case CGROUP_UNIFIED_NONE:
- root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
- own = strjoina(root, own_cgroup_path);
- }
-
- return mount_systemd_cgroup_writable_one(root, own);
-}
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index b35ef666fc..e6cfa1be07 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -14,7 +14,5 @@ typedef struct CGMounts {
int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift, bool keep_unit);
int cgroup_decide_mounts(CGMounts *ret_mounts, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool use_cgns);
-int cgroup_mount_mounts(CGMounts mounts, bool use_cgns, uid_t uid_shift, const char *selinux_apifs_context);
+int cgroup_mount_mounts(CGMounts mounts, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context);
void cgroup_free_mounts(CGMounts *mounts);
-
-int mount_systemd_cgroup_writable(const char *dest, CGroupUnified inner_cgver);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index f1afa3a9ca..f64faee25b 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -2567,15 +2567,15 @@ static int inner_child(
assert(directory);
assert(kmsg_socket >= 0);
- if (arg_userns_mode != USER_NAMESPACE_NO) {
- /* Tell the parent, that it now can write the UID map. */
- (void) barrier_place(barrier); /* #1 */
+ /* Tell the parent that it can now configure our process; write
+ * the UID map (if use_userns), place us in the correct cgroup (if
+ * use_cgns), et c. */
+ (void) barrier_place(barrier); /* #1 */
- /* Wait until the parent wrote the UID map */
- if (!barrier_place_and_sync(barrier)) { /* #2 */
- log_error("Parent died too early");
- return -ESRCH;
- }
+ /* Wait until the parent says that we are fully configured. */
+ if (!barrier_place_and_sync(barrier)) { /* #2 */
+ log_error("Parent died too early");
+ return -ESRCH;
}
r = reset_uid_gid();
@@ -2602,28 +2602,17 @@ static int inner_child(
if (r < 0)
return r;
- /* Wait until we are cgroup-ified, so that we
- * can mount the right cgroup path writable */
- if (!barrier_place_and_sync(barrier)) { /* #4 */
- log_error("Parent died too early");
- return -ESRCH;
- }
-
if (arg_use_cgns) {
r = unshare(CLONE_NEWCGROUP);
if (r < 0)
return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
r = cgroup_mount_mounts(cgmounts,
- arg_use_cgns,
+ NULL,
arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : 0,
arg_selinux_apifs_context);
cgroup_free_mounts(&cgmounts);
if (r < 0)
return r;
- } else {
- r = mount_systemd_cgroup_writable("", arg_inner_cgver);
- if (r < 0)
- return r;
}
r = setup_boot_id();
@@ -2724,7 +2713,7 @@ static int inner_child(
/* Let the parent know that we are ready and
* wait until the parent is ready with the
* setup, too... */
- if (!barrier_place_and_sync(barrier)) { /* #5 */
+ if (!barrier_place_and_sync(barrier)) { /* #4 */
log_error("Parent died too early");
return -ESRCH;
}
@@ -2836,6 +2825,7 @@ static int outer_child(
int rtnl_socket,
int uid_shift_socket,
int inner_cgver_socket,
+ int cgroup_fd_socket,
FDSet *fds,
int netns_fd,
CGroupUnified outer_cgver) {
@@ -2853,6 +2843,7 @@ static int outer_child(
assert(uuid_socket >= 0);
assert(notify_socket >= 0);
assert(kmsg_socket >= 0);
+ assert(cgroup_fd_socket);
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
@@ -3078,15 +3069,6 @@ static int outer_child(
if (r < 0)
return r;
- if (!arg_use_cgns) {
- r = cgroup_mount_mounts(cgmounts,
- arg_use_cgns,
- arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift,
- arg_selinux_apifs_context);
- if (r < 0)
- return r;
- }
-
r = mount_move_root(directory);
if (r < 0)
return log_error_errno(r, "Failed to move root directory: %m");
@@ -3109,6 +3091,7 @@ static int outer_child(
uuid_socket = safe_close(uuid_socket);
notify_socket = safe_close(notify_socket);
uid_shift_socket = safe_close(uid_shift_socket);
+ cgroup_fd_socket = safe_close(cgroup_fd_socket);
/* The inner child has all namespaces that are
* requested, so that we all are owned by the user if
@@ -3147,12 +3130,50 @@ static int outer_child(
if (l < 0)
return log_error_errno(errno, "Failed to send notify fd: %m");
+ /* If !use_cgns, then we need to do this here because without cgns cgroups can't be mounted inside of a
+ * less privileged mountns (and using userns causes the mountns to be less privileged). */
+ if (!arg_use_cgns) {
+ /* If !use_cgns, then cgroup_mount_mounts() needs to look at /proc/pid/cgroup; but because we've
+ * already chroot()ed (mount_move_root()), we don't have access to /proc. So the parent opens the file
+ * for us and then sends it to us. */
+ int cgfd;
+ _cleanup_fclose_ FILE *cgfile = NULL;
+
+ cgfd = receive_one_fd(cgroup_fd_socket, 0);
+ if (cgfd < 0)
+ return log_error_errno(cgfd, "Failed to recv cgroup fd: %m");
+
+ cgfile = fdopen(cgfd, "re");
+ if (!cgfile) {
+ r = -errno; /* in case safe_close() sets errno */
+ cgfd = safe_close(cgfd);
+ return log_error_errno(r, "Failed to create a stream object for cgroup fd: %m");
+ }
+
+ r = cgroup_mount_mounts(cgmounts,
+ cgfile,
+ arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift,
+ arg_selinux_apifs_context);
+ if (r < 0)
+ return r;
+ } else {
+ /* We're not doing anything else, but wait for a bit of data anyway; as a synchronization point, to
+ * make sure that our SIGCHLD doesn't EINTR anything important. */
+ char c;
+ l = recv(cgroup_fd_socket, &c, sizeof(c), 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to recv synchronization byte: %m");
+ if (l != sizeof(c))
+ return log_error_errno(errno, "Short read while receiving synchronization byte: %m");
+ }
+
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
notify_socket = safe_close(notify_socket);
kmsg_socket = safe_close(kmsg_socket);
rtnl_socket = safe_close(rtnl_socket);
netns_fd = safe_close(netns_fd);
+ cgroup_fd_socket = safe_close(cgroup_fd_socket);
return 0;
}
@@ -3654,7 +3675,8 @@ static int run(int master,
uuid_socket_pair[2] = { -1, -1 },
notify_socket_pair[2] = { -1, -1 },
uid_shift_socket_pair[2] = { -1, -1 },
- inner_cgver_socket_pair[2] = { -1, -1};
+ inner_cgver_socket_pair[2] = { -1, -1},
+ cgroup_fd_socket_pair[2] = {-1, -1 };
_cleanup_close_ int notify_socket= -1;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
@@ -3713,6 +3735,9 @@ static int run(int master,
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, inner_cgver_socket_pair) < 0)
return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, cgroup_fd_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create cgroup socket pair: %m");
+
/* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
* parent's blocking calls and give it a chance to call wait() and terminate. */
r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
@@ -3756,6 +3781,7 @@ static int run(int master,
notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
inner_cgver_socket_pair[0] = safe_close(inner_cgver_socket_pair[0]);
+ cgroup_fd_socket_pair[0] = safe_close(cgroup_fd_socket_pair[0]);
(void) reset_all_signal_handlers();
(void) reset_signal_mask();
@@ -3773,6 +3799,7 @@ static int run(int master,
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
inner_cgver_socket_pair[1],
+ cgroup_fd_socket_pair[1],
fds,
netns_fd,
outer_cgver);
@@ -3793,6 +3820,7 @@ static int run(int master,
notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
inner_cgver_socket_pair[1] = safe_close(inner_cgver_socket_pair[1]);
+ cgroup_fd_socket_pair[1] = safe_close(cgroup_fd_socket_pair[1]);
if (arg_userns_mode != USER_NAMESPACE_NO) {
/* The child just let us know the UID shift it might have read from the image. */
@@ -3834,14 +3862,6 @@ static int run(int master,
}
}
- /* Wait for the outer child. */
- r = wait_for_terminate_and_check("(sd-namespace)", *helper_pid, WAIT_LOG_ABNORMAL);
- *helper_pid = 0;
- if (r < 0)
- return r;
- if (r != EXIT_SUCCESS)
- return -EIO;
-
/* And now retrieve the PID of the inner child. */
l = recv(pid_socket_pair[0], main_pid, sizeof *main_pid, 0);
if (l < 0)
@@ -3868,23 +3888,22 @@ static int run(int master,
log_debug("Init process invoked as PID "PID_FMT, *main_pid);
- if (arg_userns_mode != USER_NAMESPACE_NO) {
- if (!barrier_place_and_sync(&barrier)) { /* #1 */
- log_error("Child died too early.");
- return -ESRCH;
- }
+ /* Wait until the child gives us the OK to configure it. */
+ if (!barrier_place_and_sync(&barrier)) { /* #1 */
+ log_error("Child died too early.");
+ return -ESRCH;
+ }
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
r = setup_uid_map(*main_pid);
if (r < 0)
return r;
-
- (void) barrier_place(&barrier); /* #2 */
}
if (arg_private_network) {
if (!arg_network_namespace_path) {
/* Wait until the child has unshared its network namespace. */
- if (!barrier_place_and_sync(&barrier)) { /* #3 */
+ if (!barrier_place_and_sync(&barrier)) { /* #2*/
log_error("Child died too early");
return -ESRCH;
}
@@ -3962,6 +3981,8 @@ static int run(int master,
}
if (arg_register) {
+ /* If the child is to be placed into a different cgroup,
+ * this is what does it. */
r = register_machine(
bus,
arg_machine,
@@ -3997,11 +4018,44 @@ static int run(int master,
if (r < 0)
return r;
+ if (!arg_use_cgns) {
+ const char *fs;
+ _cleanup_close_ int fd;
+
+ fs = procfs_file_alloca(*main_pid, "cgroup");
+ fd = open(fs, O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to open cgroups of child: %m");
+
+ r = send_one_fd(cgroup_fd_socket_pair[0], fd, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to send cgroup fd: %m");
+ } else {
+ /* we don't have any data to send, but sending something here is important because it acts as a
+ * synchronization point. Otherwise the outer child could exit earlier, and the resulting SIGCHLD
+ * could interrupt something like register_machine() above. We could use a barrier for this, but we
+ * have a perfectly good socket here already. */
+ char c = '\0';
+ l = send(cgroup_fd_socket_pair[0], &c, sizeof(c), MSG_NOSIGNAL);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send synchronization byte: %m");
+ if (l != sizeof(c))
+ return log_error_errno(errno, "Short write while sending synchronization byte: %m");
+ }
+
+ /* Wait for the outer child. */
+ r = wait_for_terminate_and_check("(sd-namespace)", *helper_pid, WAIT_LOG_ABNORMAL);
+ *helper_pid = 0;
+ if (r < 0)
+ return r;
+ if (r != EXIT_SUCCESS)
+ return -EIO;
+
/* Notify the child that the parent is ready with all
* its setup (including cgroup-ification), and that
* the child can now hand over control to the code to
* run inside the container. */
- (void) barrier_place(&barrier); /* #4 */
+ (void) barrier_place(&barrier); /* #3 */
/* Block SIGCHLD here, before notifying child.
* process_pty() will handle it with the other signals. */
@@ -4029,7 +4083,7 @@ static int run(int master,
return r;
/* Let the child know that we are ready and wait that the child is completely ready now. */
- if (!barrier_place_and_sync(&barrier)) { /* #5 */
+ if (!barrier_place_and_sync(&barrier)) { /* #4 */
log_error("Child died too early.");
return -ESRCH;
}