summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2017-06-17 17:49:04 -0400
committerLuke Shumaker <lukeshu@parabola.nu>2018-08-16 21:55:17 -0400
commit81190bdbc5c98f0e1ffaad990fe520fd6ec4f203 (patch)
treeed4a7eda392ce533c0dc6fe2aef97fe5dc8f3db2
parentf62963d6e5e48412513b00af89037755793974f7 (diff)
nspawn: (Re)mount the systemd hierarchy RO in the outer child, not inner
The current situation: If !use_cgns, then we mount the systemd hierarchy RW, bind the container's subcgroup, then remount the hierarchy RO. This gives the container RW access to its subcgroup, but makes the rest of the hierarchy RO. Except that the remount happens inside the container's final mount namespace; which means that the container could just remount it RW. I know that all systemd-nspawn features are not security features, and provide protection against accidents only. But we can do better! We can't just move the remount operation to where we mount cgroups in the namespace helper, because we don't know what the container's subcgroup is yet: the inner child (the container's PID 1) hasn't yet been created; let alone moved into its final cgroup by machined. So instead, we wait until after we've raw_clone()ed the inner child, check what its cgroup is, and *then* mount the cgroups. The mounts will propagate from the helper's mount namespace to the child mount namespace. This solution presents several challenges of its own: 1. The outer child will have chroot()ed by the time it goes to look at the inner child's cgroup. So how is it going to look at /proc/${inner_child}/cgroup if it doesn't have access to /proc? We'll have the parent open the file, then pass the file descriptor to the outer child over a socket. 2. The parent will have to work with the fact that the outer and inner child coexist; before the outer child exited as soon as the inner child was clone()ed, and the parent got away with pretending that only one existed at a time. We'll have to add a "barrier" to stop the outer child from exiting at a point where the resulting SIGCHLD could cause a problem for the parent. The obvious answer might be to add a second literal barrier (as in barrier.c), but the socket mentioned above serves the purpose when !use_cgns, so go ahead and re-purpose the socket to serve as a barrier even when use_cgns. 3. MS_REMOUNT operations don't propagate between mount namespaces. Part of me thinks "that's a bug/omission in the kernel", but another part of me is saying "no, that's ridiculous, there has to be a good reason why MS_REMOUNT operations don't propagate between mount namespaces." Anyway, this means that the various MS_REMOUNT operations to make things read-only are being entirely ignored by the container's namespace. Actually, because when we remount the tmpfs we don't pass MS_BIND, the superblock is marked read-only; and since the superblock is a shared global, it effectively propagated. The mountpoint-specific flags still say "rw" in the container, but the superblock overrides them. It's a bit weird, but it works. So that just leaves the cgroup mounts. Instead of always mounting them RW, then remounting them RO if necessary, just mount them RO the first time. This is actually what it used to do a couple of years ago, but in c053458 it was changed because "Otherwise we'll generate kernel runtime warnings about non-matching mount options." Maybe that was true then, but it's not true today; today it does not generate that warning for a differing MS_RDONLY flag (though it does if the string options are different).
-rw-r--r--src/nspawn/nspawn-cgroup.c122
-rw-r--r--src/nspawn/nspawn-cgroup.h4
-rw-r--r--src/nspawn/nspawn.c152
3 files changed, 156 insertions, 122 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 757be77a6c..b6f1323f21 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -79,6 +79,27 @@ void cgroup_free_mounts(CGMounts *mounts) {
/* cgroup-util ******************************************************/
+/* Similar to cg_pid_get_path_internal, but take a full list of mount options, rather than a single controller name. */
+static int cgfile_get_cgroup(FILE *cgfile, const char *opts, char **ret_cgroup) {
+
+ if (!opts) { /* cgroup-v2 */
+ rewind(cgfile);
+ return cg_pid_get_path_internal(NULL, cgfile, ret_cgroup);
+ } else { /* cgroup-v1 */
+ const char *scontroller, *state;
+ size_t controller_len;
+ FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) {
+ _cleanup_free_ const char *controller = strndup(scontroller, controller_len);
+ rewind(cgfile);
+ if (cg_pid_get_path_internal(controller, cgfile, ret_cgroup) == 0)
+ break;
+ }
+ if (!*ret_cgroup)
+ return -EBADMSG;
+ return 0;
+ }
+}
+
static int chown_cgroup_path(const char *path, uid_t uid_shift) {
_cleanup_close_ int fd = -1;
const char *fn;
@@ -519,8 +540,9 @@ int cgroup_decide_mounts(
static int cgroup_mount_cg(
const char *mountpoint, const char *opts, CGMountType fstype,
- bool use_cgns, bool use_userns) {
+ FILE *cgfile, bool use_userns) {
+ const bool use_cgns = cgfile == NULL;
/* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user
* to not have access to the things that would make us want to mount it RO. Otherwise, we only give the
* container RW access to its unified or name=systemd cgroup. */
@@ -528,19 +550,38 @@ static int cgroup_mount_cg(
int r;
- /* The superblock mount options of the mount point need to be
- * identical to the hosts', and hence writable... */
r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
- MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+ MS_NOSUID|MS_NOEXEC|MS_NODEV| ((!rw||!use_cgns) ? MS_RDONLY : 0), opts);
if (r < 0)
return r;
- /* ... hence let's only make the bind mount read-only, not the superblock. */
- if (!rw) {
- r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
- MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+ if (rw && !use_cgns) {
+ /* emulate cgns by mounting everything but our subcgroup RO */
+ const char *rwmountpoint = strjoina(mountpoint, ".");
+
+ _cleanup_free_ char *cgroup = NULL;
+ r = cgfile_get_cgroup(cgfile, fstype == CGMOUNT_CGROUP2 ? NULL : opts, &cgroup);
+ if (r < 0) {
+ if (fstype == CGMOUNT_CGROUP2)
+ return log_error_errno(r, "Failed to get child's cgroup v2 path");
+ else
+ return log_error_errno(r, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint);
+ }
+
+ (void) mkdir(rwmountpoint, 0755);
+ r = mount_verbose(LOG_ERR, "cgroup", rwmountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
+ MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+ if (r < 0)
+ return r;
+ r = mount_verbose(LOG_ERR, strjoina(rwmountpoint, cgroup), strjoina(mountpoint, cgroup), NULL, MS_BIND, NULL);
+ if (r < 0)
+ return r;
+ r = umount_verbose(rwmountpoint);
if (r < 0)
return r;
+ r = rmdir(rwmountpoint);
+ if (r < 0)
+ return log_error_errno(r, "Failed to rmdir temporary mountpoint %s: %m", rwmountpoint);
}
return 0;
@@ -548,10 +589,11 @@ static int cgroup_mount_cg(
int cgroup_mount_mounts(
CGMounts m,
- bool use_cgns,
+ FILE *cgfile,
uid_t uid_shift,
const char *selinux_apifs_context) {
+ const bool use_cgns = cgfile == NULL;
const bool use_userns = uid_shift != UID_INVALID;
const char *cgroup_root = "/sys/fs/cgroup";
@@ -604,7 +646,7 @@ int cgroup_mount_mounts(
return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst);
}
(void) mkdir_p(dst, 0755);
- r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, use_cgns, use_userns);
+ r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns);
if (r < 0)
return r;
break;
@@ -621,63 +663,3 @@ int cgroup_mount_mounts(
return 0;
}
-
-/* mount_cgroups, mount_systemd_cgroup_writable *********************/
-
-static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
- int r;
-
- assert(root);
- assert(own);
-
- /* Make our own cgroup a (writable) bind mount */
- r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
- if (r < 0)
- return r;
-
- /* And then remount the systemd cgroup root read-only */
- return mount_verbose(LOG_ERR, NULL, root, NULL,
- MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
-}
-
-int mount_systemd_cgroup_writable(
- const char *dest,
- CGroupUnified inner_cgver) {
-
- _cleanup_free_ char *own_cgroup_path = NULL;
- const char *root, *own;
- int r;
-
- assert(dest);
-
- r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
- if (r < 0)
- return log_error_errno(r, "Failed to determine our own cgroup path: %m");
-
- /* If we are living in the top-level, then there's nothing to do... */
- if (path_equal(own_cgroup_path, "/"))
- return 0;
-
- switch (inner_cgver) {
- default:
- case CGROUP_UNIFIED_UNKNOWN:
- assert_not_reached("unknown inner_cgver");
- case CGROUP_UNIFIED_ALL:
- root = prefix_roota(dest, "/sys/fs/cgroup");
- own = strjoina(root, own_cgroup_path);
- break;
- case CGROUP_UNIFIED_SYSTEMD233:
- case CGROUP_UNIFIED_SYSTEMD232:
- root = prefix_roota(dest, "/sys/fs/cgroup/unified");
- own = strjoina(root, own_cgroup_path);
- r = mount_systemd_cgroup_writable_one(root, own);
- if (r < 0)
- return r;
- _fallthrough_;
- case CGROUP_UNIFIED_NONE:
- root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
- own = strjoina(root, own_cgroup_path);
- }
-
- return mount_systemd_cgroup_writable_one(root, own);
-}
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index b35ef666fc..e6cfa1be07 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -14,7 +14,5 @@ typedef struct CGMounts {
int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift, bool keep_unit);
int cgroup_decide_mounts(CGMounts *ret_mounts, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool use_cgns);
-int cgroup_mount_mounts(CGMounts mounts, bool use_cgns, uid_t uid_shift, const char *selinux_apifs_context);
+int cgroup_mount_mounts(CGMounts mounts, FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context);
void cgroup_free_mounts(CGMounts *mounts);
-
-int mount_systemd_cgroup_writable(const char *dest, CGroupUnified inner_cgver);
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index f1afa3a9ca..f64faee25b 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -2567,15 +2567,15 @@ static int inner_child(
assert(directory);
assert(kmsg_socket >= 0);
- if (arg_userns_mode != USER_NAMESPACE_NO) {
- /* Tell the parent, that it now can write the UID map. */
- (void) barrier_place(barrier); /* #1 */
+ /* Tell the parent that it can now configure our process; write
+ * the UID map (if use_userns), place us in the correct cgroup (if
+ * use_cgns), et c. */
+ (void) barrier_place(barrier); /* #1 */
- /* Wait until the parent wrote the UID map */
- if (!barrier_place_and_sync(barrier)) { /* #2 */
- log_error("Parent died too early");
- return -ESRCH;
- }
+ /* Wait until the parent says that we are fully configured. */
+ if (!barrier_place_and_sync(barrier)) { /* #2 */
+ log_error("Parent died too early");
+ return -ESRCH;
}
r = reset_uid_gid();
@@ -2602,28 +2602,17 @@ static int inner_child(
if (r < 0)
return r;
- /* Wait until we are cgroup-ified, so that we
- * can mount the right cgroup path writable */
- if (!barrier_place_and_sync(barrier)) { /* #4 */
- log_error("Parent died too early");
- return -ESRCH;
- }
-
if (arg_use_cgns) {
r = unshare(CLONE_NEWCGROUP);
if (r < 0)
return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
r = cgroup_mount_mounts(cgmounts,
- arg_use_cgns,
+ NULL,
arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : 0,
arg_selinux_apifs_context);
cgroup_free_mounts(&cgmounts);
if (r < 0)
return r;
- } else {
- r = mount_systemd_cgroup_writable("", arg_inner_cgver);
- if (r < 0)
- return r;
}
r = setup_boot_id();
@@ -2724,7 +2713,7 @@ static int inner_child(
/* Let the parent know that we are ready and
* wait until the parent is ready with the
* setup, too... */
- if (!barrier_place_and_sync(barrier)) { /* #5 */
+ if (!barrier_place_and_sync(barrier)) { /* #4 */
log_error("Parent died too early");
return -ESRCH;
}
@@ -2836,6 +2825,7 @@ static int outer_child(
int rtnl_socket,
int uid_shift_socket,
int inner_cgver_socket,
+ int cgroup_fd_socket,
FDSet *fds,
int netns_fd,
CGroupUnified outer_cgver) {
@@ -2853,6 +2843,7 @@ static int outer_child(
assert(uuid_socket >= 0);
assert(notify_socket >= 0);
assert(kmsg_socket >= 0);
+ assert(cgroup_fd_socket);
if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
@@ -3078,15 +3069,6 @@ static int outer_child(
if (r < 0)
return r;
- if (!arg_use_cgns) {
- r = cgroup_mount_mounts(cgmounts,
- arg_use_cgns,
- arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift,
- arg_selinux_apifs_context);
- if (r < 0)
- return r;
- }
-
r = mount_move_root(directory);
if (r < 0)
return log_error_errno(r, "Failed to move root directory: %m");
@@ -3109,6 +3091,7 @@ static int outer_child(
uuid_socket = safe_close(uuid_socket);
notify_socket = safe_close(notify_socket);
uid_shift_socket = safe_close(uid_shift_socket);
+ cgroup_fd_socket = safe_close(cgroup_fd_socket);
/* The inner child has all namespaces that are
* requested, so that we all are owned by the user if
@@ -3147,12 +3130,50 @@ static int outer_child(
if (l < 0)
return log_error_errno(errno, "Failed to send notify fd: %m");
+ /* If !use_cgns, then we need to do this here because without cgns cgroups can't be mounted inside of a
+ * less privileged mountns (and using userns causes the mountns to be less privileged). */
+ if (!arg_use_cgns) {
+ /* If !use_cgns, then cgroup_mount_mounts() needs to look at /proc/pid/cgroup; but because we've
+ * already chroot()ed (mount_move_root()), we don't have access to /proc. So the parent opens the file
+ * for us and then sends it to us. */
+ int cgfd;
+ _cleanup_fclose_ FILE *cgfile = NULL;
+
+ cgfd = receive_one_fd(cgroup_fd_socket, 0);
+ if (cgfd < 0)
+ return log_error_errno(cgfd, "Failed to recv cgroup fd: %m");
+
+ cgfile = fdopen(cgfd, "re");
+ if (!cgfile) {
+ r = -errno; /* in case safe_close() sets errno */
+ cgfd = safe_close(cgfd);
+ return log_error_errno(r, "Failed to create a stream object for cgroup fd: %m");
+ }
+
+ r = cgroup_mount_mounts(cgmounts,
+ cgfile,
+ arg_userns_mode == USER_NAMESPACE_NO ? UID_INVALID : arg_uid_shift,
+ arg_selinux_apifs_context);
+ if (r < 0)
+ return r;
+ } else {
+ /* We're not doing anything else, but wait for a bit of data anyway; as a synchronization point, to
+ * make sure that our SIGCHLD doesn't EINTR anything important. */
+ char c;
+ l = recv(cgroup_fd_socket, &c, sizeof(c), 0);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to recv synchronization byte: %m");
+ if (l != sizeof(c))
+ return log_error_errno(errno, "Short read while receiving synchronization byte: %m");
+ }
+
pid_socket = safe_close(pid_socket);
uuid_socket = safe_close(uuid_socket);
notify_socket = safe_close(notify_socket);
kmsg_socket = safe_close(kmsg_socket);
rtnl_socket = safe_close(rtnl_socket);
netns_fd = safe_close(netns_fd);
+ cgroup_fd_socket = safe_close(cgroup_fd_socket);
return 0;
}
@@ -3654,7 +3675,8 @@ static int run(int master,
uuid_socket_pair[2] = { -1, -1 },
notify_socket_pair[2] = { -1, -1 },
uid_shift_socket_pair[2] = { -1, -1 },
- inner_cgver_socket_pair[2] = { -1, -1};
+ inner_cgver_socket_pair[2] = { -1, -1},
+ cgroup_fd_socket_pair[2] = {-1, -1 };
_cleanup_close_ int notify_socket= -1;
_cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
@@ -3713,6 +3735,9 @@ static int run(int master,
if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, inner_cgver_socket_pair) < 0)
return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
+ if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, cgroup_fd_socket_pair) < 0)
+ return log_error_errno(errno, "Failed to create cgroup socket pair: %m");
+
/* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
* parent's blocking calls and give it a chance to call wait() and terminate. */
r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
@@ -3756,6 +3781,7 @@ static int run(int master,
notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
inner_cgver_socket_pair[0] = safe_close(inner_cgver_socket_pair[0]);
+ cgroup_fd_socket_pair[0] = safe_close(cgroup_fd_socket_pair[0]);
(void) reset_all_signal_handlers();
(void) reset_signal_mask();
@@ -3773,6 +3799,7 @@ static int run(int master,
rtnl_socket_pair[1],
uid_shift_socket_pair[1],
inner_cgver_socket_pair[1],
+ cgroup_fd_socket_pair[1],
fds,
netns_fd,
outer_cgver);
@@ -3793,6 +3820,7 @@ static int run(int master,
notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
inner_cgver_socket_pair[1] = safe_close(inner_cgver_socket_pair[1]);
+ cgroup_fd_socket_pair[1] = safe_close(cgroup_fd_socket_pair[1]);
if (arg_userns_mode != USER_NAMESPACE_NO) {
/* The child just let us know the UID shift it might have read from the image. */
@@ -3834,14 +3862,6 @@ static int run(int master,
}
}
- /* Wait for the outer child. */
- r = wait_for_terminate_and_check("(sd-namespace)", *helper_pid, WAIT_LOG_ABNORMAL);
- *helper_pid = 0;
- if (r < 0)
- return r;
- if (r != EXIT_SUCCESS)
- return -EIO;
-
/* And now retrieve the PID of the inner child. */
l = recv(pid_socket_pair[0], main_pid, sizeof *main_pid, 0);
if (l < 0)
@@ -3868,23 +3888,22 @@ static int run(int master,
log_debug("Init process invoked as PID "PID_FMT, *main_pid);
- if (arg_userns_mode != USER_NAMESPACE_NO) {
- if (!barrier_place_and_sync(&barrier)) { /* #1 */
- log_error("Child died too early.");
- return -ESRCH;
- }
+ /* Wait until the child gives us the OK to configure it. */
+ if (!barrier_place_and_sync(&barrier)) { /* #1 */
+ log_error("Child died too early.");
+ return -ESRCH;
+ }
+ if (arg_userns_mode != USER_NAMESPACE_NO) {
r = setup_uid_map(*main_pid);
if (r < 0)
return r;
-
- (void) barrier_place(&barrier); /* #2 */
}
if (arg_private_network) {
if (!arg_network_namespace_path) {
/* Wait until the child has unshared its network namespace. */
- if (!barrier_place_and_sync(&barrier)) { /* #3 */
+ if (!barrier_place_and_sync(&barrier)) { /* #2*/
log_error("Child died too early");
return -ESRCH;
}
@@ -3962,6 +3981,8 @@ static int run(int master,
}
if (arg_register) {
+ /* If the child is to be placed into a different cgroup,
+ * this is what does it. */
r = register_machine(
bus,
arg_machine,
@@ -3997,11 +4018,44 @@ static int run(int master,
if (r < 0)
return r;
+ if (!arg_use_cgns) {
+ const char *fs;
+ _cleanup_close_ int fd;
+
+ fs = procfs_file_alloca(*main_pid, "cgroup");
+ fd = open(fs, O_RDONLY|O_CLOEXEC);
+ if (fd < 0)
+ return log_error_errno(errno, "Failed to open cgroups of child: %m");
+
+ r = send_one_fd(cgroup_fd_socket_pair[0], fd, 0);
+ if (r < 0)
+ return log_error_errno(r, "Failed to send cgroup fd: %m");
+ } else {
+ /* we don't have any data to send, but sending something here is important because it acts as a
+ * synchronization point. Otherwise the outer child could exit earlier, and the resulting SIGCHLD
+ * could interrupt something like register_machine() above. We could use a barrier for this, but we
+ * have a perfectly good socket here already. */
+ char c = '\0';
+ l = send(cgroup_fd_socket_pair[0], &c, sizeof(c), MSG_NOSIGNAL);
+ if (l < 0)
+ return log_error_errno(errno, "Failed to send synchronization byte: %m");
+ if (l != sizeof(c))
+ return log_error_errno(errno, "Short write while sending synchronization byte: %m");
+ }
+
+ /* Wait for the outer child. */
+ r = wait_for_terminate_and_check("(sd-namespace)", *helper_pid, WAIT_LOG_ABNORMAL);
+ *helper_pid = 0;
+ if (r < 0)
+ return r;
+ if (r != EXIT_SUCCESS)
+ return -EIO;
+
/* Notify the child that the parent is ready with all
* its setup (including cgroup-ification), and that
* the child can now hand over control to the code to
* run inside the container. */
- (void) barrier_place(&barrier); /* #4 */
+ (void) barrier_place(&barrier); /* #3 */
/* Block SIGCHLD here, before notifying child.
* process_pty() will handle it with the other signals. */
@@ -4029,7 +4083,7 @@ static int run(int master,
return r;
/* Let the child know that we are ready and wait that the child is completely ready now. */
- if (!barrier_place_and_sync(&barrier)) { /* #5 */
+ if (!barrier_place_and_sync(&barrier)) { /* #4 */
log_error("Child died too early.");
return -ESRCH;
}