diff options
author | Luke Shumaker <lukeshu@lukeshu.com> | 2017-06-17 17:49:04 -0400 |
---|---|---|
committer | Luke Shumaker <lukeshu@parabola.nu> | 2018-08-16 21:55:17 -0400 |
commit | 81190bdbc5c98f0e1ffaad990fe520fd6ec4f203 (patch) | |
tree | ed4a7eda392ce533c0dc6fe2aef97fe5dc8f3db2 /src/nspawn/nspawn-cgroup.c | |
parent | f62963d6e5e48412513b00af89037755793974f7 (diff) |
nspawn: (Re)mount the systemd hierarchy RO in the outer child, not inner
The current situation:
If !use_cgns, then we mount the systemd hierarchy RW, bind the
container's subcgroup, then remount the hierarchy RO. This gives the
container RW access to its subcgroup, but makes the rest of the hierarchy
RO.
Except that the remount happens inside the container's final mount
namespace, which means that the container could just remount it RW.
I know that systemd-nspawn's features are not meant to be security features,
and only provide protection against accidents. But we can do better!
We can't just move the remount operation to where we mount cgroups in the
namespace helper, because we don't know what the container's subcgroup is
yet: the inner child (the container's PID 1) hasn't yet been created, let
alone moved into its final cgroup by machined. So instead, we wait until
after we've raw_clone()ed the inner child, check what its cgroup is, and
*then* mount the cgroups. The mounts will propagate from the helper's
mount namespace to the child mount namespace.
This solution presents several challenges of its own:
1. The outer child will have chroot()ed by the time it goes to look at the
inner child's cgroup. So how is it going to look at
/proc/${inner_child}/cgroup if it doesn't have access to /proc? We'll
have the parent open the file, then pass the file descriptor to the
outer child over a socket.
2. The parent will have to work with the fact that the outer and inner
child now coexist. Previously, the outer child exited as soon as the inner
child was clone()ed, and the parent got away with pretending that only one
existed at a time. We'll have to add a "barrier" to stop the outer
child from exiting at a point where the resulting SIGCHLD could cause a
problem for the parent. The obvious answer might be to add a second
literal barrier (as in barrier.c), but the socket mentioned above
serves the purpose when !use_cgns, so go ahead and re-purpose the
socket to serve as a barrier even when use_cgns.
3. MS_REMOUNT operations don't propagate between mount namespaces. Part
of me thinks "that's a bug/omission in the kernel", but another part of
me is saying "no, that's ridiculous, there has to be a good reason why
MS_REMOUNT operations don't propagate between mount namespaces."
Anyway, this means that the various MS_REMOUNT operations to make
things read-only are being entirely ignored by the container's
namespace.
Actually, because we don't pass MS_BIND when we remount the tmpfs, the
superblock itself is marked read-only; and since the superblock is a shared
global, the read-only flag effectively propagates. The mountpoint-specific
flags still say "rw" in the container, but the superblock overrides them.
It's a bit weird, but it works.
So that just leaves the cgroup mounts. Instead of always mounting them
RW, then remounting them RO if necessary, just mount them RO the first
time. This is actually what nspawn used to do a couple of years ago, but
in c053458 that was changed because "Otherwise we'll generate kernel
runtime warnings about non-matching mount options." Maybe that was
true then, but it's not true today; today it does not generate that
warning for a differing MS_RDONLY flag (though it does if the string
options are different).
Diffstat (limited to 'src/nspawn/nspawn-cgroup.c')
-rw-r--r-- | src/nspawn/nspawn-cgroup.c | 122 |
1 files changed, 52 insertions, 70 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c index 757be77a6c..b6f1323f21 100644 --- a/src/nspawn/nspawn-cgroup.c +++ b/src/nspawn/nspawn-cgroup.c @@ -79,6 +79,27 @@ void cgroup_free_mounts(CGMounts *mounts) { /* cgroup-util ******************************************************/ +/* Similar to cg_pid_get_path_internal, but take a full list of mount options, rather than a single controller name. */ +static int cgfile_get_cgroup(FILE *cgfile, const char *opts, char **ret_cgroup) { + + if (!opts) { /* cgroup-v2 */ + rewind(cgfile); + return cg_pid_get_path_internal(NULL, cgfile, ret_cgroup); + } else { /* cgroup-v1 */ + const char *scontroller, *state; + size_t controller_len; + FOREACH_WORD_SEPARATOR(scontroller, controller_len, opts, ",", state) { + _cleanup_free_ const char *controller = strndup(scontroller, controller_len); + rewind(cgfile); + if (cg_pid_get_path_internal(controller, cgfile, ret_cgroup) == 0) + break; + } + if (!*ret_cgroup) + return -EBADMSG; + return 0; + } +} + static int chown_cgroup_path(const char *path, uid_t uid_shift) { _cleanup_close_ int fd = -1; const char *fn; @@ -519,8 +540,9 @@ int cgroup_decide_mounts( static int cgroup_mount_cg( const char *mountpoint, const char *opts, CGMountType fstype, - bool use_cgns, bool use_userns) { + FILE *cgfile, bool use_userns) { + const bool use_cgns = cgfile == NULL; /* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user * to not have access to the things that would make us want to mount it RO. Otherwise, we only give the * container RW access to its unified or name=systemd cgroup. */ @@ -528,19 +550,38 @@ static int cgroup_mount_cg( int r; - /* The superblock mount options of the mount point need to be - * identical to the hosts', and hence writable... */ r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? 
"cgroup" : "cgroup2", - MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); + MS_NOSUID|MS_NOEXEC|MS_NODEV| ((!rw||!use_cgns) ? MS_RDONLY : 0), opts); if (r < 0) return r; - /* ... hence let's only make the bind mount read-only, not the superblock. */ - if (!rw) { - r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL, - MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); + if (rw && !use_cgns) { + /* emulate cgns by mounting everything but our subcgroup RO */ + const char *rwmountpoint = strjoina(mountpoint, "."); + + _cleanup_free_ char *cgroup = NULL; + r = cgfile_get_cgroup(cgfile, fstype == CGMOUNT_CGROUP2 ? NULL : opts, &cgroup); + if (r < 0) { + if (fstype == CGMOUNT_CGROUP2) + return log_error_errno(r, "Failed to get child's cgroup v2 path"); + else + return log_error_errno(r, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint); + } + + (void) mkdir(rwmountpoint, 0755); + r = mount_verbose(LOG_ERR, "cgroup", rwmountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2", + MS_NOSUID|MS_NOEXEC|MS_NODEV, opts); + if (r < 0) + return r; + r = mount_verbose(LOG_ERR, strjoina(rwmountpoint, cgroup), strjoina(mountpoint, cgroup), NULL, MS_BIND, NULL); + if (r < 0) + return r; + r = umount_verbose(rwmountpoint); if (r < 0) return r; + r = rmdir(rwmountpoint); + if (r < 0) + return log_error_errno(r, "Failed to rmdir temporary mountpoint %s: %m", rwmountpoint); } return 0; @@ -548,10 +589,11 @@ static int cgroup_mount_cg( int cgroup_mount_mounts( CGMounts m, - bool use_cgns, + FILE *cgfile, uid_t uid_shift, const char *selinux_apifs_context) { + const bool use_cgns = cgfile == NULL; const bool use_userns = uid_shift != UID_INVALID; const char *cgroup_root = "/sys/fs/cgroup"; @@ -604,7 +646,7 @@ int cgroup_mount_mounts( return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. 
Refusing.", dst); } (void) mkdir_p(dst, 0755); - r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, use_cgns, use_userns); + r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns); if (r < 0) return r; break; @@ -621,63 +663,3 @@ int cgroup_mount_mounts( return 0; } - -/* mount_cgroups, mount_systemd_cgroup_writable *********************/ - -static int mount_systemd_cgroup_writable_one(const char *root, const char *own) { - int r; - - assert(root); - assert(own); - - /* Make our own cgroup a (writable) bind mount */ - r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL); - if (r < 0) - return r; - - /* And then remount the systemd cgroup root read-only */ - return mount_verbose(LOG_ERR, NULL, root, NULL, - MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL); -} - -int mount_systemd_cgroup_writable( - const char *dest, - CGroupUnified inner_cgver) { - - _cleanup_free_ char *own_cgroup_path = NULL; - const char *root, *own; - int r; - - assert(dest); - - r = cg_pid_get_path(NULL, 0, &own_cgroup_path); - if (r < 0) - return log_error_errno(r, "Failed to determine our own cgroup path: %m"); - - /* If we are living in the top-level, then there's nothing to do... */ - if (path_equal(own_cgroup_path, "/")) - return 0; - - switch (inner_cgver) { - default: - case CGROUP_UNIFIED_UNKNOWN: - assert_not_reached("unknown inner_cgver"); - case CGROUP_UNIFIED_ALL: - root = prefix_roota(dest, "/sys/fs/cgroup"); - own = strjoina(root, own_cgroup_path); - break; - case CGROUP_UNIFIED_SYSTEMD233: - case CGROUP_UNIFIED_SYSTEMD232: - root = prefix_roota(dest, "/sys/fs/cgroup/unified"); - own = strjoina(root, own_cgroup_path); - r = mount_systemd_cgroup_writable_one(root, own); - if (r < 0) - return r; - _fallthrough_; - case CGROUP_UNIFIED_NONE: - root = prefix_roota(dest, "/sys/fs/cgroup/systemd"); - own = strjoina(root, own_cgroup_path); - } - - return mount_systemd_cgroup_writable_one(root, own); -} |