nspawn: (Re)mount the systemd hierarchy RO in the outer child, not inner

The current situation: If !use_cgns, then we mount the systemd hierarchy RW, bind-mount the container's subcgroup, then remount the hierarchy RO. This gives the container RW access to its subcgroup, but makes the rest of the hierarchy RO.

Except that the remount happens inside the container's final mount namespace, which means that the container could just remount it RW. I know that systemd-nspawn's features are not security features, and provide protection against accidents only. But we can do better!

We can't just move the remount operation to where we mount cgroups in the namespace helper, because we don't know what the container's subcgroup is yet: the inner child (the container's PID 1) hasn't yet been created, let alone moved into its final cgroup by machined. So instead, we wait until after we've raw_clone()ed the inner child, check what its cgroup is, and *then* mount the cgroups. The mounts will propagate from the helper's mount namespace to the child mount namespace.

This solution presents several challenges of its own:

1. The outer child will have chroot()ed by the time it goes to look at the inner child's cgroup. So how is it going to look at /proc/${inner_child}/cgroup if it doesn't have access to /proc? We'll have the parent open the file, then pass the file descriptor to the outer child over a socket.

2. The parent will have to work with the fact that the outer and inner child coexist; previously, the outer child exited as soon as the inner child was clone()ed, and the parent got away with pretending that only one existed at a time. We'll have to add a "barrier" to stop the outer child from exiting at a point where the resulting SIGCHLD could cause a problem for the parent. The obvious answer might be to add a second literal barrier (as in barrier.c), but the socket mentioned above already serves that purpose when !use_cgns, so go ahead and re-purpose the socket to serve as a barrier even when use_cgns.

3. MS_REMOUNT operations don't propagate between mount namespaces. Part of me thinks "that's a bug/omission in the kernel", but another part of me is saying "no, that's ridiculous, there has to be a good reason why MS_REMOUNT operations don't propagate between mount namespaces." Anyway, this means that the various MS_REMOUNT operations to make things read-only are being entirely ignored by the container's namespace. Actually, because we don't pass MS_BIND when we remount the tmpfs, the superblock is marked read-only; and since the superblock is shared globally, the change effectively propagates. The mountpoint-specific flags still say "rw" in the container, but the superblock overrides them. It's a bit weird, but it works. So that just leaves the cgroup mounts. Instead of always mounting them RW, then remounting them RO if necessary, just mount them RO the first time. This is actually what nspawn used to do a couple of years ago, but in c053458 it was changed because "Otherwise we'll generate kernel runtime warnings about non-matching mount options." Maybe that was true then, but it's not true today; today the kernel does not generate that warning for a differing MS_RDONLY flag (though it does if the string options are different).
author: Luke Shumaker <lukeshu@lukeshu.com> 2017-06-17 17:49:04 -0400
committer: Luke Shumaker <lukeshu@parabola.nu> 2018-08-16 21:55:17 -0400
commit: 81190bdbc5c98f0e1ffaad990fe520fd6ec4f203 (patch)
tree: ed4a7eda392ce533c0dc6fe2aef97fe5dc8f3db2 /src/nspawn/nspawn-cgroup.c
parent: f62963d6e5e48412513b00af89037755793974f7 (diff)
1 files changed, 52 insertions, 70 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 757be77a6c..b6f1323f21 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -79,6 +79,27 @@ void cgroup_free_mounts(CGMounts *mounts) {
 
 /* cgroup-util ******************************************************/
 
+/* Like cg_pid_get_path_internal(), but takes a full comma-separated list of mount options rather than a single
+ * controller name; opts == NULL means cgroup-v2. The path is handed back through ret_cgroup by that helper.
+ * Returns 0 on success, -ENOMEM on allocation failure, -EBADMSG if no option matched (v1), else the helper's error. */
+static int cgfile_get_cgroup(FILE *cgfile, const char *opts, char **ret_cgroup) {
+        const char *word, *state;
+        size_t word_len;
+        rewind(cgfile);
+        if (!opts) /* cgroup-v2: single lookup with a NULL controller */
+                return cg_pid_get_path_internal(NULL, cgfile, ret_cgroup);
+        /* cgroup-v1: try each mount option as a controller name; the first successful lookup wins */
+        FOREACH_WORD_SEPARATOR(word, word_len, opts, ",", state) {
+                _cleanup_free_ char *controller = strndup(word, word_len);
+                if (!controller) /* never let OOM degrade into the NULL-controller lookup used for v2 */
+                        return -ENOMEM;
+                rewind(cgfile); /* every lookup must scan cgfile from the start */
+                if (cg_pid_get_path_internal(controller, cgfile, ret_cgroup) == 0)
+                        return 0;
+        }
+        return -EBADMSG; /* decided by the loop itself, not by whatever *ret_cgroup held on entry */
+}
+
 static int chown_cgroup_path(const char *path, uid_t uid_shift) {
         _cleanup_close_ int fd = -1;
         const char *fn;
@@ -519,8 +540,9 @@ int cgroup_decide_mounts(
 
 static int cgroup_mount_cg(
                 const char *mountpoint, const char *opts, CGMountType fstype,
-                bool use_cgns, bool use_userns) {
+                FILE *cgfile, bool use_userns) {
 
+        const bool use_cgns = cgfile == NULL;
         /* If we are using userns and cgns, then we always let it be RW, because we can count on the shifted root user
          * to not have access to the things that would make us want to mount it RO.  Otherwise, we only give the
          * container RW access to its unified or name=systemd cgroup. */
@@ -528,19 +550,38 @@ static int cgroup_mount_cg(
 
         int r;
 
-        /* The superblock mount options of the mount point need to be
-         * identical to the hosts', and hence writable... */
         r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
-                          MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+                          MS_NOSUID|MS_NOEXEC|MS_NODEV| ((!rw||!use_cgns) ? MS_RDONLY : 0), opts);
         if (r < 0)
                 return r;
 
-        /* ... hence let's only make the bind mount read-only, not the superblock. */
-        if (!rw) {
-                r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
-                                  MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+        if (rw && !use_cgns) {
+                /* emulate cgns by mounting everything but our subcgroup RO */
+                const char *rwmountpoint = strjoina(mountpoint, ".");
+
+                _cleanup_free_ char *cgroup = NULL;
+                r = cgfile_get_cgroup(cgfile, fstype == CGMOUNT_CGROUP2 ? NULL : opts, &cgroup);
+                if (r < 0) {
+                        if (fstype == CGMOUNT_CGROUP2)
+                                return log_error_errno(r, "Failed to get child's cgroup v2 path");
+                        else
+                                return log_error_errno(r, "Failed to associate mounted cgroup hierarchy %s with numbered cgroup hierarchy", mountpoint);
+                }
+
+                (void) mkdir(rwmountpoint, 0755);
+                r = mount_verbose(LOG_ERR, "cgroup", rwmountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
+                                  MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+                if (r < 0)
+                        return r;
+                r = mount_verbose(LOG_ERR, strjoina(rwmountpoint, cgroup), strjoina(mountpoint, cgroup), NULL, MS_BIND, NULL);
+                if (r < 0)
+                        return r;
+                r = umount_verbose(rwmountpoint);
                 if (r < 0)
                         return r;
+                r = rmdir(rwmountpoint);
+                if (r < 0)
+                        return log_error_errno(r, "Failed to rmdir temporary mountpoint %s: %m", rwmountpoint);
         }
 
         return 0;
@@ -548,10 +589,11 @@ static int cgroup_mount_cg(
 
 int cgroup_mount_mounts(
                 CGMounts m,
-                bool use_cgns,
+                FILE *cgfile,
                 uid_t uid_shift,
                 const char *selinux_apifs_context) {
 
+        const bool use_cgns = cgfile == NULL;
         const bool use_userns = uid_shift != UID_INVALID;
         const char *cgroup_root = "/sys/fs/cgroup";
 
@@ -604,7 +646,7 @@ int cgroup_mount_mounts(
                                 return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst);
                         }
                         (void) mkdir_p(dst, 0755);
-                        r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, use_cgns, use_userns);
+                        r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, cgfile, use_userns);
                         if (r < 0)
                                 return r;
                         break;
@@ -621,63 +663,3 @@ int cgroup_mount_mounts(
 
         return 0;
 }
-
-/* mount_cgroups, mount_systemd_cgroup_writable *********************/
-
-static int mount_systemd_cgroup_writable_one(const char *root, const char *own) {
-        int r;
-
-        assert(root);
-        assert(own);
-
-        /* Make our own cgroup a (writable) bind mount */
-        r = mount_verbose(LOG_ERR, own, own, NULL, MS_BIND, NULL);
-        if (r < 0)
-                return r;
-
-        /* And then remount the systemd cgroup root read-only */
-        return mount_verbose(LOG_ERR, NULL, root, NULL,
-                             MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
-}
-
-int mount_systemd_cgroup_writable(
-                const char *dest,
-                CGroupUnified inner_cgver) {
-
-        _cleanup_free_ char *own_cgroup_path = NULL;
-        const char *root, *own;
-        int r;
-
-        assert(dest);
-
-        r = cg_pid_get_path(NULL, 0, &own_cgroup_path);
-        if (r < 0)
-                return log_error_errno(r, "Failed to determine our own cgroup path: %m");
-
-        /* If we are living in the top-level, then there's nothing to do... */
-        if (path_equal(own_cgroup_path, "/"))
-                return 0;
-
-        switch (inner_cgver) {
-        default:
-        case CGROUP_UNIFIED_UNKNOWN:
-                assert_not_reached("unknown inner_cgver");
-        case CGROUP_UNIFIED_ALL:
-                root = prefix_roota(dest, "/sys/fs/cgroup");
-                own = strjoina(root, own_cgroup_path);
-                break;
-        case CGROUP_UNIFIED_SYSTEMD233:
-        case CGROUP_UNIFIED_SYSTEMD232:
-                root = prefix_roota(dest, "/sys/fs/cgroup/unified");
-                own = strjoina(root, own_cgroup_path);
-                r = mount_systemd_cgroup_writable_one(root, own);
-                if (r < 0)
-                        return r;
-                _fallthrough_;
-        case CGROUP_UNIFIED_NONE:
-                root = prefix_roota(dest, "/sys/fs/cgroup/systemd");
-                own = strjoina(root, own_cgroup_path);
-        }
-
-        return mount_systemd_cgroup_writable_one(root, own);
-}
author	Luke Shumaker <lukeshu@lukeshu.com>	2017-06-17 17:49:04 -0400
committer	Luke Shumaker <lukeshu@parabola.nu>	2018-08-16 21:55:17 -0400
commit	81190bdbc5c98f0e1ffaad990fe520fd6ec4f203 (patch)
tree	ed4a7eda392ce533c0dc6fe2aef97fe5dc8f3db2 /src/nspawn/nspawn-cgroup.c
parent	f62963d6e5e48412513b00af89037755793974f7 (diff)