nspawn: Add functions for deciding cgroup mounts before performing them

This is part 1 of a 2-part commit; it adds the functions but doesn't modify anything to use them (that's part 2). I've split it up for clarity of the diff and to make rebasing easier. Add (static) cgmount_add() to build a list of CGMounts, cgroup_free_mounts() to eventually free that list, and cgroup_mount_mounts() to perform the mounts in the list. The behavior of cgroup_mount_mounts() borrows from/imitates: - mount_legacy_cgroup_hierarchy(): almost everything, except that the decision of whether to use cgroup v1 or v2 becomes the responsibility of whoever is building the list. - mount_legacy_cgns_[un]supported(): the tmpfs logic, the logic for when to mount a cgroup hierarchy RO or RW, the logic for when to remount /sys/fs/cgroup RO, and what flags to pass to path_is_mount_point(). - mount_unified_cgroups(): the logic for deciding if a mount point is a cgroup hierarchy, and what flags to pass to path_is_mount_point().
author: Luke Shumaker <lukeshu@parabola.nu> 2018-07-23 22:02:26 -0400
committer: Luke Shumaker <lukeshu@parabola.nu> 2018-08-16 21:55:16 -0400
commit: 1561a0ef6a55b89ecd0caa31ce11b092acd25174 (patch)
tree: dc443844ab448c2983bb0ae6b20f3d42b62928ef
parent: f20e1f8f21e605923577279f961de3fb9e1ebe9a (diff)
2 files changed, 175 insertions, 0 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index de4988a39b..b22d175e2c 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -19,6 +19,67 @@
 
 /* Code for managing the list of CGMounts ***************************/
 
+typedef enum CGMountType {
+        CGMOUNT_SYMLINK, /* cgroup_mount_mounts() calls symlink_idempotent(src, dst) */
+        CGMOUNT_TMPFS,   /* mount a tmpfs at dst; src is passed to tmpfs_patch_options() to build the options */
+        CGMOUNT_CGROUP1, /* mount fstype "cgroup" at dst, with src as the mount options */
+        CGMOUNT_CGROUP2, /* mount fstype "cgroup2" at dst, with src as the mount options */
+        _CGMOUNT_MAX     /* upper bound for range checks (see the assert in cgmount_add()); not a real type */
+} CGMountType;
+
+struct CGMount {
+        CGMountType type; /* selects what cgroup_mount_mounts() does with this entry */
+        char *src;        /* strdup()ed by cgmount_add(), freed by cgroup_free_mounts(); meaning depends on type */
+        char *dst;        /* strdup()ed likewise; resolved below <dest>/sys/fs/cgroup by cgroup_mount_mounts() */
+};
+
+/* Appends a copy of (type, src, dst) to the list.  Returns the new entry, or NULL on allocation failure, in
+ * which case the list is left unchanged.  The pragmas silence -Wunused-function while nothing calls this yet. */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-function"
+static CGMount *cgmount_add(CGMounts *mounts, CGMountType type, const char *src, const char *dst) {
+#pragma GCC diagnostic pop
+        char *hsrc, *hdst;
+        CGMount *c;
+
+        assert(mounts);
+        assert(type >= 0 && type < _CGMOUNT_MAX);
+        assert(src);
+        assert(dst);
+
+        hsrc = strdup(src);
+        hdst = strdup(dst);
+        if (!hsrc || !hdst)
+                goto fail;
+
+        c = reallocarray(mounts->mounts, mounts->n + 1, sizeof(CGMount));
+        if (!c)
+                goto fail; /* the string copies must not leak when growing the array fails */
+
+        mounts->mounts = c;
+        c[mounts->n] = (CGMount) {
+                .type = type,
+                .src = hsrc,
+                .dst = hdst,
+        };
+        return &c[mounts->n++];
+
+fail:
+        free(hsrc);
+        free(hdst);
+        return NULL;
+}
+
+/* Frees the src and dst strings of every entry and then the array itself; n is 0 afterwards. */
+void cgroup_free_mounts(CGMounts *mounts) {
+        while (mounts->n > 0) {
+                CGMount *last = &mounts->mounts[--mounts->n];
+                free(last->src);
+                free(last->dst);
+        }
+        mounts->mounts = mfree(mounts->mounts);
+}
+
 /* cgroup-util ******************************************************/
 
 static int chown_cgroup_path(const char *path, uid_t uid_shift) {
@@ -581,6 +642,112 @@ static int mount_unified_cgroups(const char *dest) {
 
 /* cgroup_mount_mounts **********************************************/
 
+/* Mounts the cgroup hierarchy chosen by fstype (v1 "cgroup", otherwise "cgroup2") on mountpoint with opts as
+ * mount options, then makes it read-only where the container must not write.  Returns 0 or a negative error. */
+static int cgroup_mount_cg(
+                const char *mountpoint, const char *opts, CGMountType fstype,
+                bool use_cgns, bool use_userns) {
+        /* mountpoint carries the container-root prefix, so name=systemd is recognized by its path suffix. */
+        static const char systemd_suffix[] = "/sys/fs/cgroup/systemd";
+        const size_t len = strlen(mountpoint), suffix_len = sizeof(systemd_suffix) - 1;
+        /* If we are using userns and cgns, we always let it be RW, because the shifted root user cannot reach the
+         * things we would want to protect.  Otherwise only the unified or name=systemd hierarchy is left RW. */
+        const bool rw = (use_userns && use_cgns) || fstype == CGMOUNT_CGROUP2 ||
+                (len >= suffix_len && streq(mountpoint + len - suffix_len, systemd_suffix));
+        int r;
+
+        /* The superblock mount options need to be identical to the host's, and hence writable... */
+        r = mount_verbose(LOG_ERR, "cgroup", mountpoint, fstype == CGMOUNT_CGROUP1 ? "cgroup" : "cgroup2",
+                          MS_NOSUID|MS_NOEXEC|MS_NODEV, opts);
+        if (r < 0)
+                return r;
+        /* ... hence let's only make the bind mount read-only, not the superblock. */
+        if (!rw) {
+                r = mount_verbose(LOG_ERR, NULL, mountpoint, NULL,
+                                  MS_BIND|MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_RDONLY, NULL);
+                if (r < 0)
+                        return r;
+        }
+        return 0;
+}
+
+/* Carries out the entries of m in order, resolving each dst below <dest>/sys/fs/cgroup: symlinks are created;
+ * tmpfs and cgroup v1/v2 mounts are skipped if dst is a mount point already (a cgroup entry must then contain
+ * cgroup.procs, else this fails).  uid_shift == UID_INVALID means no userns.  Returns 0 or a negative error. */
+int cgroup_mount_mounts(
+                const char *dest,
+                CGMounts m,
+                bool use_cgns,
+                uid_t uid_shift,
+                const char *selinux_apifs_context) {
+
+        const bool use_userns = uid_shift != UID_INVALID;
+        const char *cgroup_root = prefix_roota(dest, "/sys/fs/cgroup");
+        bool used_tmpfs = false;
+
+        for (size_t i = 0; i < m.n; i++) {
+                _cleanup_free_ char *options = NULL;
+                const char *dst;
+                int r;
+
+                dst = prefix_roota(cgroup_root, m.mounts[i].dst);
+                /* The checks here to see if things are already mounted are kind of primitive.  Perhaps they should
+                 * actually check the statfs() f_type to verify that the thing mounted is what we want to be mounted
+                 * (similar to cgroup-util's detection logic)?  But I don't really understand the use-case for having
+                 * any of these already mounted, so I'm not sure if such increased strictness would be unwelcome. */
+                switch (m.mounts[i].type) {
+                case CGMOUNT_SYMLINK:
+                        (void) mkdir_parents(dst, 0755);
+                        r = symlink_idempotent(m.mounts[i].src, dst);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to create symlink %s: %m", dst);
+                        break;
+                case CGMOUNT_TMPFS:
+                        used_tmpfs = true;
+                        r = path_is_mount_point(dst, dest, path_equal(cgroup_root, dst) ? AT_SYMLINK_FOLLOW : 0);
+                        if (r < 0)
+                                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+                        if (r > 0)
+                                continue;
+                        r = tmpfs_patch_options(m.mounts[i].src, uid_shift, selinux_apifs_context, &options);
+                        if (r < 0)
+                                return log_oom();
+                        r = mount_verbose(LOG_ERR, /*name*/"tmpfs", dst, /*fstype*/"tmpfs",
+                                          MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME, options);
+                        if (r < 0)
+                                return r;
+                        break;
+                case CGMOUNT_CGROUP1:
+                case CGMOUNT_CGROUP2:
+                        r = path_is_mount_point(dst, dest, path_equal(cgroup_root, dst) ? AT_SYMLINK_FOLLOW : 0);
+                        if (r < 0 && r != -ENOENT)
+                                return log_error_errno(r, "Failed to determine if %s is mounted already: %m", dst);
+                        if (r > 0) {
+                                if (access(prefix_roota(dst, "cgroup.procs"), F_OK) >= 0)
+                                        continue;
+                                if (errno != ENOENT)
+                                        return log_error_errno(errno, "Failed to determine if mount point %s is a cgroup hierarchy: %m", dst);
+                                return log_error_errno(EINVAL, "%s is already mounted but not a cgroup hierarchy. Refusing.", dst);
+                        }
+                        (void) mkdir_p(dst, 0755);
+                        r = cgroup_mount_cg(dst, m.mounts[i].src, m.mounts[i].type, use_cgns, use_userns);
+                        if (r < 0)
+                                return r;
+                        break;
+                default:
+                        assert_not_reached("Invalid CGMount type");
+                        return -EINVAL;
+                }
+        }
+
+        /* Finally remount the cgroup root read-only, unless userns and cgns are both in use (the rationale for
+         * that exception is unknown).  The target is the container's cgroup_root, never the host's own path. */
+        if (used_tmpfs && (!use_userns || !use_cgns))
+                return mount_verbose(LOG_ERR, NULL, cgroup_root, NULL,
+                                     MS_REMOUNT|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME|MS_RDONLY, "mode=755");
+        return 0;
+}
+
 /* mount_cgroups, mount_systemd_cgroup_writable *********************/
 
 int mount_cgroups(
diff --git a/src/nspawn/nspawn-cgroup.h b/src/nspawn/nspawn-cgroup.h
index 40672d8f91..6f6d40fc40 100644
--- a/src/nspawn/nspawn-cgroup.h
+++ b/src/nspawn/nspawn-cgroup.h
@@ -6,7 +6,15 @@
 
 #include "cgroup-util.h"
 
+typedef struct CGMount CGMount; /* opaque here; the struct is defined in nspawn-cgroup.c */
+typedef struct CGMounts {
+        CGMount *mounts; /* heap array of n entries; released by cgroup_free_mounts() */
+        size_t n;        /* number of entries in mounts */
+} CGMounts;
+
 int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift, bool keep_unit);
+int cgroup_mount_mounts(const char *dest, CGMounts mounts, bool use_cgns, uid_t uid_shift, const char *selinux_apifs_context);
+void cgroup_free_mounts(CGMounts *mounts);
 
 int mount_cgroups(const char *dest, CGroupUnified outer_cgver, CGroupUnified inner_cgver, bool userns, uid_t uid_shift, uid_t uid_range, const char *selinux_apifs_context, bool use_cgns);
 int mount_systemd_cgroup_writable(const char *dest, CGroupUnified inner_cgver);
author	Luke Shumaker <lukeshu@parabola.nu>	2018-07-23 22:02:26 -0400
committer	Luke Shumaker <lukeshu@parabola.nu>	2018-08-16 21:55:16 -0400
commit	1561a0ef6a55b89ecd0caa31ce11b092acd25174 (patch)
tree	dc443844ab448c2983bb0ae6b20f3d42b62928ef
parent	f20e1f8f21e605923577279f961de3fb9e1ebe9a (diff)