summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLuke Shumaker <lukeshu@lukeshu.com>2017-05-20 17:49:22 -0400
committerLuke Shumaker <lukeshu@parabola.nu>2018-08-16 21:55:17 -0400
commit44c65162b10acb9f80254a2a356e22226dcc2c2d (patch)
treea4108c54074da33959324162d84c2ee107d7053c
parent2bea0592d696b74a16a817f952505f814395d6be (diff)
nspawn: nspawn-cgroup.c: Drastically modify cgroup_setup()
This is large and needs to be split into multiple commits. - improve error messages - work correctly when outer_cgver==FULL && inner_cgver==NONE
-rw-r--r--src/nspawn/nspawn-cgroup.c339
-rw-r--r--src/nspawn/nspawn.c4
2 files changed, 256 insertions, 87 deletions
diff --git a/src/nspawn/nspawn-cgroup.c b/src/nspawn/nspawn-cgroup.c
index 8072ae4651..d2df6ae400 100644
--- a/src/nspawn/nspawn-cgroup.c
+++ b/src/nspawn/nspawn-cgroup.c
@@ -11,7 +11,9 @@
#include "nspawn-cgroup.h"
#include "nspawn-mount.h"
#include "path-util.h"
+#include "process-util.h"
#include "rm-rf.h"
+#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
#include "user-util.h"
@@ -100,6 +102,64 @@ static int cgfile_get_cgroup(FILE *cgfile, const char *opts, char **ret_cgroup)
}
}
+/* Move process `pid` into the cgroup whose directory is `cgdir`, by writing the PID followed by a newline
+ * to the "cgroup.procs" file inside that directory. A `pid` of 0 is replaced by getpid_cached() first.
+ * Returns the result of write_string_file(). */
+static int cgdir_attach(const char *cgdir, pid_t pid) {
+        char pid_line[DECIMAL_STR_MAX(pid_t) + 2];
+        const char *procs_path;
+
+        assert(cgdir);
+        assert(pid >= 0);
+
+        /* 0 is shorthand; resolve it to a concrete PID before formatting. */
+        if (pid == 0)
+                pid = getpid_cached();
+
+        xsprintf(pid_line, PID_FMT "\n", pid);
+
+        procs_path = strjoina(cgdir, "/cgroup.procs");
+        return write_string_file(procs_path, pid_line, 0);
+}
+
+/* Best-effort: for every controller listed in `cgdir`/cgroup.controllers, write "+<controller>" to
+ * `cgdir`/cgroup.subtree_control, one controller per write.
+ *
+ * Returns the negative result of read_one_line_file() if cgroup.controllers cannot be read. Failing to open
+ * cgroup.subtree_control, or to enable an individual controller, is only logged at debug level, and 0 is
+ * returned in those cases. */
+static int cgdir_enable_all(const char *cgdir) {
+        _cleanup_fclose_ FILE *f = NULL;
+        _cleanup_free_ char *controllers = NULL;
+        const char *controllers_path, *subtree_control_path;
+        const char *controller, *state;
+        size_t controller_len;
+        int r;
+
+        assert(cgdir);
+
+        controllers_path = strjoina(cgdir, "/cgroup.controllers");
+        r = read_one_line_file(controllers_path, &controllers);
+        if (r < 0)
+                return r;
+
+        /* Built once, outside the loop, so that no stack allocation happens inside the loop body. */
+        subtree_control_path = strjoina(cgdir, "/cgroup.subtree_control");
+
+        FOREACH_WORD(controller, controller_len, controllers, state) {
+                char s[controller_len+2];
+
+                s[0] = '+';
+                memcpy(&s[1], controller, controller_len);
+                s[controller_len+1] = '\0';
+
+                /* Open lazily, so the file is left untouched when no controllers are listed. */
+                if (!f) {
+                        f = fopen(subtree_control_path, "we");
+                        if (!f) {
+                                log_debug_errno(errno, "Failed to open cgroup.subtree_control file of %s: %m", cgdir);
+                                break;
+                        }
+                }
+
+                r = write_string_stream(f, s, 0);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to enable controller %s for %s: %m", &s[1], cgdir);
+                        /* Reset the stream's error state so the remaining controllers can still be tried. */
+                        clearerr(f);
+                }
+        }
+
+        return 0;
+}
+
static int cgdir_chown(const char *path, uid_t uid_shift) {
_cleanup_close_ int fd = -1;
@@ -129,42 +189,49 @@ static int cgdir_chown(const char *path, uid_t uid_shift) {
/* cgroup_setup *****************************************************/
-static int chown_cgroup(pid_t pid, CGroupUnified inner_cgver, uid_t uid_shift) {
- _cleanup_free_ char *path = NULL, *fs = NULL;
+/* Run cgdir_chown() with uid_shift on the container's cgroup directory below each given mount point.
+ *
+ * The container's cgroup path is read once from the procfs "cgroup" file of pid. cg2_mountpoint is the
+ * cgroup-v2 mount and cg1sd_mountpoint the cgroup-v1 name=systemd mount; either may be NULL to skip that
+ * hierarchy, and if both are NULL nothing is done and 0 is returned.
+ *
+ * Returns 0 on success, or the negative value returned by log_error_errno() after logging the failure. */
+static int chown_cgroup(pid_t pid, uid_t uid_shift, const char *cg1sd_mountpoint, const char *cg2_mountpoint) {
+ _cleanup_free_ char *cgroup = NULL;
+ const char *cgfilename;
+ _cleanup_fclose_ FILE *cgfile = NULL;
int r;
- r = cg_pid_get_path(NULL, pid, &path);
- if (r < 0)
- return log_error_errno(r, "Failed to get container cgroup path: %m");
-
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, path, NULL, &fs);
- if (r < 0)
- return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+ if (!cg1sd_mountpoint && !cg2_mountpoint)
+ return 0;
- r = cgdir_chown(fs, uid_shift);
+ /* Determine the cgroup. Thanks to sync_cgroup(), we can get away with only doing this once. */
+ cgfilename = procfs_file_alloca(pid, "cgroup");
+ cgfile = fopen(cgfilename, "re");
+ if (!cgfile)
+ /* Bail out here; do not go on to parse a stream that failed to open. */
+ return log_error_errno(errno, "chown container cgroup: Failed to get cgroup of the container: %m");
+ r = cgfile_get_cgroup(cgfile, cg2_mountpoint ? NULL : "name=systemd", &cgroup);
if (r < 0)
- return log_error_errno(r, "Failed to chown() cgroup %s: %m", fs);
+ return log_error_errno(r, "chown container cgroup: Failed to get cgroup of the container: %m");
- if (inner_cgver == CGROUP_UNIFIED_SYSTEMD233) {
- _cleanup_free_ char *lfs = NULL;
- /* Always propagate access rights from unified to legacy controller */
+ /* cgroup-v2 */
+ if (cg2_mountpoint) {
+ char *cgdir;
- r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path, NULL, &lfs);
+ cgdir = strjoina(cg2_mountpoint, cgroup);
+ r = cgdir_chown(cgdir, uid_shift);
if (r < 0)
- return log_error_errno(r, "Failed to get file system path for container cgroup: %m");
+ return log_error_errno(r, "chown container cgroup: cgdir_chown(path=\"%s\", uid=" UID_FMT "): %m", cgdir, uid_shift);
+ }
+
+ /* cgroup-v1 name=systemd */
+ if (cg1sd_mountpoint) {
+ char *cgdir;
- r = cgdir_chown(lfs, uid_shift);
+ cgdir = strjoina(cg1sd_mountpoint, cgroup);
+ r = cgdir_chown(cgdir, uid_shift);
if (r < 0)
- return log_error_errno(r, "Failed to chown() cgroup %s: %m", lfs);
+ return log_error_errno(r, "chown container cgroup: cgdir_chown(path=\"%s\", uid=" UID_FMT "): %m", cgdir, uid_shift);
}
return 0;
}
-static int sync_cgroup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift) {
+static int sync_cgroup(pid_t pid, uid_t uid_shift, const char *mountpoint) {
_cleanup_free_ char *cgroup = NULL;
- char mountpoint[] = "/tmp/containerXXXXXX", pid_string[DECIMAL_STR_MAX(pid) + 1];
- bool undo_mount = false;
const char *fn;
int r;
@@ -175,58 +242,23 @@ static int sync_cgroup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner
r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
if (r < 0)
- return log_error_errno(r, "Failed to get control group of " PID_FMT ": %m", pid);
-
- /* In order to access the container's hierarchy we need to mount it */
- if (!mkdtemp(mountpoint))
- return log_error_errno(errno, "Failed to generate temporary mount point for container hierarchy: %m");
-
- if (outer_cgver >= CGROUP_UNIFIED_SYSTEMD232)
- r = mount_verbose(LOG_ERR, "cgroup", mountpoint, "cgroup",
- MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
- else
- r = mount_verbose(LOG_ERR, "cgroup", mountpoint, "cgroup2",
- MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
- if (r < 0)
- goto finish;
-
- undo_mount = true;
+ return log_error_errno(r, "sync host cgroup -> container cgroup: Failed to determine cgroup of the container: %m");
/* If nspawn dies abruptly the cgroup hierarchy created below
* its unit isn't cleaned up. So, let's remove it
* https://github.com/systemd/systemd/pull/4223#issuecomment-252519810 */
fn = strjoina(mountpoint, cgroup);
(void) rm_rf(fn, REMOVE_ROOT|REMOVE_ONLY_DIRECTORIES);
+ (void) mkdir_p(fn, 0755);
- fn = strjoina(mountpoint, cgroup, "/cgroup.procs");
- (void) mkdir_parents(fn, 0755);
-
- sprintf(pid_string, PID_FMT, pid);
- r = write_string_file(fn, pid_string, 0);
- if (r < 0) {
- log_error_errno(r, "Failed to move process: %m");
- goto finish;
- }
-
- fn = strjoina(mountpoint, cgroup);
- r = cgdir_chown(fn, uid_shift);
+ r = cgdir_attach(fn, pid);
if (r < 0)
- log_error_errno(r, "Failed to chown() cgroup %s: %m", fn);
-finish:
- if (undo_mount)
- (void) umount_verbose(mountpoint);
+ return log_error_errno(r, "sync host cgroup -> container cgroup: Failed to move process to %s: %m", fn);
- (void) rmdir(mountpoint);
- return r;
+ return 0;
}
-static int create_subcgroup(pid_t pid, bool keep_unit) {
- _cleanup_free_ char *cgroup = NULL;
- CGroupMask supported;
- const char *payload;
- int r;
-
- assert(pid > 1);
+/* Below the container's cgroup (read from the procfs "cgroup" file of pid), create a "payload" subcgroup in
+ * each given hierarchy and move pid into it. If keep_unit is set, also create a "supervisor" subcgroup and
+ * move the calling process (pid 0) into it. On the cgroup-v2 hierarchy, cgdir_enable_all() is then run on the
+ * container's cgroup; its result is ignored.
+ *
+ * cg2_mountpoint is the cgroup-v2 mount and cg1sd_mountpoint the cgroup-v1 name=systemd mount; either may be
+ * NULL to skip that hierarchy, and if both are NULL nothing is done and 0 is returned.
+ *
+ * Returns 0 on success, or the negative value returned by log_error_errno() after logging the failure. */
+static int create_subcgroup(pid_t pid, bool keep_unit, const char *cg1sd_mountpoint, const char *cg2_mountpoint) {
/* In the unified hierarchy inner nodes may only contain subgroups, but not processes. Hence, if we running in
* the unified hierarchy and the container does the same, and we did not create a scope unit for the container
@@ -243,58 +275,195 @@ static int create_subcgroup(pid_t pid, bool keep_unit) {
* legacy setup, this is fine too, since delegation of controllers is generally not safe there, hence we won't
* do it. */
- r = cg_mask_supported(&supported);
- if (r < 0)
- return log_error_errno(r, "Failed to determine supported controllers: %m");
+ const char *cgfilename;
+ _cleanup_fclose_ FILE *cgfile = NULL;
+ _cleanup_free_ char *cgroup = NULL;
+ int r;
- if (keep_unit)
- r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 0, &cgroup);
- else
- r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
- if (r < 0)
- return log_error_errno(r, "Failed to get our control group: %m");
+ assert(pid > 1);
+
+ if (!cg1sd_mountpoint && !cg2_mountpoint)
+ return 0;
- payload = strjoina(cgroup, "/payload");
- r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, payload, pid);
+ cgfilename = procfs_file_alloca(pid, "cgroup");
+ cgfile = fopen(cgfilename, "re");
+ if (!cgfile)
+ /* Bail out here; do not go on to parse a stream that failed to open. */
+ return log_error_errno(errno, "create subcgroup: Failed to get cgroup of the container: %m");
+ r = cgfile_get_cgroup(cgfile, cg2_mountpoint ? NULL : "name=systemd", &cgroup);
if (r < 0)
- return log_error_errno(r, "Failed to create %s subcgroup: %m", payload);
+ return log_error_errno(r, "create subcgroup: Failed to get cgroup of the container: %m");
- if (keep_unit) {
- const char *supervisor;
+ if (cg2_mountpoint) {
+ const char *cgdir;
+ const char *payload;
- supervisor = strjoina(cgroup, "/supervisor");
- r = cg_create_and_attach(SYSTEMD_CGROUP_CONTROLLER, supervisor, 0);
+ cgdir = strjoina(cg2_mountpoint, cgroup);
+ payload = strjoina(cgdir, "/payload");
+
+ r = mkdir_errno_wrapper(payload, 0755);
+ if (r < 0 && r != -EEXIST)
+ return log_error_errno(r, "create payload cgroup: Failed to create subcgroup %s: %m", payload);
+
+ r = cgdir_attach(payload, pid);
if (r < 0)
- return log_error_errno(r, "Failed to create %s subcgroup: %m", supervisor);
+ return log_error_errno(r, "create payload cgroup: Failed to move process to subcgroup %s: %m", payload);
+
+ if (keep_unit) {
+ const char *supervisor;
+
+ supervisor = strjoina(cgdir, "/supervisor");
+
+ r = mkdir_errno_wrapper(supervisor, 0755);
+ if (r < 0 && r != -EEXIST)
+ return log_error_errno(r, "create supervisor cgroup: Failed to create subcgroup %s: %m", supervisor);
+
+ r = cgdir_attach(supervisor, 0);
+ if (r < 0)
+ return log_error_errno(r, "create supervisor cgroup: Failed to move process to subcgroup %s: %m", supervisor);
+ }
+
+ (void) cgdir_enable_all(cgdir);
+ }
+
+ if (cg1sd_mountpoint) {
+ const char *cgdir;
+ const char *payload;
+
+ cgdir = strjoina(cg1sd_mountpoint, cgroup);
+ payload = strjoina(cgdir, "/payload");
+
+ r = mkdir_errno_wrapper(payload, 0755);
+ if (r < 0 && r != -EEXIST)
+ return log_error_errno(r, "create payload cgroup: Failed to create subcgroup %s: %m", payload);
+
+ r = cgdir_attach(payload, pid);
+ if (r < 0)
+ return log_error_errno(r, "create payload cgroup: Failed to move process to subcgroup %s: %m", payload);
+
+ if (keep_unit) {
+ const char *supervisor;
+
+ supervisor = strjoina(cgdir, "/supervisor");
+
+ r = mkdir_errno_wrapper(supervisor, 0755);
+ if (r < 0 && r != -EEXIST)
+ return log_error_errno(r, "create supervisor cgroup: Failed to create subcgroup %s: %m", supervisor);
+
+ r = cgdir_attach(supervisor, 0);
+ if (r < 0)
+ return log_error_errno(r, "create supervisor cgroup: Failed to move process to subcgroup %s: %m", supervisor);
+ }
}
- /* Try to enable as many controllers as possible for the new payload. */
- (void) cg_enable_everywhere(supported, supported, cgroup);
return 0;
}
-int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift, bool keep_unit) {
+static int cgroup_setup_internal(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift, bool keep_unit, const char *cg1sd_mountpoint, const char *cg2_mountpoint) { /* Runs the setup steps against the given mount points:
+ * sync_cgroup() (only when exactly one of outer_cgver/inner_cgver is >= CGROUP_UNIFIED_SYSTEMD232), then
+ * create_subcgroup(), then chown_cgroup(). The first negative result is returned unchanged; 0 if every
+ * step succeeds. */
int r;
if ((outer_cgver >= CGROUP_UNIFIED_SYSTEMD232) != (inner_cgver >= CGROUP_UNIFIED_SYSTEMD232)) {
/* sync the name=systemd hierarchy with the unified hierarchy */
- r = sync_cgroup(pid, outer_cgver, inner_cgver, uid_shift);
+ r = sync_cgroup(pid, uid_shift, outer_cgver == CGROUP_UNIFIED_NONE ? cg2_mountpoint : cg1sd_mountpoint); /* target: cg2_mountpoint when the host is CGROUP_UNIFIED_NONE, otherwise cg1sd_mountpoint */
if (r < 0)
return r;
}
- r = create_subcgroup(pid, keep_unit);
+ r = create_subcgroup(pid, keep_unit, cg1sd_mountpoint, cg2_mountpoint); /* both mount points are passed through unchanged */
if (r < 0)
return r;
- r = chown_cgroup(pid, inner_cgver, uid_shift);
+ r = chown_cgroup(pid, uid_shift, cg1sd_mountpoint, cg2_mountpoint); /* runs last, after the subcgroups have been created */
if (r < 0)
return r;
return 0;
}
+/* Set up the container's cgroups in every hierarchy the container will use.
+ *
+ * inner_cgver decides which hierarchies are used (cgroup-v2 and/or cgroup-v1 name=systemd); outer_cgver decides
+ * where each of them is found: either at a fixed path below /sys/fs/cgroup, or, when the host does not have
+ * that hierarchy mounted there, on a temporary mount point below /tmp that is unmounted and removed again
+ * before returning. The actual work is done by cgroup_setup_internal().
+ *
+ * Returns the result of cgroup_setup_internal(); if that succeeded but unmounting a temporary mount failed,
+ * the unmount error is returned instead. Failures while preparing the mount points (including out-of-memory)
+ * are logged and returned as negative values. */
+int cgroup_setup(pid_t pid, CGroupUnified outer_cgver, CGroupUnified inner_cgver, uid_t uid_shift, bool keep_unit) {
+
+        /* The main purpose of this function is to properly "delegate" parts of the cgroup hierarchies to the container
+         * (see doc/CGROUP_DELEGATION.md). In general, delegating cgroup-v1 hierarchies is *not safe*, so we
+         * don't. However, systemd takes extra measures to make delegating the name=systemd hierarchy safe, so we make
+         * an exception for it. */
+
+        bool cg1sd_used, cg2_used;
+        /* Set only once the corresponding temporary mount has actually been established. */
+        bool cg2_temporary = false, cg1sd_temporary = false;
+        _cleanup_free_ char *cg2_mountpoint = NULL;
+        _cleanup_free_ char *cg1sd_mountpoint = NULL;
+        int r, q;
+
+        assert(outer_cgver != CGROUP_UNIFIED_UNKNOWN);
+        assert(inner_cgver != CGROUP_UNIFIED_UNKNOWN);
+
+        cg1sd_used = inner_cgver == CGROUP_UNIFIED_NONE || inner_cgver == CGROUP_UNIFIED_SYSTEMD233;
+        cg2_used = inner_cgver >= CGROUP_UNIFIED_SYSTEMD232;
+        if (cg2_used) {
+                switch (outer_cgver) {
+                case CGROUP_UNIFIED_SYSTEMD233: cg2_mountpoint = strdup("/sys/fs/cgroup/unified"); break;
+                case CGROUP_UNIFIED_SYSTEMD232: cg2_mountpoint = strdup("/sys/fs/cgroup/systemd"); break;
+                case CGROUP_UNIFIED_ALL: cg2_mountpoint = strdup("/sys/fs/cgroup"); break;
+                case CGROUP_UNIFIED_NONE:
+                        cg2_mountpoint = strdup("/tmp/container-cg2-XXXXXX");
+                        if (!cg2_mountpoint)
+                                return log_oom();
+                        if (!mkdtemp(cg2_mountpoint))
+                                return log_error_errno(errno, "Failed to create temporary mount point for container cgroup hierarchy: %m");
+                        r = mount_verbose(LOG_ERR, "cgroup", cg2_mountpoint, "cgroup2",
+                                          MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
+                        if (r < 0) {
+                                log_error("Failed to mount container cgroup hierarchy");
+                                (void) rmdir(cg2_mountpoint);
+                                return r;
+                        }
+                        cg2_temporary = true;
+                        break;
+                default:
+                case CGROUP_UNIFIED_UNKNOWN:
+                        assert_not_reached("invalid outer_cgver");
+                }
+
+                /* Covers the fixed-path cases above; nothing has been mounted for those, so a plain return is fine. */
+                if (!cg2_mountpoint)
+                        return log_oom();
+        }
+        if (cg1sd_used) {
+                /* From here on a temporary cgroup-v2 mount may exist, so failures go through "finish". */
+                switch (outer_cgver) {
+                case CGROUP_UNIFIED_NONE:
+                case CGROUP_UNIFIED_SYSTEMD233:
+                        cg1sd_mountpoint = strdup("/sys/fs/cgroup/systemd");
+                        if (!cg1sd_mountpoint) {
+                                r = log_oom();
+                                goto finish;
+                        }
+                        break;
+                case CGROUP_UNIFIED_SYSTEMD232:
+                case CGROUP_UNIFIED_ALL:
+                        cg1sd_mountpoint = strdup("/tmp/container-cg1sd-XXXXXX");
+                        if (!cg1sd_mountpoint) {
+                                r = log_oom();
+                                goto finish;
+                        }
+                        if (!mkdtemp(cg1sd_mountpoint)) {
+                                r = log_error_errno(errno, "Failed to create temporary mount point for container cgroup hierarchy: %m");
+                                goto finish;
+                        }
+                        r = mount_verbose(LOG_ERR, "cgroup", cg1sd_mountpoint, "cgroup",
+                                          MS_NOSUID|MS_NOEXEC|MS_NODEV, "none,name=systemd,xattr");
+                        if (r < 0) {
+                                log_error("Failed to mount container cgroup hierarchy");
+                                (void) rmdir(cg1sd_mountpoint);
+                                goto finish;
+                        }
+                        cg1sd_temporary = true;
+                        break;
+                default:
+                case CGROUP_UNIFIED_UNKNOWN:
+                        assert_not_reached("can't use legacy/hybrid container on unknown host");
+                        break;
+                }
+        }
+
+        r = cgroup_setup_internal(pid, outer_cgver, inner_cgver, uid_shift, keep_unit,
+                                  cg1sd_used ? cg1sd_mountpoint : NULL, cg2_used ? cg2_mountpoint : NULL);
+
+finish:
+        /* Tear down whatever temporary mounts exist; an earlier error in r takes precedence over unmount errors. */
+        if (cg2_temporary) {
+                q = umount_verbose(cg2_mountpoint);
+                if (r >= 0 && q < 0)
+                        r = q;
+                (void) rmdir(cg2_mountpoint);
+        }
+        if (cg1sd_temporary) {
+                q = umount_verbose(cg1sd_mountpoint);
+                if (r >= 0 && q < 0)
+                        r = q;
+                (void) rmdir(cg1sd_mountpoint);
+        }
+
+        return r;
+}
+
/* cgroup_decide_mounts *********************************************/
/* Retrieve a list of cgroup v1 hierarchies. */
diff --git a/src/nspawn/nspawn.c b/src/nspawn/nspawn.c
index f64faee25b..5ae11ea0ce 100644
--- a/src/nspawn/nspawn.c
+++ b/src/nspawn/nspawn.c
@@ -369,7 +369,7 @@ static int detect_inner_cgver_from_image(const char *directory, CGroupUnified ou
* to sniff the systemd version) was only added in 231, so we'll have a false negative here for 230. */
r = systemd_installation_has_version(directory, 230);
if (r < 0)
- return log_error_errno(r, "Failed to determine systemd version in container: %m");
+ return log_error_errno(r, "Failed to decide cgroup version to use: Failed to determine systemd version in container: %m");
if (r > 0)
arg_inner_cgver = CGROUP_UNIFIED_ALL;
else
@@ -379,7 +379,7 @@ static int detect_inner_cgver_from_image(const char *directory, CGroupUnified ou
/* systemd v233+ -style mixed cgroup hierarchy */
r = systemd_installation_has_version(directory, 233);
if (r < 0)
- return log_error_errno(r, "Failed to determine systemd version in container: %m");
+ return log_error_errno(r, "Failed to decide cgroup version to use: Failed to determine systemd version in container: %m");
if (r > 0)
arg_inner_cgver = CGROUP_UNIFIED_SYSTEMD233;
else