From: "Daniel P. Berrange" <berrange@xxxxxxxxxx> Normal practice is for cgroups controllers to be mounted at /sys/fs/cgroup. When setting up a container, /sys is mounted with a new sysfs instance, thus we must re-mount all the cgroups controllers. The complexity is that we must mount them in the same layout as the host OS. ie if 'cpu' and 'cpuacct' were mounted at the same location in the host we must preserve this in the container. Also if any controllers are co-located we must setup symlinks from the individual controller name to the co-located mount-point Signed-off-by: Daniel P. Berrange <berrange@xxxxxxxxxx> --- src/lxc/lxc_container.c | 243 +++++++++++++++++++++++++++++++++++++++++++++-- src/util/cgroup.h | 2 + 2 files changed, 236 insertions(+), 9 deletions(-) diff --git a/src/lxc/lxc_container.c b/src/lxc/lxc_container.c index a3ca76c..e606108 100644 --- a/src/lxc/lxc_container.c +++ b/src/lxc/lxc_container.c @@ -35,6 +35,7 @@ #include <sys/stat.h> #include <unistd.h> #include <mntent.h> +#include <dirent.h> /* Yes, we want linux private one, for _syscall2() macro */ #include <linux/unistd.h> @@ -1122,6 +1123,192 @@ cleanup: } +struct lxcContainerCGroup { + const char *dir; + const char *linkDest; +}; + + +static void lxcContainerCGroupFree(struct lxcContainerCGroup *mounts, + size_t nmounts) +{ + size_t i; + + if (!mounts) + return; + + for (i = 0 ; i < nmounts ; i++) { + VIR_FREE(mounts[i].dir); + VIR_FREE(mounts[i].linkDest); + } + VIR_FREE(mounts); +} + + +static int lxcContainerIdentifyCGroups(struct lxcContainerCGroup **mountsret, + size_t *nmountsret) +{ + FILE *procmnt = NULL; + struct mntent mntent; + struct dirent *dent; + char mntbuf[1024]; + int ret = -1; + struct lxcContainerCGroup *mounts = NULL; + size_t nmounts = 0; + DIR *dh = NULL; + char *path = NULL; + + *mountsret = NULL; + *nmountsret = 0; + + VIR_DEBUG("Finding cgroups mount points under %s", VIR_CGROUP_SYSFS_MOUNT); + + if (!(procmnt = setmntent("/proc/mounts", "r"))) { + virReportSystemError(errno, "%s", + _("Failed to read /proc/mounts")); + return -1; + } + + while (getmntent_r(procmnt, &mntent, mntbuf, sizeof(mntbuf)) != NULL) { + VIR_DEBUG("Got %s", mntent.mnt_dir); + if (STRNEQ(mntent.mnt_type, "cgroup") || + !STRPREFIX(mntent.mnt_dir, VIR_CGROUP_SYSFS_MOUNT)) + continue; + + /* Skip named mounts with no controller since they're + * for application use only ie systemd */ + if (strstr(mntent.mnt_opts, "name=")) + continue; + + if (VIR_EXPAND_N(mounts, nmounts, 1) < 0) { + virReportOOMError(); + goto cleanup; + } + if (!(mounts[nmounts-1].dir = strdup(mntent.mnt_dir))) { + virReportOOMError(); + goto cleanup; + } + VIR_DEBUG("Grabbed %s", mntent.mnt_dir); + } + + VIR_DEBUG("Checking for symlinks in %s", VIR_CGROUP_SYSFS_MOUNT); + if (!(dh = opendir(VIR_CGROUP_SYSFS_MOUNT))) { + virReportSystemError(errno, + _("Unable to read directory %s"), + VIR_CGROUP_SYSFS_MOUNT); + goto cleanup; + } + + while ((dent = readdir(dh)) != NULL) { + ssize_t rv; + /* The cgroups links are just relative to the local + * dir so we don't need a large buf */ + char linkbuf[100]; + + if (dent->d_name[0] == '.') + continue; + + VIR_DEBUG("Checking entry %s", dent->d_name); + if (virAsprintf(&path, "%s/%s", VIR_CGROUP_SYSFS_MOUNT, dent->d_name) < 0) { + virReportOOMError(); + goto cleanup; + } + + if ((rv = readlink(path, linkbuf, sizeof(linkbuf)-1)) < 0) { + if (errno != EINVAL) { + virReportSystemError(errno, + _("Unable to resolve link %s"), + path); + VIR_FREE(path); + goto cleanup; + } + /* Ok not a link */ + VIR_FREE(path); + } else { + linkbuf[rv] = '\0'; + VIR_DEBUG("Got a link %s to %s", path, linkbuf); + if (VIR_EXPAND_N(mounts, nmounts, 1) < 0) { + virReportOOMError(); + goto cleanup; + } + if (!(mounts[nmounts-1].linkDest = strdup(linkbuf))) { + virReportOOMError(); + goto cleanup; + } + mounts[nmounts-1].dir = path; + path = NULL; + } + } + + *mountsret = mounts; + *nmountsret = nmounts; + ret = 0; + +cleanup: + closedir(dh); + endmntent(procmnt); + VIR_FREE(path); + + if (ret < 0) + lxcContainerCGroupFree(mounts, nmounts); + return ret; +} + + +static int lxcContainerMountCGroups(struct lxcContainerCGroup *mounts, + size_t nmounts) +{ + size_t i; + + VIR_DEBUG("Mounting cgroups at '%s'", VIR_CGROUP_SYSFS_MOUNT); + + if (virFileMakePath(VIR_CGROUP_SYSFS_MOUNT) < 0) { + virReportSystemError(errno, + _("Unable to create directory %s"), + VIR_CGROUP_SYSFS_MOUNT); + return -1; + } + + if (mount("tmpfs", VIR_CGROUP_SYSFS_MOUNT, "tmpfs", MS_NOSUID|MS_NODEV|MS_NOEXEC, "mode=755") < 0) { + virReportSystemError(errno, + _("Failed to mount %s on %s type %s"), + "tmpfs", VIR_CGROUP_SYSFS_MOUNT, "tmpfs"); + return -1; + } + + for (i = 0 ; i < nmounts ; i++) { + if (mounts[i].linkDest) { + VIR_DEBUG("Link mount point '%s' to '%s'", + mounts[i].dir, mounts[i].linkDest); + if (symlink(mounts[i].linkDest, mounts[i].dir) < 0) { + virReportSystemError(errno, + _("Unable to symlink directory %s to %s"), + mounts[i].dir, mounts[i].linkDest); + return -1; + } + } else { + VIR_DEBUG("Create mount point '%s'", mounts[i].dir); + if (virFileMakePath(mounts[i].dir) < 0) { + virReportSystemError(errno, + _("Unable to create directory %s"), + mounts[i].dir); + return -1; + } + + if (mount("cgroup", mounts[i].dir, "cgroup", + 0, mounts[i].dir + strlen(VIR_CGROUP_SYSFS_MOUNT) + 1) < 0) { + virReportSystemError(errno, + _("Failed to mount %s on %s"), + "cgroup", mounts[i].dir); + return -1; + } + } + } + + return 0; +} + + /* Got a FS mapped to /, we're going the pivot_root * approach to do a better-chroot-than-chroot * this is based on this thread http://lkml.org/lkml/2008/3/5/29 @@ -1132,31 +1319,49 @@ static int lxcContainerSetupPivotRoot(virDomainDefPtr vmDef, size_t nttyPaths, virSecurityManagerPtr securityDriver) { + struct lxcContainerCGroup *mounts = NULL; + size_t nmounts = 0; + int ret = -1; + + /* Before pivoting we need to identify any + * cgroups controllers that are mounted */ + if (lxcContainerIdentifyCGroups(&mounts, &nmounts) < 0) + goto cleanup; + /* Gives us a private root, leaving all parent OS mounts on /.oldroot */ if (lxcContainerPivotRoot(root) < 0) - return -1; + goto cleanup; /* Mounts the core /proc, /sys, etc filesystems */ if (lxcContainerMountBasicFS(vmDef, true, securityDriver) < 0) - return -1; + goto cleanup; + + /* Now we can re-mount the cgroups controllers in the + * same configuration as before */ + if (lxcContainerMountCGroups(mounts, nmounts) < 0) + goto cleanup; /* Mounts /dev/pts */ if (lxcContainerMountFSDevPTS(root) < 0) - return -1; + goto cleanup; /* Populates device nodes in /dev/ */ if (lxcContainerPopulateDevices(ttyPaths, nttyPaths) < 0) - return -1; + goto cleanup; /* Sets up any non-root mounts from guest config */ if (lxcContainerMountAllFS(vmDef, "/.oldroot", true) < 0) - return -1; + goto cleanup; /* Gets rid of all remaining mounts from host OS, including /.oldroot itself */ if (lxcContainerUnmountSubtree("/.oldroot", true) < 0) - return -1; + goto cleanup; - return 0; + ret = 0; + +cleanup: + lxcContainerCGroupFree(mounts, nmounts); + return ret; } @@ -1166,6 +1371,10 @@ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef, virDomainFSDefPtr root, virSecurityManagerPtr securityDriver) { + int ret = -1; + struct lxcContainerCGroup *mounts = NULL; + size_t nmounts = 0; + VIR_DEBUG("def=%p", vmDef); /* * This makes sure that any new filesystems in the @@ -1190,19 +1399,35 @@ static int lxcContainerSetupExtraMounts(virDomainDefPtr vmDef, if (lxcContainerMountAllFS(vmDef, "", false) < 0) return -1; + /* Before replacing /sys we need to identify any + * cgroups controllers that are mounted */ + if (lxcContainerIdentifyCGroups(&mounts, &nmounts) < 0) + goto cleanup; + /* Gets rid of any existing stuff under /proc, since we need new * namespace aware versions of those. We must do /proc second * otherwise we won't find /proc/mounts :-) */ if (lxcContainerUnmountSubtree("/sys", false) < 0 || lxcContainerUnmountSubtree("/proc", false) < 0) - return -1; + goto cleanup; /* Mounts the core /proc, /sys, etc filesystems */ if (lxcContainerMountBasicFS(vmDef, false, securityDriver) < 0) - return -1; + goto cleanup; + + /* Now we can re-mount the cgroups controllers in the + * same configuration as before */ + if (lxcContainerMountCGroups(mounts, nmounts) < 0) + goto cleanup; VIR_DEBUG("Mounting completed"); return 0; + + ret = 0; + +cleanup: + lxcContainerCGroupFree(mounts, nmounts); + return ret; } diff --git a/src/util/cgroup.h b/src/util/cgroup.h index 8486c42..857945d 100644 --- a/src/util/cgroup.h +++ b/src/util/cgroup.h @@ -16,6 +16,8 @@ struct virCgroup; typedef struct virCgroup *virCgroupPtr; +#define VIR_CGROUP_SYSFS_MOUNT "/sys/fs/cgroup" + enum { VIR_CGROUP_CONTROLLER_CPU, VIR_CGROUP_CONTROLLER_CPUACCT, -- 1.7.10.1 -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list