The LXC driver currently allows custom mount points to be set up inside the container. This only works for non-root mount points. You cannot replace the entire root filesystem. This patch adds support for replacing the entire root filesystem, thus allowing the use of LXC containers as a 'better chroot than chroot'. Well, with one minor flaw - the Linux kernel currently has no device namespace virtualization, so the admin inside the container can just do a 'mknod' and access the real devices of the host. So for now this patch doesn't make LXC containers secure, but a future kernel release will enable them to be secure. lxc_container.c | 253 ++++++++++++++++++++++++++++++++++++++++++++++++-------- util.c | 12 +- 2 files changed, 226 insertions(+), 39 deletions(-) Daniel diff -r eaa42985aed4 src/lxc_container.c --- a/src/lxc_container.c Tue Aug 05 16:50:59 2008 +0100 +++ b/src/lxc_container.c Tue Aug 05 16:51:14 2008 +0100 @@ -1,10 +1,12 @@ /* * Copyright IBM Corp. 2008 + * Copyright Red Hat 2008 * * lxc_container.c: file description * * Authors: * David L. Leskovec <dlesko at linux.vnet.ibm.com> + * Daniel P. 
Berrange <berrange@xxxxxxxxxx> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public @@ -28,10 +30,18 @@ #include <fcntl.h> #include <limits.h> #include <stdlib.h> +#include <stdio.h> #include <sys/ioctl.h> #include <sys/mount.h> #include <sys/wait.h> #include <unistd.h> +#include <mntent.h> + +/* Yes, we want linux private one, for _syscall2() macro */ +#include <linux/unistd.h> + +/* For MS_MOVE */ +#include <linux/fs.h> #include "lxc_container.h" #include "util.h" @@ -105,23 +115,15 @@ * * Returns 0 on success or -1 in case of error */ -static int lxcContainerSetStdio(int control, const char *ttyPath) +static int lxcContainerSetStdio(int control, int ttyfd) { int rc = -1; - int ttyfd; int open_max, i; if (setsid() < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, _("setsid failed: %s"), strerror(errno)); - goto error_out; - } - - ttyfd = open(ttyPath, O_RDWR|O_NOCTTY); - if (ttyfd < 0) { - lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("open(%s) failed: %s"), ttyPath, strerror(errno)); - goto error_out; + goto cleanup; } if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) { @@ -159,8 +161,6 @@ cleanup: close(ttyfd); - -error_out: return rc; } @@ -223,6 +223,7 @@ return 0; } + /** * lxcEnableInterfaces: * @vm: Pointer to vm structure @@ -252,6 +253,20 @@ return rc; } + +//_syscall2(int, pivot_root, char *, newroot, const char *, oldroot) +extern int pivot_root(const char * new_root,const char * put_old); + +static int lxcContainerChildMountSort(const void *a, const void *b) +{ + const char **sa = (const char**)a; + const char **sb = (const char**)b; + + /* Delibrately reversed args - we need to unmount deepest + children first */ + return strcmp(*sb, *sa); +} + /** * lxcChild: * @argv: Pointer to container arguments @@ -269,8 +284,8 @@ int rc = -1; lxc_child_argv_t *argv = data; virDomainDefPtr vmDef = argv->config; - virDomainFSDefPtr curMount; - int i; + virDomainFSDefPtr tmp, root = NULL; + int 
ttyfd, i; if (NULL == vmDef) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, @@ -278,36 +293,210 @@ return -1; } +#if 0 + ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY); + if (ttyfd < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("open(%s) failed: %s"), argv->ttyPath, strerror(errno)); + return -1; + } +#endif + /* handle the bind mounts first before doing anything else that may */ /* then access those mounted dirs */ - curMount = vmDef->fss; - for (i = 0; curMount; curMount = curMount->next) { - // XXX fix - if (curMount->type != VIR_DOMAIN_FS_TYPE_MOUNT) + for (tmp = vmDef->fss; tmp && !root; tmp = tmp->next) { + if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT) continue; - rc = mount(curMount->src, - curMount->dst, - NULL, - MS_BIND, - NULL); - if (0 != rc) { + if (STREQ(tmp->dst, "/")) + root = tmp; + } + + if (root) { + char *oldroot; + struct mntent *mntent; + char **mounts = NULL; + int nmounts = 0; + FILE *procmnt; + struct { + int maj; + int min; + const char *path; + } devs[] = { + { 1, 3, "/dev/null" }, + { 1, 5, "/dev/zero" }, + { 1, 7, "/dev/full" }, + { 5, 1, "/dev/console" }, + }; + + /* Got a FS mapped to /, we're going the pivot_root + approach to do a better-chroot-than-chroot */ + + /* this is based on this thread http://lkml.org/lkml/2008/3/5/29 */ + + /* First step is to ensure the new root itself is + a mount point */ + if (mount(root->src, root->src, NULL, MS_BIND, NULL) < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to mount %s at %s for container: %s"), - curMount->src, curMount->dst, strerror(errno)); + _("failed to bind new root %s: %s"), + root->src, strerror(errno)); + return -1; + } + + if (asprintf(&oldroot, "%s/.oldroot", root->src) < 0) { + lxcError(NULL, NULL, VIR_ERR_NO_MEMORY, NULL); + return -1; + } + + if (virFileMakePath(oldroot) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to create %s: %s"), + oldroot, strerror(errno)); + return -1; + } + + /* The old root directory will live at 
/.oldroot after + * this and will soon be unmounted completely */ + if (pivot_root(root->src, oldroot) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to pivot root %s to %s: %s"), + oldroot, root->src, strerror(errno)); + return -1; + } + + /* CWD is undefined after pivot_root, so go to / */ + if (chdir("/") < 0) { + return -1; + } + + if (virFileMakePath("/proc") < 0 || + mount("none", "/proc", "proc", 0, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount /proc for container: %s"), + strerror(errno)); + return -1; + } + if (virFileMakePath("/dev") < 0 || + mount("none", "/dev", "tmpfs", 0, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount /dev tmpfs for container: %s"), + strerror(errno)); + return -1; + } + /* Move old devpts into container, since we have to + connect to the master ptmx which was opened in + the parent. + XXX This sucks, we need to figure out how to get our + own private devpts for isolation + */ + if (virFileMakePath("/dev/pts") < 0 || + mount("/.oldroot/dev/pts", "/dev/pts", NULL, + MS_MOVE, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to move /dev/pts into container: %s"), + strerror(errno)); + return -1; + } + + /* Populate /dev/ with a few important bits */ + umask(0); + for (i = 0 ; i < ARRAY_CARDINALITY(devs) ; i++) { + dev_t dev = makedev(devs[i].maj, devs[i].min); + if (mknod(devs[i].path, + 0777 | S_IFCHR, + dev) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to make device %s: %s"), + devs[i].path, strerror(errno)); + return -1; + } + } + umask(0700); + + /* Pull in rest of container's mounts */ + for (tmp = vmDef->fss; tmp; tmp = tmp->next) { + char *src; + if (STREQ(tmp->dst, "/")) + continue; + // XXX fix + if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT) + continue; + + if (asprintf(&src, "/.oldroot/%s", tmp->src) < 0) + return -1; + + if (virFileMakePath(tmp->dst) < 0 || + mount(src, tmp->dst, NULL, MS_BIND, 
NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount %s at %s for container: %s"), + tmp->src, tmp->dst, strerror(errno)); + return -1; + } + VIR_FREE(src); + } + + if (!(procmnt = setmntent("/proc/mounts", "r"))) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to read /proc/mounts: %s"), + strerror(errno)); + return -1; + } + while ((mntent = getmntent(procmnt)) != NULL) { + if (!STRPREFIX(mntent->mnt_dir, "/.oldroot")) + continue; + if (VIR_REALLOC_N(mounts, nmounts+1) < 0) + return -1; + mounts[nmounts++] = strdup(mntent->mnt_dir); + } + endmntent(procmnt); + + qsort(mounts, nmounts, sizeof(mounts[0]), + lxcContainerChildMountSort); + + for (i = 0 ; i < nmounts ; i++) { + if (umount(mounts[i]) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to unmount %s: %s"), + mounts[i], strerror(errno)); + return -1; + } + } + } else { + /* Nothing mapped to /, we're using the main root, + but with extra stuff mapped in */ + for (tmp = vmDef->fss; tmp; tmp = tmp->next) { + // XXX fix + if (tmp->type != VIR_DOMAIN_FS_TYPE_MOUNT) + continue; + rc = mount(tmp->src, + tmp->dst, + NULL, + MS_BIND, + NULL); + if (0 != rc) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount %s at %s for container: %s"), + tmp->src, tmp->dst, strerror(errno)); + return -1; + } + } + + /* mount /proc */ + if (mount("lxcproc", "/proc", "proc", 0, NULL) < 0) { + lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, + _("failed to mount /proc for container: %s"), + strerror(errno)); return -1; } } - /* mount /proc */ - rc = mount("lxcproc", "/proc", "proc", 0, NULL); - if (0 != rc) { + ttyfd = open(argv->ttyPath, O_RDWR|O_NOCTTY); + if (ttyfd < 0) { lxcError(NULL, NULL, VIR_ERR_INTERNAL_ERROR, - _("failed to mount /proc for container: %s"), - strerror(errno)); + _("open(%s) failed: %s"), argv->ttyPath, strerror(errno)); return -1; } - if (lxcContainerSetStdio(argv->monitor, argv->ttyPath) < 0) + if (lxcContainerSetStdio(argv->monitor, 
ttyfd) < 0) return -1; /* Wait for interface devices to show up */ diff -r eaa42985aed4 src/util.c --- a/src/util.c Tue Aug 05 16:50:59 2008 +0100 +++ b/src/util.c Tue Aug 05 16:51:14 2008 +0100 @@ -524,13 +524,11 @@ if (!(p = strrchr(parent, '/'))) return EINVAL; - if (p == parent) - return EPERM; - - *p = '\0'; - - if ((err = virFileMakePath(parent))) - return err; + if (p != parent) { + *p = '\0'; + if ((err = virFileMakePath(parent))) + return err; + } if (mkdir(path, 0777) < 0 && errno != EEXIST) return errno; -- |: Red Hat, Engineering, London -o- http://people.redhat.com/berrange/ :| |: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :| |: http://autobuild.org -o- http://search.cpan.org/~danberr/ :| |: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :| -- Libvir-list mailing list Libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list