This patch adds the start container support. A couple of new source files are added - lxc_container.h and lxc_container.c. These contain the setup code that runs within the container namespace prior to exec'ing the user-specified init.

This is a rough outline of the functions involved in starting a container and the namespace and process under which they run:

lxcVmStart() - runs under the caller's process
    lxcSetupTtyTunnel() - opens a tty and socket pair, tty stored in vmDef
    double fork to separate from parent process
        grandchild calls lxcStartContainer() - see below
        parent continues
    wait for child process(es)
    if child process was successful, change vm state to running
    return

lxcStartContainer() - runs in parent namespace, child process from lxcVmStart()
    Allocate stack for container
    clone() - child process will start in lxcChild() - see below
    exit() - once lxcTtyForward() returns, the container has exited

lxcChild() - runs within container, child process from clone()
    mount user filesystems
    mount container /proc
    lxcExecWithTty() - see below, will not return

lxcExecWithTty() - runs within container
    lxcSetupContainerTty() - opens tty for container
    Set up SIGCHLD handler
    fork()
        Child calls lxcExecContainerInit() - see below
        Parent continues
    lxcTtyForward() - shuttles data between file descriptors until flag is set
        in this case between the master end of the container tty and the master end of the parent tty
    exit() - when lxcTtyForward() returns, container init has exited

lxcExecContainerInit() - runs within container, child process from lxcExecWithTty()
    exec container's init
    if exec fails, exit()

There are (at least) a couple of issues I don't have good solutions for -

1) In this setup with a tty console, we end up with at least 2 processes per container. One process is running the user init. The CMD listed under ps will be the init as specified in the XML (unless the init changes it to something else). The other process is forwarding console traffic between the parent and container pts.
The CMD listed in ps will depend on the mgmt app used to start the container. Using virsh, it's something like this outside the container:

root 10141 1 93 22:05 pts/6 00:27:50 /home/dlesko/src/dev/libvirt-ss/libvirt/src/.libs/lt-virsh -c lxc:///

and this inside the container:

root 1 0 93 22:05 pts/6 00:29:19 /home/dlesko/src/dev/libvirt-ss/libvirt/src/.libs/lt-virsh -c lxc:///

This can be a bit confusing. I'm not sure how important it is, but it would be nice to change this to something a little more meaningful, as is done by ssh.

2) The container can stall when nothing is connected to the parent side pty and console output fills up the buffer. To avoid this, we set the parent side pty to be non-blocking. The result of this is that we will discard any console output once the buffer has filled. When a user does connect to the console, they may get a flood of (potentially very) old data. It would be nice to be able to provide some more recent output once someone connects to the console.

--
Best Regards, Dave Leskovec
IBM Linux Technology Center
Open Virtualization
--- src/Makefile.am | 1 src/lxc_container.c | 323 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/lxc_container.h | 43 ++++++ src/lxc_driver.c | 231 ++++++++++++++++++++++++++++++++++++- 4 files changed, 596 insertions(+), 2 deletions(-) Index: b/src/lxc_container.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/src/lxc_container.h 2008-03-14 11:25:02.000000000 -0700 @@ -0,0 +1,43 @@ +/* + * Copyright IBM Corp. 2008 + * + * lxc_container.h: header file for fcns run inside container + * + * Authors: + * David L. Leskovec <dlesko at linux.vnet.ibm.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef LXC_CONTAINER_H +#define LXC_CONTAINER_H + +#ifdef WITH_LXC + +/* Function declarations */ +/* lxcChild is the function handed to clone() by lxcStartContainer; argv is cast to lxc_vm_t * inside it. */ int lxcChild( void *argv ); + +#endif /* WITH_LXC */ + +#endif /* LXC_CONTAINER_H */ + +/* + * Local variables: + * indent-tabs-mode: nil + * c-indent-level: 4 + * c-basic-offset: 4 + * tab-width: 4 + * End: + */ Index: b/src/lxc_container.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ b/src/lxc_container.c 2008-03-19 22:59:14.000000000 -0700 @@ -0,0 +1,323 @@ +/* + * Copyright IBM Corp. 
2008 + * + * lxc_container.c: setup code run inside the container before the init is executed + * + * Authors: + * David L. Leskovec <dlesko at linux.vnet.ibm.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <config.h> + +#ifdef WITH_LXC + +#include <fcntl.h> +#include <limits.h> +#include <poll.h> +#include <signal.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <sys/mount.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "lxc_container.h" +#include "lxc_conf.h" +#include "util.h" + +#define DEBUG(fmt,...) 
VIR_DEBUG(__FILE__, fmt, __VA_ARGS__) +#define DEBUG0(msg) VIR_DEBUG(__FILE__, "%s", msg) + +/* Functions */ +/* lxcExecContainerInit: builds a shell command consisting of the exec prefix followed by vmDef->init and runs it through /bin/sh -c using execl. Never returns to the caller: if the allocation or the execl fails, the process exits with rc, which is still -1. */ static int lxcExecContainerInit(lxc_vm_def_t *vmDef) +{ + int rc = -1; + char* execString; + /* length of init, plus 1 for the terminator, plus 5 for the exec prefix */ int execStringLen = strlen(vmDef->init) + 1 + 5; + + if(NULL == (execString = calloc(execStringLen, sizeof(char)))) { + goto error_out; + } + + strcpy(execString, "exec "); + strcat(execString, vmDef->init); + + execl("/bin/sh", "sh", "-c", execString, (char*)NULL); + /* only reached when execl failed */ DEBUG("execl failed: %s", strerror(errno)); + +error_out: + exit(rc); +} + +/* lxcSetupContainerTty: opens a new pty master with posix_openpt, unlocks it, looks up the slave name with ptsname_r and opens the slave. On success returns 0 with the two descriptors stored through ttymaster and ttyslave. On failure returns -1 after closing any descriptor whose value is not -1; this relies on the caller having initialised both to -1. NOTE(review): the ptsname_r failure path logs a mount-related message, which looks copied from another function. NOTE(review): grantpt is not called here; confirm whether that is intended. */ static int lxcSetupContainerTty(int *ttymaster, int *ttyslave) +{ + int rc = -1; + char ttyName[PATH_MAX]; + + *ttymaster = posix_openpt(O_RDWR|O_NOCTTY); + if (*ttymaster < 0) { + DEBUG("posix_openpt() failed: %s", strerror(errno)); + goto cleanup; + } + + if (unlockpt(*ttymaster) < 0) { + DEBUG("unlockpt() failed: %s", strerror(errno)); + goto cleanup; + } + + if (0 != ptsname_r(*ttymaster, ttyName, sizeof(ttyName))) { + DEBUG("failed to mount for container: %s", strerror(errno)); + goto cleanup; + } + + *ttyslave = open(ttyName, O_RDWR|O_NOCTTY); + if (*ttyslave < 0) { + DEBUG("open(%s) failed: %s", ttyName, strerror(errno)); + goto cleanup; + } + + rc = 0; + +cleanup: + if (0 != rc) { + if (-1 != *ttyslave) { + close(*ttyslave); + } + if (-1 != *ttymaster) { + close(*ttymaster); + } + } + + return rc; +} + +/* lxcSetContainerStdio: starts a new session with setsid, makes ttyfd the controlling terminal with the TIOCSCTTY ioctl, then closes descriptors 0, 1 and 2 and duplicates ttyfd onto each of them. ttyfd itself is closed on every path. Returns 0 on success and -1 on the first failing step. */ static int lxcSetContainerStdio(int ttyfd) +{ + int rc = -1; + + if (setsid() < 0) { + DEBUG0("setsid failed"); + goto cleanup; + } + + if (ioctl(ttyfd, TIOCSCTTY, NULL) < 0) { + DEBUG("ioctl(TIOCSTTY) failed: %s", strerror(errno)); + goto cleanup; + } + + close(0); close(1); close(2); + + if (dup2(ttyfd, 0) < 0) { + DEBUG("dup2(stdin) failed: %s", strerror(errno)); + goto cleanup; + } + + if (dup2(ttyfd, 1) < 0) { + DEBUG("dup2(stdout) failed: %s", strerror(errno)); + goto cleanup; + } + + if (dup2(ttyfd, 2) < 0) { + DEBUG("dup2(stderr) failed: %s", strerror(errno)); + goto cleanup; + } + + rc = 0; + +cleanup: + 
close(ttyfd); + + return rc; +} + +/* lxcTtyForward: polls fd1 and fd2 for input (a negative descriptor is left out of the poll set) and, each time one is readable, reads one byte from it and writes that byte to the other descriptor. With a single descriptor the byte is read and dropped. The loop runs until *loopFlag is non-zero; pollmsecs is the poll timeout. Returns 0 when the loop ends through its while condition, -1 when there is nothing to poll or poll fails with an error other than EINTR or EAGAIN, and the last poll result when the flag is found set right after poll returned 0 or an error. NOTE(review): the results of saferead and safewrite are ignored. */ static int lxcTtyForward(int fd1, int fd2, int *loopFlag, int pollmsecs) +{ + int rc = -1; + int i; + char buf[2]; + struct pollfd fds[2]; + int numFds = 0; + + if (0 <= fd1) { + fds[numFds].fd = fd1; + fds[numFds].events = POLLIN; + ++numFds; + } + + if (0 <= fd2) { + fds[numFds].fd = fd2; + fds[numFds].events = POLLIN; + ++numFds; + } + + if (0 == numFds) { + DEBUG0("No fds to monitor, return"); + goto cleanup; + } + + while (!(*loopFlag)) { + if ((rc = poll(fds, numFds, pollmsecs)) <= 0) { + if(*loopFlag) { + goto cleanup; + } + + /* timeout or interrupted call: poll again */ if ((0 == rc) || (errno == EINTR) || (errno == EAGAIN)) { + continue; + } + + DEBUG("poll returned error: %s", strerror(errno)); + goto cleanup; + } + + for (i = 0; i < numFds; ++i) { + if (!fds[i].revents) { + continue; + } + + if (fds[i].revents & POLLIN) { + saferead(fds[i].fd, buf, 1); + if (1 < numFds) { + /* i ^ 1 selects the other of the two entries */ safewrite(fds[i ^ 1].fd, buf, 1); + } + + } + + } + + } + + rc = 0; + +cleanup: + return rc; +} + +/* pid returned by fork() in lxcExecWithTty; compared against si_pid in the SIGCHLD handler */ static pid_t initPid; +/* set to 1 by the SIGCHLD handler; passed to lxcTtyForward as its loop flag */ static int exitChildLoop; +/* lxcExecChildHandler: SIGCHLD handler installed with SA_SIGINFO by lxcExecWithTty. When the signalling pid equals initPid it sets exitChildLoop; for any other pid it reaps that child with a non-blocking waitpid. */ static void lxcExecChildHandler(int sig ATTRIBUTE_UNUSED, + siginfo_t *signalInfo, + void *context ATTRIBUTE_UNUSED) +{ + DEBUG("lxcExecChildHandler signal from %d\n", signalInfo->si_pid); + + if (signalInfo->si_pid == initPid) { + exitChildLoop = 1; + } else { + waitpid(signalInfo->si_pid, NULL, WNOHANG); + } + +} + +/* lxcExecWithTty: creates the container pty pair, installs the SIGCHLD handler and forks. The child attaches the pty slave as its stdio and calls lxcExecContainerInit. The parent closes the slave, forwards data between ttymaster and vm->parentTty until exitChildLoop is set, waits for initPid and exits with its exit status. Every path ends in exit(rc), so this function does not return to its caller. */ static int lxcExecWithTty(lxc_vm_t *vm) +{ + int rc = -1; + lxc_vm_def_t *vmDef = vm->def; + int ttymaster = -1; + int ttyslave = -1; + struct sigaction sigAction; + sigset_t sigMask; + int childStatus; + + if (lxcSetupContainerTty(&ttymaster, &ttyslave) < 0) { + goto exit_with_error; + } + + sigAction.sa_sigaction = lxcExecChildHandler; + sigfillset(&sigMask); + sigAction.sa_mask = sigMask; + sigAction.sa_flags = SA_SIGINFO; + if (0 != sigaction(SIGCHLD, &sigAction, NULL)) { + DEBUG("sigaction failed: %s\n", strerror(errno)); + goto exit_with_error; + } + + exitChildLoop = 0; + if ((initPid = fork()) == 0) { + 
if(lxcSetContainerStdio(ttyslave) < 0) { + exitChildLoop = 1; + goto exit_with_error; + } + + lxcExecContainerInit(vmDef); + /* this function will not return. if it fails, it will exit */ + } + + close(ttyslave); + lxcTtyForward(ttymaster, vm->parentTty, + &exitChildLoop, 100); + + DEBUG("child waiting on pid %d", initPid); + waitpid(initPid, &childStatus, 0); + rc = WEXITSTATUS(childStatus); + DEBUG("container exited with rc: %d", rc); + +exit_with_error: + exit(rc); +} + +/* lxcChild: function run by clone(). argv is the lxc_vm_t for the container. Bind-mounts every entry of vmDef->mounts, mounts proc on /proc and then calls lxcExecWithTty. Returns -1 when vm->def is NULL and the non-zero mount result when a mount fails. NOTE(review): the loop variable i is initialised but never used. */ int lxcChild( void *argv ) +{ + int rc = -1; + lxc_vm_t *vm = (lxc_vm_t *)argv; + lxc_vm_def_t *vmDef = vm->def; + lxc_mount_t *curMount; + int i; + + if (NULL == vmDef) { + DEBUG0("lxcChild() passed invalid vm definition"); + goto cleanup; + } + + /* handle the bind mounts first before doing anything else that may */ + /* then access those mounted dirs */ + curMount = vmDef->mounts; + for (i = 0; curMount; curMount = curMount->next) { + rc = mount(curMount->source, + curMount->target, + NULL, + MS_BIND, + NULL); + if (0 != rc) { + DEBUG("failed to mount for container: %s", strerror(errno)); + goto cleanup; + } + } + + /* mount /proc */ + rc = mount("lxcproc", "/proc", "proc", 0, NULL); + if (0 != rc) { + DEBUG("failed to mount /proc for container: %s", strerror(errno)); + goto cleanup; + } + + rc = lxcExecWithTty(vm); + /* this function will only return if an error occurred */ + +cleanup: + return rc; +} + +#endif /* WITH_LXC */ + +/* + * Local variables: + * indent-tabs-mode: nil + * c-indent-level: 4 + * c-basic-offset: 4 + * tab-width: 4 + * End: + */ Index: b/src/lxc_driver.c =================================================================== --- a/src/lxc_driver.c 2008-03-14 11:24:45.000000000 -0700 +++ b/src/lxc_driver.c 2008-03-19 22:59:58.000000000 -0700 @@ -25,14 +25,17 @@ #ifdef WITH_LXC +#include <fcntl.h> #include <sched.h> #include <sys/utsname.h> #include <string.h> #include <sys/types.h> +#include <termios.h> #include <unistd.h> #include <wait.h> #include "lxc_conf.h" +#include 
"lxc_container.h" #include "lxc_driver.h" #include "driver.h" #include "internal.h" @@ -360,6 +363,230 @@ return lxcGenerateXML(dom->conn, driver, vm, vm->def); } +/* lxcStartContainer: allocates a stack of four pages and clones lxcChild with new pid, mount, uts, user and ipc namespace flags, passing vm as its argument. The pid returned by clone is stored in vm->pid and vm->def->id and the config is saved with lxcSaveConfig. The function then calls exit with 0 on success or -1 on failure, so it never returns to its caller. ttyDev is unused. */ static int lxcStartContainer(lxc_driver_t* driver, + lxc_vm_t *vm, + int ttyDev ATTRIBUTE_UNUSED) +{ + int rc = -1; + int flags; + int stacksize = getpagesize() * 4; + void *stack, *stacktop; + + /* allocate a stack for the container */ + stack = malloc(stacksize); + if (!stack) { + DEBUG0("unable to allocate container stack"); + goto error_exit; + } + /* clone() is given the address just past the end of the allocation */ stacktop = (char*)stack + stacksize; + + flags = CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUTS|CLONE_NEWUSER|CLONE_NEWIPC|SIGCHLD; + + vm->pid = clone(lxcChild, stacktop, flags, (void *)vm); + + DEBUG("clone() returned, %d", vm->pid); + + if (vm->pid < 0) { + DEBUG("clone() failed, %s", strerror(errno)); + goto error_exit; + } + + vm->def->id = vm->pid; + lxcSaveConfig(NULL, driver, vm, vm->def); + + rc = 0; + +error_exit: + exit(rc); +} + +/* lxcPutTtyInRawMode: reads the termios settings of ttyDev, applies cfmakeraw to them and writes them back with TCSADRAIN. Returns 0 on success; on failure reports through lxcError and returns -1. */ static int lxcPutTtyInRawMode(virConnectPtr conn, int ttyDev) +{ + int rc = -1; + + struct termios ttyAttr; + + if (tcgetattr(ttyDev, &ttyAttr) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + "tcgetattr() failed: %s", strerror(errno)); + goto cleanup; + } + + cfmakeraw(&ttyAttr); + + if (tcsetattr(ttyDev, TCSADRAIN, &ttyAttr) < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + "tcsetattr failed: %s", strerror(errno)); + goto cleanup; + } + + rc = 0; + +cleanup: + return rc; +} + +/* lxcSetupTtyTunnel: when vmDef->tty is a non-empty string, opens it read-write and non-blocking, calls grantpt and unlockpt on it, copies the ptsname result into vmDef->tty if it differs, and puts the descriptor into raw mode. When vmDef->tty is empty, *ttyDev is set to -1. The descriptor is returned through ttyDev. Returns 0 on success and a negative value when open, grantpt or unlockpt fails. NOTE(review): once grantpt and unlockpt have succeeded rc holds 0, so the ptsname and raw-mode failure paths appear to return 0 and skip the close. NOTE(review): the close is guarded by *ttyDev > 0, which excludes descriptor 0. */ static int lxcSetupTtyTunnel(virConnectPtr conn, + lxc_vm_def_t *vmDef, + int* ttyDev) +{ + int rc = -1; + char *ptsStr; + + if (0 < strlen(vmDef->tty)) { + *ttyDev = open(vmDef->tty, O_RDWR|O_NOCTTY|O_NONBLOCK); + if (*ttyDev < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + "open() tty failed: %s", strerror(errno)); + goto setup_complete; + } + + rc = grantpt(*ttyDev); + if (rc < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + "grantpt() failed: %s", strerror(errno)); + goto setup_complete; + } + + rc = unlockpt(*ttyDev); + if 
(rc < 0) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + "unlockpt() failed: %s", strerror(errno)); + goto setup_complete; + } + + /* get the name of the slave side; it is stored in vmDef->tty below */ + ptsStr = ptsname(*ttyDev); + if (ptsStr == NULL) { + lxcError(conn, NULL, VIR_ERR_INTERNAL_ERROR, + "ptsname() failed"); + goto setup_complete; + } + /* This value may need to be stored in the container configuration file */ + if (STRNEQ(ptsStr, vmDef->tty)) { + strcpy(vmDef->tty, ptsStr); + } + + /* Enter raw mode, so all characters are passed directly to child */ + if (lxcPutTtyInRawMode(conn, *ttyDev) < 0) { + goto setup_complete; + } + + } else { + *ttyDev = -1; + } + + rc = 0; + +setup_complete: + if((0 != rc) && (*ttyDev > 0)) { + close(*ttyDev); + } + + return rc; +} + +/* lxcVmStart: opens the tty tunnel into vm->parentTty, then forks twice. The grandchild calls lxcStartContainer; the intermediate child waits for it and exits with the status value it got from waitpid; the caller waits for the intermediate child. When that status is 0 the vm is marked VIR_DOMAIN_RUNNING, the active and inactive counters are adjusted and the container config file is reloaded. Returns 0 in that case and -1 otherwise. The local ttyDev stays at -1 and is only passed through to lxcStartContainer. NOTE(review): the status values are used as filled in by waitpid, without WEXITSTATUS. NOTE(review): a fork() result of -1 is not handled. */ static int lxcVmStart(virConnectPtr conn, + lxc_driver_t * driver, + lxc_vm_t * vm) +{ + int rc = -1; + lxc_vm_def_t *vmDef = vm->def; + int ttyDev = -1; + pid_t forkPid; + int childStatus; + + /* open tty for the container */ + if(lxcSetupTtyTunnel(conn, vmDef, &vm->parentTty) < 0) { + goto cleanup; + } + + if(0 == (forkPid = fork())) + { + if(0 == (forkPid = fork())) + { + lxcStartContainer(driver, vm, ttyDev); + } + else + { + waitpid(forkPid, &childStatus, 0); + DEBUG("lxcStartContainer exited with rc = %d", childStatus); + exit(childStatus); + } + } + + waitpid(forkPid, &childStatus, 0); + + DEBUG("inner fork exited with rc = %d", childStatus); + + if(childStatus == 0) { + vm->state = VIR_DOMAIN_RUNNING; + driver->ninactivevms--; + driver->nactivevms++; + lxcLoadContainerConfigFile(driver, vm->configFileBase); + rc = 0; + } + +cleanup: + return rc; +} + +/* lxcDomainStart: registered in the driver table as domainCreate. Looks up the vm by dom->uuid and starts it with lxcVmStart. Returns the lxcVmStart result, or -1 after reporting VIR_ERR_INVALID_DOMAIN when no vm matches. */ static int lxcDomainStart(virDomainPtr dom) +{ + int rc = -1; + virConnectPtr conn = dom->conn; + lxc_driver_t *driver = (lxc_driver_t *)(conn->privateData); + lxc_vm_t *vm = lxcFindVMByUUID(driver, dom->uuid); + + if (!vm) { + lxcError(conn, dom, VIR_ERR_INVALID_DOMAIN, + "no domain with uuid"); + goto cleanup; + } + + rc = lxcVmStart(conn, driver, vm); + 
+cleanup: + return rc; +} + +/* lxcDomainCreateAndStart: registered in the driver table as domainCreateLinux. Parses xml into a definition, assigns it to a vm, saves it, starts the vm with lxcVmStart and returns the virDomainPtr obtained from virGetDomain with its id set from vm->def->id. Returns NULL when any step fails; the vm is removed with lxcRemoveInactiveVM when saving or starting fails. flags is unused. NOTE(review): def is freed with lxcFreeVMDef only when lxcAssignVMDef fails; confirm that lxcRemoveInactiveVM releases it on the save and start failure paths. */ static virDomainPtr +lxcDomainCreateAndStart(virConnectPtr conn, + const char *xml, + unsigned int flags ATTRIBUTE_UNUSED) { + lxc_driver_t *driver = (lxc_driver_t *)conn->privateData; + lxc_vm_t *vm; + lxc_vm_def_t *def; + virDomainPtr dom = NULL; + + if (!(def = lxcParseVMDef(conn, xml, NULL))) { + goto return_point; + } + + if (!(vm = lxcAssignVMDef(conn, driver, def))) { + lxcFreeVMDef(def); + goto return_point; + } + + if (lxcSaveVMDef(conn, driver, vm, def) < 0) { + lxcRemoveInactiveVM(driver, vm); + return NULL; + } + + if (lxcVmStart(conn, driver, vm) < 0) { + lxcRemoveInactiveVM(driver, vm); + goto return_point; + } + + dom = virGetDomain(conn, vm->def->name, vm->def->uuid); + if (dom) { + dom->id = vm->def->id; + } + +return_point: + return dom; +} + static int lxcStartup(virConnectPtr conn) { lxc_driver_t *driver; @@ -423,7 +650,7 @@ NULL, /* getCapabilities */ lxcListDomains, /* listDomains */ lxcNumDomains, /* numOfDomains */ - NULL/*lxcDomainCreateLinux*/, /* domainCreateLinux */ + lxcDomainCreateAndStart, /* domainCreateLinux */ lxcDomainLookupByID, /* domainLookupByID */ lxcDomainLookupByUUID, /* domainLookupByUUID */ lxcDomainLookupByName, /* domainLookupByName */ @@ -447,7 +674,7 @@ lxcDomainDumpXML, /* domainDumpXML */ lxcListDefinedDomains, /* listDefinedDomains */ lxcNumDefinedDomains, /* numOfDefinedDomains */ - NULL, /* domainCreate */ + lxcDomainStart, /* domainCreate */ lxcDomainDefine, /* domainDefineXML */ lxcDomainUndefine, /* domainUndefine */ NULL, /* domainAttachDevice */ Index: b/src/Makefile.am =================================================================== --- a/src/Makefile.am 2008-03-14 11:14:01.000000000 -0700 +++ b/src/Makefile.am 2008-03-14 11:25:02.000000000 -0700 @@ -61,6 +61,7 @@ openvz_driver.c openvz_driver.h \ lxc_driver.c lxc_driver.h \ lxc_conf.c lxc_conf.h \ + lxc_container.c lxc_container.h \ nodeinfo.h nodeinfo.c \ storage_conf.h storage_conf.c \ storage_driver.h storage_driver.c \
-- Libvir-list mailing list Libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list