On Thu, Nov 24, 2016 at 03:47:58PM +0100, Michal Privoznik wrote: > Prime time. When it comes to spawning qemu process and > relabelling all the devices it's going to touch, there's inherent > race with other applications in the system (e.g. udev). Instead > of trying convincing udev to not touch libvirt managed devices, > we can create a separate mount namespace for the qemu, and mount > our own /dev there. Of course this puts more work onto us as we > have to maintain /dev files on each domain start and device > hot(un-)plug. On the other hand, this enhances security also. > > >From technical POV, on domain startup process the parent > (libvirtd) creates: > > /var/lib/libvirt/qemu/$domain.dev > /var/lib/libvirt/qemu/$domain.devpts > > The child (which is going to be qemu eventually) calls unshare() > to create new mount namespace. From now on anything that child > does is invisible to the parent. Child then mounts tmpfs on > $domain.dev (so that it still sees original /dev from the host) > and creates some devices (as explained in one of the previous > patches). The devices have to be created exactly as they are in > the host (including perms, seclabels, ACLs, ...). After that it > moves $domain.dev mount to /dev. > > What's the $domain.devpts mount there for then you ask? QEMU can > create PTYs for some chardevs. And historically we exposed the > host ends in our domain XML allowing users to connect to them. > Therefore we must preserve devpts mount to be shared with the > host's one. > > To make this patch as small as possible, creating of devices > configured for domain in question is implemented in next patches. IIUC, this means that QEMU startup will be broken for any guests that need host devices for disk, usb/pci passthrough, etc ? It is nice to keep the work incremental though. I wonder if there is any value in doing everything *except* the MS_MOVE of /var/lib/libvirt/qemu/$domain.dev -> /dev/. ie only turn it on once all the code is in place ? > Signed-off-by: Michal Privoznik <mprivozn@xxxxxxxxxx> > --- > src/qemu/qemu_domain.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++++ > src/qemu/qemu_domain.h | 10 ++ > src/qemu/qemu_process.c | 13 +++ > 3 files changed, 312 insertions(+) > > diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c > index 137d4d5..d6a1c29 100644 > --- a/src/qemu/qemu_domain.c > +++ b/src/qemu/qemu_domain.c > @@ -55,6 +55,9 @@ > > #include <sys/time.h> > #include <fcntl.h> > +#if defined(HAVE_SYS_MOUNT_H) > +# include <sys/mount.h> > +#endif > > #include <libxml/xpathInternals.h> > > @@ -1627,6 +1630,9 @@ qemuDomainObjPrivateXMLFormat(virBufferPtr buf, > virDomainChrTypeToString(priv->monConfig->type)); > } > > + if (priv->containerized) > + virBufferAddLit(buf, "<containerized/>\n"); I wonder if it is worth explicitly listing the namespaces we enabled ? eg <namespaces> <mount/> </namespaces> so we're prepared to deal with us adding use of more private namespaces in future ? > + > qemuDomainObjPrivateXMLFormatVcpus(buf, vm->def); > > if (priv->qemuCaps) { > @@ -1809,6 +1815,8 @@ qemuDomainObjPrivateXMLParse(xmlXPathContextPtr ctxt, > goto error; > } > > + priv->containerized = virXPathBoolean("count(./containerized) > 0", ctxt) > 0; > + > if ((n = virXPathNodeSet("./vcpus/vcpu", ctxt, &nodes)) < 0) > goto error; > > @@ -6653,3 +6661,284 @@ qemuDomainSupportsVideoVga(virDomainVideoDefPtr video, > > return true; > } > + > + > +static int > +qemuDomainCreateDevice(const char *device, > + const char *path, > + bool allow_noent) > +{ > + char *devicePath = NULL; > + struct stat sb; > + int ret = -1; > + > + if (!STRPREFIX(device, "/dev")) { > + virReportError(VIR_ERR_INTERNAL_ERROR, > + _("invalid device: %s"), > + device); > + goto cleanup; > + } > + > + if (virAsprintf(&devicePath, "%s/%s", > + path, device + 4) < 0) > + goto cleanup; > + > + if (stat(device, &sb) < 0) { > + if (errno == ENOENT && allow_noent) { > + /* Ignore non-existent device. */ > + ret = 0; > + goto cleanup; > + } > + > + virReportSystemError(errno, _("Unable to stat %s"), device); > + goto cleanup; > + } > + > + if (virFileMakeParentPath(devicePath) < 0) { > + virReportSystemError(errno, > + _("Unable to create %s"), > + devicePath); > + goto cleanup; > + } > + > + if (mknod(devicePath, sb.st_mode, sb.st_rdev) < 0) { > + virReportSystemError(errno, > + _("Failed to make device %s"), > + devicePath); > + goto cleanup; > + } > + > + if (chown(devicePath, sb.st_uid, sb.st_gid) < 0) { > + virReportSystemError(errno, > + _("Failed to chown device %s"), > + devicePath); > + goto cleanup; > + } > + > + if (virFileCopyACLs(device, devicePath) < 0 && > + errno != ENOTSUP) { > + virReportSystemError(errno, > + _("Failed to copy ACLs on device %s"), > + devicePath); > + goto cleanup; > + } Per my comments on earlier patches, I don't think we need to have the chown or ACL copy. Just have the dev owned by root:root and let our security drivers deal with the rest. > + > + ret = 0; > + cleanup: > + VIR_FREE(devicePath); > + return ret; > +} > + > + > + > +static int > +qemuDomainPopulateDevices(virQEMUDriverPtr driver, > + virDomainObjPtr vm ATTRIBUTE_UNUSED, > + const char *path) > +{ > + virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); > + const char *const *devices = (const char *const *) cfg->cgroupDeviceACL; > + size_t i; > + int ret = -1; > + > + if (!devices) > + devices = defaultDeviceACL; > + > + for (i = 0; devices[i]; i++) { > + if (qemuDomainCreateDevice(devices[i], path, true) < 0) > + goto cleanup; > + } > + > + ret = 0; > + cleanup: > + virObjectUnref(cfg); > + return ret; > +} > + > + > +static int > +qemuDomainSetupDev(virQEMUDriverPtr driver, > + virDomainObjPtr vm, > + const char *path) > +{ > + char *mount_options = NULL; > + char *opts = NULL; > + int ret = -1; > + > + VIR_DEBUG("Setting up /dev/ for domain %s", vm->def->name); > + > + mount_options = virSecurityManagerGetMountOptions(driver->securityManager, > + vm->def); > + > + if (!mount_options && > + VIR_STRDUP(mount_options, "") < 0) > + goto cleanup; > + > + /* > + * tmpfs is limited to 64kb, since we only have device nodes in there > + * and don't want to DOS the entire OS RAM usage > + */ > + if (virAsprintf(&opts, > + "mode=755,size=65536%s", mount_options) < 0) > + goto cleanup; > + > + if (virFileSetupDev(path, opts) < 0) > + goto cleanup; > + > + if (qemuDomainPopulateDevices(driver, vm, path) < 0) > + goto cleanup; > + > + ret = 0; > + cleanup: > + VIR_FREE(opts); > + VIR_FREE(mount_options); > + return ret; > +} > + > + > +int > +qemuDomainBuildNamespace(virQEMUDriverPtr driver, > + virDomainObjPtr vm) > +{ > + virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); > + qemuDomainObjPrivatePtr priv = vm->privateData; > + const unsigned long mount_flags = MS_MOVE; > + char *devPath = NULL; > + char *devptsPath = NULL; > + int ret = -1; > + > + if (!priv->containerized) { > + ret = 0; > + goto cleanup; > + } > + > + if (virAsprintf(&devPath, "%s/%s.dev", > + cfg->stateDir, vm->def->name) < 0 || > + virAsprintf(&devptsPath, "%s/%s.devpts", > + cfg->stateDir, vm->def->name) < 0) > + goto cleanup; > + > + if (qemuDomainSetupDev(driver, vm, devPath) < 0) > + goto cleanup; > + > + /* Save /dev/pts mount point because /dev/pts/NNN is exposed in our live > + * XML and other applications are supposed to be able to use it. */ > + if (mount("/dev/pts", devptsPath, NULL, mount_flags, NULL) < 0) { > + virReportSystemError(errno, "%s", > + _("Unable to move /dev/pts mount")); > + goto cleanup; > + } > + > + if (mount(devPath, "/dev", NULL, mount_flags, NULL) < 0) { > + virReportSystemError(errno, > + _("Failed to mount %s on /dev"), > + devPath); > + goto cleanup; > + } > + > + if (virFileMakePath("/dev/pts") < 0) { > + virReportSystemError(errno, "%s", > + _("Cannot create /dev/pts")); > + goto cleanup; > + } > + > + if (mount(devptsPath, "/dev/pts", NULL, mount_flags, NULL) < 0) { > + virReportSystemError(errno, > + _("Failed to mount %s on /dev/pts"), > + devptsPath); > + goto cleanup; > + } > + > + if (virFileBindMountDevice("/dev/pts/ptmx", "/dev/ptmx") < 0) > + goto cleanup; > + > + VIR_DEBUG("blaaah: %d", system("find /dev/ -ls > /tmp/blaaah")); > + VIR_DEBUG("blaaah: %d", system("echo >> /tmp/blaaah")); > + VIR_DEBUG("blaaah: %d", system("mount >> /tmp/blaaah")); Heh, :-) > + ret = 0; > + cleanup: > + virObjectUnref(cfg); > + VIR_FREE(devPath); > + return ret; > +} > + > + > +int > +qemuDomainCreateNamespace(virQEMUDriverPtr driver, > + virDomainObjPtr vm) > +{ > + virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); > + qemuDomainObjPrivatePtr priv = vm->privateData; > + int ret = -1; > + char *path = NULL; > + > +#if !defined(__linux__) > + /* Namespaces are Linux specific. On other platforms just > + * carry on with the old behaviour. */ > + return 0; > +#endif This feels quite likely to create compiler warnings on non-linux about unreachable code. It feels like we should just stub out the entire method instead. > + > + if (!virQEMUDriverIsPrivileged(driver)) { > + ret = 0; > + goto cleanup; > + } > + > + if (virAsprintf(&path, "%s/%s.dev", > + cfg->stateDir, vm->def->name) < 0) > + goto cleanup; > + > + if (virFileMakePath(path) < 0) { > + virReportSystemError(errno, > + _("Failed to create %s"), > + path); > + goto cleanup; > + } > + > + VIR_FREE(path); > + if (virAsprintf(&path, "%s/%s.devpts", > + cfg->stateDir, vm->def->name) < 0) > + goto cleanup; > + > + if (virFileMakePath(path) < 0) { > + virReportSystemError(errno, > + _("Failed to create %s"), > + path); > + goto cleanup; > + } > + > + priv->containerized = true; > + ret = 0; > + cleanup: > + VIR_FREE(path); > + virObjectUnref(cfg); > + return ret; > +} > + > + > +void > +qemuDomainDeleteNamespace(virQEMUDriverPtr driver, > + virDomainObjPtr vm) > +{ > + virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); > + qemuDomainObjPrivatePtr priv = vm->privateData; > + char *path; > + > + if (!priv->containerized) > + return; > + > + if (virAsprintf(&path, "%s/%s.dev", > + cfg->stateDir, vm->def->name) < 0) > + goto cleanup; > + > + virFileDeleteTree(path); > + > + VIR_FREE(path); > + if (virAsprintf(&path, "%s/%s.devpts", > + cfg->stateDir, vm->def->name) < 0) > + goto cleanup; > + > + virFileDeleteTree(path); > + cleanup: > + virObjectUnref(cfg); > + VIR_FREE(path); > +} This is running in the host namespace and custom /dev tmpfs was only ever mounted in the QEMU private namespace, so should be invisible to the host. As such, IIUC, these directories should both be 100% empty. IOW, it seems that we don't need virFileDeleteTree - plain rmdir should be sufficient and safer against accidents. > diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h > index 7650ff3..9c34476 100644 > --- a/src/qemu/qemu_domain.h > +++ b/src/qemu/qemu_domain.h > @@ -170,6 +170,8 @@ typedef qemuDomainObjPrivate *qemuDomainObjPrivatePtr; > struct _qemuDomainObjPrivate { > struct qemuDomainJobObj job; > > + bool containerized; > + > qemuMonitorPtr mon; > virDomainChrSourceDefPtr monConfig; > bool monJSON; > @@ -785,4 +787,12 @@ int qemuDomainCheckMonitor(virQEMUDriverPtr driver, > bool qemuDomainSupportsVideoVga(virDomainVideoDefPtr video, > virQEMUCapsPtr qemuCaps); > > +int qemuDomainBuildNamespace(virQEMUDriverPtr driver, > + virDomainObjPtr vm); > + > +int qemuDomainCreateNamespace(virQEMUDriverPtr driver, > + virDomainObjPtr vm); > + > +void qemuDomainDeleteNamespace(virQEMUDriverPtr driver, > + virDomainObjPtr vm); > #endif /* __QEMU_DOMAIN_H__ */ > diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c > index 93a38e1..4f64dd1 100644 > --- a/src/qemu/qemu_process.c > +++ b/src/qemu/qemu_process.c > @@ -2661,6 +2661,12 @@ static int qemuProcessHook(void *data) > if (virSecurityManagerClearSocketLabel(h->driver->securityManager, h->vm->def) < 0) > goto cleanup; > > + if (virProcessSetupPrivateMountNS() < 0) > + goto cleanup; > + > + if (qemuDomainBuildNamespace(h->driver, h->vm) < 0) > + goto cleanup; > + > if (virDomainNumatuneGetMode(h->vm->def->numa, -1, &mode) == 0) { > if (mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && > h->cfg->cgroupControllers & (1 << VIR_CGROUP_CONTROLLER_CPUSET) && > @@ -5434,6 +5440,11 @@ qemuProcessLaunch(virConnectPtr conn, > > qemuDomainLogContextMarkPosition(logCtxt); > > + VIR_DEBUG("Building mount namespace"); > + > + if (qemuDomainCreateNamespace(driver, vm) < 0) > + goto cleanup; > + > VIR_DEBUG("Clear emulator capabilities: %d", > cfg->clearEmulatorCapabilities); > if (cfg->clearEmulatorCapabilities) > @@ -6210,6 +6221,8 @@ void qemuProcessStop(virQEMUDriverPtr driver, > } > } > > + qemuDomainDeleteNamespace(driver, vm); > + > vm->taint = 0; > vm->pid = -1; > virDomainObjSetState(vm, VIR_DOMAIN_SHUTOFF, reason); Regards, Daniel -- |: http://berrange.com -o- http://www.flickr.com/photos/dberrange/ :| |: http://libvirt.org -o- http://virt-manager.org :| |: http://entangle-photo.org -o- http://search.cpan.org/~danberr/ :| -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list