From: Michael Galaxy <mgalaxy@xxxxxxxxxx> We start by introducing a backwards-compatible, comma-separated specification that will not break existing installations, such as in the following example: $ cat qemu.conf | grep memory_backing_dir memory_backing_dir = ["/path/to/pmem/0", "/path/to/pmem/1"] (The old syntax with a single string is also still supported) memory_backing_dir = "/path/to/dir" In our case, we almost always have two NUMA nodes, so in that example, we have two PMEM regions which are created on the Linux kernel command line that get mounted into those two locations for libvirt to use. Signed-off-by: Michael Galaxy <mgalaxy@xxxxxxxxxx> --- src/qemu/qemu_command.c | 6 +- src/qemu/qemu_conf.c | 139 +++++++++++++++++++++++++++++++++++----- src/qemu/qemu_conf.h | 16 +++-- src/qemu/qemu_driver.c | 37 +++++++---- src/qemu/qemu_hotplug.c | 6 +- src/qemu/qemu_process.c | 42 ++++++------ src/qemu/qemu_process.h | 6 +- tests/testutilsqemu.c | 5 +- 8 files changed, 191 insertions(+), 66 deletions(-) diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c index 28914c9c34..4227e683c9 100644 --- a/src/qemu/qemu_command.c +++ b/src/qemu/qemu_command.c @@ -3421,7 +3421,8 @@ qemuBuildMemoryBackendProps(virJSONValue **backendProps, } else { /* We can have both pagesize and mem source. If that's the case, * prefer hugepages as those are more specific. 
*/ - if (qemuGetMemoryBackingPath(priv->driver, def, mem->info.alias, &memPath) < 0) + + if (qemuGetMemoryBackingPath(def, priv->driver, priv->autoNodeset, mem->targetNode, mem->info.alias, &memPath) < 0) return -1; } @@ -7300,7 +7301,8 @@ qemuBuildMemPathStr(const virDomainDef *def, return -1; prealloc = true; } else if (def->mem.source == VIR_DOMAIN_MEMORY_SOURCE_FILE) { - if (qemuGetMemoryBackingPath(priv->driver, def, "ram", &mem_path) < 0) + // This path should not be reached if NUMA is requested + if (qemuGetMemoryBackingPath(def, priv->driver, priv->autoNodeset, 0, "ram", &mem_path) < 0) return -1; } diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c index b36bede6c3..c2d3e4b400 100644 --- a/src/qemu/qemu_conf.c +++ b/src/qemu/qemu_conf.c @@ -43,6 +43,7 @@ #include "virfile.h" #include "virstring.h" #include "virutil.h" +#include "virnuma.h" #include "configmake.h" #include "security/security_util.h" @@ -137,6 +138,9 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->cgroupControllers = -1; /* -1 == auto-detect */ + cfg->memoryBackingDirs = g_new0(char *, 1); + cfg->nmemoryBackingDirs = 1; + if (root != NULL) { cfg->logDir = g_strdup_printf("%s/log/qemu", root); cfg->swtpmLogDir = g_strdup_printf("%s/log/swtpm", root); @@ -153,7 +157,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir); cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir); cfg->nvramDir = g_strdup_printf("%s/nvram", cfg->libDir); - cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir); + cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir); } else if (privileged) { cfg->logDir = g_strdup_printf("%s/log/libvirt/qemu", LOCALSTATEDIR); @@ -174,7 +178,8 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->checkpointDir = g_strdup_printf("%s/checkpoint", cfg->libDir); cfg->autoDumpPath = g_strdup_printf("%s/dump", cfg->libDir); cfg->nvramDir = 
g_strdup_printf("%s/nvram", cfg->libDir); - cfg->memoryBackingDir = g_strdup_printf("%s/ram", cfg->libDir); + + cfg->memoryBackingDirs[0] = g_strdup_printf("%s/ram", cfg->libDir); cfg->swtpmStorageDir = g_strdup_printf("%s/lib/libvirt/swtpm", LOCALSTATEDIR); } else { @@ -201,7 +206,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, cfg->configBaseDir); cfg->autoDumpPath = g_strdup_printf("%s/qemu/dump", cfg->configBaseDir); cfg->nvramDir = g_strdup_printf("%s/qemu/nvram", cfg->configBaseDir); - cfg->memoryBackingDir = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir); + cfg->memoryBackingDirs[0] = g_strdup_printf("%s/qemu/ram", cfg->configBaseDir); cfg->swtpmStorageDir = g_strdup_printf("%s/qemu/swtpm", cfg->configBaseDir); } @@ -294,6 +299,7 @@ virQEMUDriverConfig *virQEMUDriverConfigNew(bool privileged, static void virQEMUDriverConfigDispose(void *obj) { virQEMUDriverConfig *cfg = obj; + size_t i; virBitmapFree(cfg->namespaces); @@ -369,7 +375,12 @@ static void virQEMUDriverConfigDispose(void *obj) virFirmwareFreeList(cfg->firmwares, cfg->nfirmwares); - g_free(cfg->memoryBackingDir); + for (i = 0; i < cfg->nmemoryBackingDirs; i++) { + g_free(cfg->memoryBackingDirs[i]); + } + + g_free(cfg->memoryBackingDirs); + g_free(cfg->swtpmStorageDir); g_strfreev(cfg->capabilityfilters); @@ -1018,15 +1029,21 @@ static int virQEMUDriverConfigLoadMemoryEntry(virQEMUDriverConfig *cfg, virConf *conf) { - g_autofree char *dir = NULL; + char **memoryBackingDirs = NULL; int rc; - if ((rc = virConfGetValueString(conf, "memory_backing_dir", &dir)) < 0) + if ((rc = virConfGetValueStringList(conf, "memory_backing_dir", true, &memoryBackingDirs) < 0)) return -1; - if (rc > 0) { - VIR_FREE(cfg->memoryBackingDir); - cfg->memoryBackingDir = g_strdup_printf("%s/libvirt/qemu", dir); + if (rc == 0) { + size_t i; + for (i = 0; i < cfg->nmemoryBackingDirs; i++) + g_free(cfg->memoryBackingDirs[i]); + + cfg->nmemoryBackingDirs = g_strv_length(memoryBackingDirs); + 
cfg->memoryBackingDirs = g_new0(char *, cfg->nmemoryBackingDirs); + for (i = 0; i < cfg->nmemoryBackingDirs; i++) + cfg->memoryBackingDirs[i] = g_strdup_printf("%s/libvirt/qemu", memoryBackingDirs[i]); } return 0; @@ -1604,22 +1621,108 @@ qemuGetDomainHupageMemPath(virQEMUDriver *driver, int -qemuGetMemoryBackingDomainPath(virQEMUDriver *driver, - const virDomainDef *def, +qemuGetMemoryBackingDomainPath(const virDomainDef *def, + virQEMUDriver *driver, + virBitmap *autoNodeset, + const size_t targetNode, char **path) { g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); const char *root = driver->embeddedRoot; g_autofree char *shortName = NULL; + size_t path_index = 0; // original behavior, described below if (!(shortName = virDomainDefGetShortName(def))) return -1; - if (root && !STRPREFIX(cfg->memoryBackingDir, root)) { + /* + * We have three use cases: + * + * 1. Domain has multiple NUMA nodes, but they have only specified + * a single directory path in qemu.conf. (Original default behavior). + * + * In this case, we already placed the memory backing path for each NUMA node + * into the same path location. Preserve the established default behavior. + * + * 2. Domain has multiple NUMA nodes, but we have asked for multiple directory + * paths as well. + * + * In this case, we will have a one-to-one relationship between the number + * of NUMA nodes and the order in which the paths are provided. + * If the user does not specify enough paths, then we need to throw an error. + * NOTE: This is open to comment. The "ordering" of the paths here is not initially + * configurable to preserve backwards compatibility with the original qemu.conf syntax. + * If controlling the ordering is desired, we would need to revise the syntax in + * qemu.conf to make that possible. That hasn't been needed so far. + * + * NOTE A): We must check with numatune here, if requested. The number of NUMA nodes + * may be less than or equal to the number of provided paths.
If it is less, + * we have to respect the choices made by numatune. In this case, we will map the + * physical NUMA nodes (0, 1, 2...) in the order in which they appear in qemu.conf + * + * 3. Domain has a single NUMA node, but we have asked for multiple directory paths. + * + * In this case we also need to check if numatune is requested. If so, + * we want to pick the path indicated by numatune. + * + * NOTE B): In both cases 2 and 3, if numatune is requested, the path obviously cannot + * be changed on the fly, like it normally would be in "restrictive" mode + * during runtime. So, we will only do this if the mode requested is "strict". + * + * NOTE C): Furthermore, in both cases 2 and 3, if the number of directory paths provided + * is more than one, and one of either: a) no numatune is provided at all or + * b) numatune is in fact provided, but the mode is not strict, + * then we must throw an error. This is because we cannot know which backing + * directory path to choose without the user's input. + * + * NOTE D): If one or more directory paths is requested in any of the cases 1, 2, or 3, + * the numatune cannot specify more than one NUMA node, because the only mode + * possible with directory paths is "strict" (e.g. automatic numa balancing of + * memory will not work). Only one numa node can be requested by numatune, else + * we must throw an error. + */ + + if (cfg->nmemoryBackingDirs > 1) { + virBitmap *numaBitmap = virDomainNumatuneGetNodeset(def->numa, autoNodeset, targetNode); + size_t numa_node_count = virDomainNumaGetNodeCount(def->numa); + virDomainNumatuneMemMode mode; + + if (def->numa && numaBitmap && virNumaNodesetIsAvailable(numaBitmap) && + virDomainNumatuneGetMode(def->numa, -1, &mode) == 0 && + mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT && + virBitmapCountBits(numaBitmap) == 1) { + + // Is numatune provided? + // Is it strict? + // Does it only specify a single pinning for this target? + // Yes to all 3? then good to go.
+ + if (cfg->nmemoryBackingDirs < numa_node_count) { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("Domain requesting configuration for %1$lu NUMA nodes, but memory backing directory only has (%2$lu) directory paths available. Either reduce this to one directory or provide more paths to use."), + numa_node_count, + cfg->nmemoryBackingDirs); + return -1; + } + + path_index = virBitmapNextSetBit(numaBitmap, -1); + } else if (numa_node_count > 1 && numa_node_count == cfg->nmemoryBackingDirs) { + // Be nice. A valid numatune and pinning has not been specified, but the number + // of paths matches up exactly, so just assign them one-to-one. + path_index = targetNode; + } else { + virReportError(VIR_ERR_CONFIG_UNSUPPORTED, + _("There are (%1$lu) memory directory directories configured. Domain must use a 'strict' numatune as well as an associated pinning configuration for each NUMA node before proceeding. An individual NUMA node can only be pinned to a single backing directory. Please correct the domain configuration or remove the memory backing directories and try again."), + cfg->nmemoryBackingDirs); + return -1; + } + } + + if (root && !STRPREFIX(cfg->memoryBackingDirs[path_index], root)) { g_autofree char * hash = virDomainDriverGenerateRootHash("qemu", root); - *path = g_strdup_printf("%s/%s-%s", cfg->memoryBackingDir, hash, shortName); + *path = g_strdup_printf("%s/%s-%s", cfg->memoryBackingDirs[path_index], hash, shortName); } else { - *path = g_strdup_printf("%s/%s", cfg->memoryBackingDir, shortName); + *path = g_strdup_printf("%s/%s", cfg->memoryBackingDirs[path_index], shortName); } return 0; @@ -1639,8 +1742,10 @@ qemuGetMemoryBackingDomainPath(virQEMUDriver *driver, * -1 otherwise (with error reported). 
*/ int -qemuGetMemoryBackingPath(virQEMUDriver *driver, - const virDomainDef *def, +qemuGetMemoryBackingPath(const virDomainDef *def, + virQEMUDriver *driver, + virBitmap *autoNodeset, + const size_t targetNode, const char *alias, char **memPath) { @@ -1653,7 +1758,7 @@ qemuGetMemoryBackingPath(virQEMUDriver *driver, return -1; } - if (qemuGetMemoryBackingDomainPath(driver, def, &domainPath) < 0) + if (qemuGetMemoryBackingDomainPath(def, driver, autoNodeset, targetNode, &domainPath) < 0) return -1; *memPath = g_strdup_printf("%s/%s", domainPath, alias); diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h index aa1e1a626c..76f11e681c 100644 --- a/src/qemu/qemu_conf.h +++ b/src/qemu/qemu_conf.h @@ -221,7 +221,8 @@ struct _virQEMUDriverConfig { unsigned int glusterDebugLevel; bool virtiofsdDebug; - char *memoryBackingDir; + char **memoryBackingDirs; + size_t nmemoryBackingDirs; uid_t swtpm_user; gid_t swtpm_group; @@ -368,11 +369,16 @@ int qemuGetDomainHupageMemPath(virQEMUDriver *driver, unsigned long long pagesize, char **memPath); -int qemuGetMemoryBackingDomainPath(virQEMUDriver *driver, - const virDomainDef *def, +int qemuGetMemoryBackingDomainPath(const virDomainDef *def, + virQEMUDriver *driver, + virBitmap *autoNodeset, + const size_t targetNode, char **path); -int qemuGetMemoryBackingPath(virQEMUDriver *driver, - const virDomainDef *def, + +int qemuGetMemoryBackingPath(const virDomainDef *def, + virQEMUDriver *driver, + virBitmap *autoNodeset, + const size_t targetNode, const char *alias, char **memPath); diff --git a/src/qemu/qemu_driver.c b/src/qemu/qemu_driver.c index 3801ad623a..42caafb692 100644 --- a/src/qemu/qemu_driver.c +++ b/src/qemu/qemu_driver.c @@ -652,11 +652,16 @@ qemuStateInitialize(bool privileged, cfg->nvramDir); goto error; } - if (g_mkdir_with_parents(cfg->memoryBackingDir, 0777) < 0) { - virReportSystemError(errno, _("Failed to create memory backing dir %1$s"), - cfg->memoryBackingDir); - goto error; + for (i = 0; i < 
cfg->nmemoryBackingDirs; i++) { + if (g_mkdir_with_parents(cfg->memoryBackingDirs[i], 0777) < 0) { + virReportSystemError(errno, + _("Failed to create memory backing dir # %1$lu @ %2$s, total: %3$lu"), + i, cfg->memoryBackingDirs[i], + cfg->nmemoryBackingDirs); + goto error; + } } + if (g_mkdir_with_parents(cfg->slirpStateDir, 0777) < 0) { virReportSystemError(errno, _("Failed to create slirp state dir %1$s"), cfg->slirpStateDir); @@ -792,12 +797,14 @@ qemuStateInitialize(bool privileged, (int)cfg->group); goto error; } - if (chown(cfg->memoryBackingDir, cfg->user, cfg->group) < 0) { - virReportSystemError(errno, - _("unable to set ownership of '%1$s' to %2$d:%3$d"), - cfg->memoryBackingDir, (int)cfg->user, - (int)cfg->group); - goto error; + for (i = 0; i < cfg->nmemoryBackingDirs; i++) { + if (chown(cfg->memoryBackingDirs[i], cfg->user, cfg->group) < 0) { + virReportSystemError(errno, + _("unable to set ownership of '%1$s' to %2$d:%3$d"), + cfg->memoryBackingDirs[i], (int)cfg->user, + (int)cfg->group); + goto error; + } } if (chown(cfg->slirpStateDir, cfg->user, cfg->group) < 0) { virReportSystemError(errno, @@ -842,10 +849,12 @@ qemuStateInitialize(bool privileged, goto error; } - if (privileged && - virFileUpdatePerm(cfg->memoryBackingDir, - 0, S_IXGRP | S_IXOTH) < 0) - goto error; + for (i = 0; i < cfg->nmemoryBackingDirs; i++) { + if (privileged && + virFileUpdatePerm(cfg->memoryBackingDirs[i], + 0, S_IXGRP | S_IXOTH) < 0) + goto error; + } /* Get all the running persistent or transient configs first */ if (virDomainObjListLoadAllConfigs(qemu_driver->domains, diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index 7cb1800504..a990d1f9f4 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -2276,7 +2276,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver, priv, vm->def, mem, true, false, NULL) < 0) goto cleanup; - if (qemuProcessBuildDestroyMemoryPaths(driver, vm, mem, true) < 0) + if (qemuProcessBuildDestroyMemoryPaths(vm, mem, 
true) < 0) goto cleanup; if (qemuDomainNamespaceSetupMemory(vm, mem, &teardowndevice) < 0) @@ -2351,7 +2351,7 @@ qemuDomainAttachMemory(virQEMUDriver *driver, qemuDomainObjExitMonitor(vm); if (objAdded && mem) - ignore_value(qemuProcessDestroyMemoryBackingPath(driver, vm, mem)); + ignore_value(qemuProcessDestroyMemoryBackingPath(vm, mem)); virErrorRestore(&orig_err); if (!mem) @@ -4646,7 +4646,7 @@ qemuDomainRemoveMemoryDevice(virQEMUDriver *driver, if (qemuDomainNamespaceTeardownMemory(vm, mem) < 0) VIR_WARN("Unable to remove memory device from /dev"); - if (qemuProcessDestroyMemoryBackingPath(driver, vm, mem) < 0) + if (qemuProcessDestroyMemoryBackingPath(vm, mem) < 0) VIR_WARN("Unable to destroy memory backing path"); qemuDomainReleaseMemoryDeviceSlot(vm, mem); diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c index a69878e8bb..0b3f19af7f 100644 --- a/src/qemu/qemu_process.c +++ b/src/qemu/qemu_process.c @@ -4059,12 +4059,12 @@ qemuProcessBuildDestroyMemoryPathsImpl(virQEMUDriver *driver, int -qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver, - virDomainObj *vm, +qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm, virDomainMemoryDef *mem, bool build) { - + qemuDomainObjPrivate *priv = vm->privateData; + virQEMUDriver *driver = priv->driver; g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(driver); size_t i; bool shouldBuildHP = false; @@ -4094,13 +4094,14 @@ qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver, } if (!build || shouldBuildMB) { - g_autofree char *path = NULL; - if (qemuGetMemoryBackingDomainPath(driver, vm->def, &path) < 0) - return -1; + for (i = 0; i < cfg->nmemoryBackingDirs; i++) { + g_autofree char *path = NULL; + if (qemuGetMemoryBackingDomainPath(vm->def, driver, priv->autoNodeset, i, &path) < 0) + return -1; - if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm, - path, build) < 0) - return -1; + if (qemuProcessBuildDestroyMemoryPathsImpl(driver, vm, path, build) < 0) + return -1; + } } return 0; @@ 
-4108,19 +4109,22 @@ qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver, int -qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver, - virDomainObj *vm, +qemuProcessDestroyMemoryBackingPath(virDomainObj *vm, virDomainMemoryDef *mem) { g_autofree char *path = NULL; + qemuDomainObjPrivate *priv = vm->privateData; + g_autoptr(virQEMUDriverConfig) cfg = virQEMUDriverGetConfig(priv->driver); + size_t i; - if (qemuGetMemoryBackingPath(driver, vm->def, mem->info.alias, &path) < 0) - return -1; + for (i = 0; i < cfg->nmemoryBackingDirs; i++) { + if (qemuGetMemoryBackingPath(vm->def, priv->driver, priv->autoNodeset, i, mem->info.alias, &path) < 0) + return -1; - if (unlink(path) < 0 && - errno != ENOENT) { - virReportSystemError(errno, _("Unable to remove %1$s"), path); - return -1; + if (unlink(path) < 0 && errno != ENOENT) { + virReportSystemError(errno, _("Unable to remove %1$s"), path); + return -1; + } } return 0; @@ -7334,7 +7338,7 @@ qemuProcessPrepareHost(virQEMUDriver *driver, if (qemuProcessPrepareHostBackendChardev(vm) < 0) return -1; - if (qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, true) < 0) + if (qemuProcessBuildDestroyMemoryPaths(vm, NULL, true) < 0) return -1; /* Ensure no historical cgroup for this VM is lying around bogus @@ -8705,7 +8709,7 @@ void qemuProcessStop(virQEMUDriver *driver, g_clear_pointer(&priv->mon, qemuMonitorClose); } - qemuProcessBuildDestroyMemoryPaths(driver, vm, NULL, false); + qemuProcessBuildDestroyMemoryPaths(vm, NULL, false); /* Do this before we delete the tree and remove pidfile. 
*/ qemuProcessKillManagedPRDaemon(vm); diff --git a/src/qemu/qemu_process.h b/src/qemu/qemu_process.h index 2324aeb7bd..394af9655d 100644 --- a/src/qemu/qemu_process.h +++ b/src/qemu/qemu_process.h @@ -38,13 +38,11 @@ int qemuProcessStopCPUs(virQEMUDriver *driver, virDomainPausedReason reason, virDomainAsyncJob asyncJob); -int qemuProcessBuildDestroyMemoryPaths(virQEMUDriver *driver, - virDomainObj *vm, +int qemuProcessBuildDestroyMemoryPaths(virDomainObj *vm, virDomainMemoryDef *mem, bool build); -int qemuProcessDestroyMemoryBackingPath(virQEMUDriver *driver, - virDomainObj *vm, +int qemuProcessDestroyMemoryBackingPath(virDomainObj *vm, virDomainMemoryDef *mem); void qemuProcessReconnectAll(virQEMUDriver *driver); diff --git a/tests/testutilsqemu.c b/tests/testutilsqemu.c index ee6cae218a..55cf39b4c8 100644 --- a/tests/testutilsqemu.c +++ b/tests/testutilsqemu.c @@ -321,8 +321,9 @@ int qemuTestDriverInit(virQEMUDriver *driver) cfg->libDir = g_strdup("/var/lib/libvirt/qemu"); VIR_FREE(cfg->channelTargetDir); cfg->channelTargetDir = g_strdup("/var/run/libvirt/qemu/channel"); - VIR_FREE(cfg->memoryBackingDir); - cfg->memoryBackingDir = g_strdup("/var/lib/libvirt/qemu/ram"); + g_free(cfg->memoryBackingDirs); + cfg->memoryBackingDirs = g_new0(char *, 1); + cfg->memoryBackingDirs[0] = g_strdup("/var/lib/libvirt/qemu/ram"); VIR_FREE(cfg->nvramDir); cfg->nvramDir = g_strdup("/var/lib/libvirt/qemu/nvram"); VIR_FREE(cfg->passtStateDir); -- 2.34.1