On Tue, Feb 28, 2012 at 10:10:50PM +0800, Osier Yang wrote: > numad is a user-level daemon that monitors NUMA topology and > processes resource consumption to facilitate good NUMA resource > alignment of applications/virtual machines to improve performance > and minimize cost of remote memory latencies. It provides a > pre-placement advisory interface, so significant processes can > be pre-bound to nodes with sufficient available resources. > > More details: http://fedoraproject.org/wiki/Features/numad > > "numad -w ncpus:memory_amount" is the advisory interface numad > provides currently. > > This patch adds the support by introducing new XML like: > <numatune> > <cpu required_cpus="4" required_memory="524288"/> > </numatune> Isn't the usual case going to be the vcpus and memory in the guest? IMO we should default to passing those numbers to numad if required_cpus and required_memory are not provided explicitly. Dave > And the corresponding numad command line will be: > numad -w 4:500 > > The advisory nodeset returned from numad will be used to set > domain process CPU affinity then. (e.g. qemuProcessInitCpuAffinity). > > If the user specifies both CPU affinity policy (e.g. > (<vcpu cpuset="1-10,^7,^8">4</vcpu>) and XML indicating to use > numad for the advisory nodeset, the specified CPU affinity will be > overridden by the nodeset returned from numad. > > If no XML to specify the CPU affinity policy, and XML indicating > to use numad is specified, the returned nodeset will be printed > in <vcpu cpuset="$nodeset_from_numad">4</vcpu>. > > Only QEMU/KVM and LXC drivers support it now. 
> --- > configure.ac | 8 +++ > docs/formatdomain.html.in | 18 ++++++- > docs/schemas/domaincommon.rng | 12 ++++ > src/conf/domain_conf.c | 125 +++++++++++++++++++++++++++++++---------- > src/conf/domain_conf.h | 5 ++ > src/lxc/lxc_controller.c | 98 ++++++++++++++++++++++++++++---- > src/qemu/qemu_process.c | 99 +++++++++++++++++++++++++++++---- > 7 files changed, 311 insertions(+), 54 deletions(-) > > diff --git a/configure.ac b/configure.ac > index c9cdd7b..31f0835 100644 > --- a/configure.ac > +++ b/configure.ac > @@ -1445,6 +1445,14 @@ AM_CONDITIONAL([HAVE_NUMACTL], [test "$with_numactl" != "no"]) > AC_SUBST([NUMACTL_CFLAGS]) > AC_SUBST([NUMACTL_LIBS]) > > +dnl Do we have numad? > +if test "$with_qemu" = "yes"; then > + AC_PATH_PROG([NUMAD], [numad], [], [/bin:/usr/bin:/usr/local/bin:$PATH]) > + > + if test -n "$NUMAD"; then > + AC_DEFINE_UNQUOTED([NUMAD],["$NUMAD"], [Location or name of the numad program]) > + fi > +fi > > dnl pcap lib > LIBPCAP_CONFIG="pcap-config" > diff --git a/docs/formatdomain.html.in b/docs/formatdomain.html.in > index 6fcca94..d8e70a6 100644 > --- a/docs/formatdomain.html.in > +++ b/docs/formatdomain.html.in > @@ -505,6 +505,7 @@ > ... > <numatune> > <memory mode="strict" nodeset="1-4,^3"/> > + <cpu required_cpus="3" required_memory="524288"/> > </numatune> > ... > </domain> > @@ -519,7 +520,7 @@ > <span class='since'>Since 0.9.3</span> > <dt><code>memory</code></dt> > <dd> > - The optional <code>memory</code> element specify how to allocate memory > + The optional <code>memory</code> element specifies how to allocate memory > for the domain process on a NUMA host. It contains two attributes, > attribute <code>mode</code> is either 'interleave', 'strict', > or 'preferred', > @@ -527,6 +528,21 @@ > syntax with attribute <code>cpuset</code> of element <code>vcpu</code>. 
> <span class='since'>Since 0.9.3</span> > </dd> > + <dd> > + The optional <code>cpu</code> element indicates pinning the virtual CPUs > + to the nodeset returned by querying "numad" (a system daemon that monitors > + NUMA topology and usage). It has two attributes, attribute > + <code>required_cpus</code> specifies the number of physical CPUs the guest > + process want to use. And the optional attribute <code>required_memory</code> > + specifies the amount of free memory the guest process want to see on a node, > + "numad" will pick the physical CPUs on the node which has enough free > + memory of amount specified by <code>required_memory</code>. > + > + NB, with using this element, the physical CPUs specified by attribute > + <code>cpuset</code> (of element <code>vcpu</code>) will be overridden by the > + nodeset returned from "numad". > + <span class='since'>Since 0.9.11 (QEMU/KVM and LXC only)</span> > + </dd> > </dl> > > > diff --git a/docs/schemas/domaincommon.rng b/docs/schemas/domaincommon.rng > index 3908733..d0f443d 100644 > --- a/docs/schemas/domaincommon.rng > +++ b/docs/schemas/domaincommon.rng > @@ -549,6 +549,18 @@ > </attribute> > </element> > </optional> > + <optional> > + <element name="cpu"> > + <attribute name="required_cpu"> > + <ref name="countCPU"/> > + </attribute> > + <optional> > + <attribute name="required_memory"> > + <ref name="memoryKB"/> > + </attribute> > + </optional> > + </element> > + </optional> > </element> > </optional> > </interleave> > diff --git a/src/conf/domain_conf.c b/src/conf/domain_conf.c > index f9654f1..aa03c05 100644 > --- a/src/conf/domain_conf.c > +++ b/src/conf/domain_conf.c > @@ -7125,7 +7125,6 @@ error: > goto cleanup; > } > > - > static int virDomainDefMaybeAddController(virDomainDefPtr def, > int type, > int idx) > @@ -7185,6 +7184,7 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, > bool uuid_generated = false; > virBitmapPtr bootMap = NULL; > unsigned long bootMapSize = 0; > + xmlNodePtr cur; 
> > if (VIR_ALLOC(def) < 0) { > virReportOOMError(); > @@ -7454,47 +7454,100 @@ static virDomainDefPtr virDomainDefParseXML(virCapsPtr caps, > VIR_FREE(nodes); > > /* Extract numatune if exists. */ > - if ((n = virXPathNodeSet("./numatune", ctxt, NULL)) < 0) { > + if ((n = virXPathNodeSet("./numatune", ctxt, &nodes)) < 0) { > virDomainReportError(VIR_ERR_INTERNAL_ERROR, > "%s", _("cannot extract numatune nodes")); > goto error; > } > > + if (n > 1) { > + virDomainReportError(VIR_ERR_XML_ERROR, "%s", > + _("only one numatune is supported")); > + VIR_FREE(nodes); > + goto error; > + } > + > if (n) { > - tmp = virXPathString("string(./numatune/memory/@nodeset)", ctxt); > - if (tmp) { > - char *set = tmp; > - int nodemasklen = VIR_DOMAIN_CPUMASK_LEN; > + cur = nodes[0]->children; > + while (cur != NULL) { > + if (cur->type == XML_ELEMENT_NODE) { > + if ((xmlStrEqual(cur->name, BAD_CAST "memory"))) { > + tmp = virXMLPropString(cur, "nodeset"); > > - if (VIR_ALLOC_N(def->numatune.memory.nodemask, nodemasklen) < 0) { > - goto no_memory; > - } > + if (tmp) { > + char *set = tmp; > + int nodemasklen = VIR_DOMAIN_CPUMASK_LEN; > > - /* "nodeset" leads same syntax with "cpuset". */ > - if (virDomainCpuSetParse(set, 0, def->numatune.memory.nodemask, > - nodemasklen) < 0) > - goto error; > - VIR_FREE(tmp); > - } else { > - virDomainReportError(VIR_ERR_INTERNAL_ERROR, > - "%s", _("nodeset for NUMA memory tuning must be set")); > - goto error; > - } > + if (VIR_ALLOC_N(def->numatune.memory.nodemask, > + nodemasklen) < 0) { > + virReportOOMError(); > + goto error; > + } > > - tmp = virXPathString("string(./numatune/memory/@mode)", ctxt); > - if (tmp) { > - if ((def->numatune.memory.mode = > - virDomainNumatuneMemModeTypeFromString(tmp)) < 0) { > - virDomainReportError(VIR_ERR_INTERNAL_ERROR, > - _("Unsupported NUMA memory tuning mode '%s'"), > - tmp); > - goto error; > + /* "nodeset" leads same syntax with "cpuset". 
*/ > + if (virDomainCpuSetParse(set, 0, > + def->numatune.memory.nodemask, > + nodemasklen) < 0) > + goto error; > + VIR_FREE(tmp); > + } else { > + virDomainReportError(VIR_ERR_XML_ERROR, "%s", > + _("nodeset for NUMA memory " > + "tuning must be set")); > + goto error; > + } > + > + tmp = virXMLPropString(cur, "mode"); > + if (tmp) { > + if ((def->numatune.memory.mode = > + virDomainNumatuneMemModeTypeFromString(tmp)) < 0) { > + virDomainReportError(VIR_ERR_XML_ERROR, > + _("Unsupported NUMA memory " > + "tuning mode '%s'"), > + tmp); > + goto error; > + } > + VIR_FREE(tmp); > + } else { > + def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT; > + } > + } else if (xmlStrEqual(cur->name, BAD_CAST "cpu")) { > + char *req_cpus = NULL; > + char *req_memory = NULL; > + req_cpus = virXMLPropString(cur, "required_cpus"); > + req_memory = virXMLPropString(cur, "required_memory"); > + > + if (req_cpus && > + virStrToLong_ui(req_cpus, NULL, 10, > + &def->numatune.cpu.required_cpus) < 0) { > + virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s", > + _("Cannot parse <cpu> 'required_cpus'" > + " attribute")); > + goto error; > + } > + > + if (req_memory && > + virStrToLong_ul(req_memory, NULL, 10, > + &def->numatune.cpu.required_memory) < 0) { > + virDomainReportError(VIR_ERR_INTERNAL_ERROR, "%s", > + _("Cannot parse <cpu> 'required_memory'" > + " attribute")); > + goto error; > + } > + > + VIR_FREE(req_cpus); > + VIR_FREE(req_memory); > + } else { > + virDomainReportError(VIR_ERR_XML_ERROR, > + _("unsupported XML element %s"), > + (const char *)cur->name); > + goto error; > + } > } > - VIR_FREE(tmp); > - } else { > - def->numatune.memory.mode = VIR_DOMAIN_NUMATUNE_MEM_STRICT; > + cur = cur->next; > } > } > + VIR_FREE(nodes); > > n = virXPathNodeSet("./features/*", ctxt, &nodes); > if (n < 0) > @@ -11761,7 +11814,8 @@ virDomainDefFormatInternal(virDomainDefPtr def, > def->cputune.period || def->cputune.quota) > virBufferAddLit(buf, " </cputune>\n"); > > - if 
(def->numatune.memory.nodemask) { > + if (def->numatune.memory.nodemask || > + def->numatune.cpu.required_cpus) { > const char *mode; > char *nodemask = NULL; > > @@ -11778,6 +11832,15 @@ virDomainDefFormatInternal(virDomainDefPtr def, > virBufferAsprintf(buf, " <memory mode='%s' nodeset='%s'/>\n", > mode, nodemask); > VIR_FREE(nodemask); > + > + if (def->numatune.cpu.required_cpus) > + virBufferAsprintf(buf, " <cpu required_cpus='%d' ", > + def->numatune.cpu.required_cpus); > + > + if (def->numatune.cpu.required_memory) > + virBufferAsprintf(buf, "required_memory='%lu'/>\n", > + def->numatune.cpu.required_memory); > + > virBufferAddLit(buf, " </numatune>\n"); > } > > diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h > index 596be4d..1284599 100644 > --- a/src/conf/domain_conf.h > +++ b/src/conf/domain_conf.h > @@ -1416,6 +1416,11 @@ struct _virDomainNumatuneDef { > int mode; > } memory; > > + struct { > + unsigned int required_cpus; > + unsigned long required_memory; > + } cpu; > + > /* Future NUMA tuning related stuff should go here. */ > }; > > diff --git a/src/lxc/lxc_controller.c b/src/lxc/lxc_controller.c > index 8f336f5..ec6434d 100644 > --- a/src/lxc/lxc_controller.c > +++ b/src/lxc/lxc_controller.c > @@ -327,6 +327,47 @@ static int lxcSetContainerNUMAPolicy(virDomainDefPtr def) > } > #endif > > +#if defined(NUMAD) > +static char * > +lxcGetNumadAdvice(unsigned int req_cpus, > + unsigned long req_memory) { > + virCommandPtr cmd = NULL; > + char *reqs = NULL; > + char *ret = NULL; > + > + /* numad uses "MB" for memory. 
*/ > + if (req_memory) { > + req_memory = req_memory / 1024; > + if (virAsprintf(&reqs, "%d:%lu", req_cpus, req_memory) < 0) { > + virReportOOMError(); > + goto out; > + } > + cmd = virCommandNewArgList(NUMAD, "-w", reqs, NULL); > + } else { > + cmd = virCommandNewArgList(NUMAD, "-w", "%d", req_cpus, NULL); > + } > + > + virCommandSetOutputBuffer(cmd, &ret); > + > + if (virCommandRun(cmd, NULL) < 0) { > + lxcError(VIR_ERR_INTERNAL_ERROR, "%s", > + _("Failed to query numad for the advisory nodeset")); > + } > + > +out: > + VIR_FREE(reqs); > + virCommandFree(cmd); > + return ret; > +} > +#else > +static char * > +lxcGetNumadAdvice(unsigned int req_cpus ATTRIBUTE_UNUSED, > + unsigned long req_memory ATTRIBUTE_UNUSED) { > + lxcError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", > + _("numad is not available on this host")); > + return NULL; > +} > +#endif > > /* > * To be run while still single threaded > @@ -355,19 +396,54 @@ static int lxcSetContainerCpuAffinity(virDomainDefPtr def) > return -1; > } > > - if (def->cpumask) { > - /* XXX why don't we keep 'cpumask' in the libvirt cpumap > - * format to start with ?!?! */ > - for (i = 0 ; i < maxcpu && i < def->cpumasklen ; i++) > - if (def->cpumask[i]) > + /* def->cpumask will be overridden by the nodeset > + * suggested by numad if it's specified. 
> + */ > + if (def->numatune.cpu.required_cpus) { > + char *tmp_cpumask = NULL; > + char *nodeset = NULL; > + > + nodeset = lxcGetNumadAdvice(def->numatune.cpu.required_cpus, > + def->numatune.cpu.required_memory); > + if (!nodeset) > + return -1; > + > + if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) { > + virReportOOMError(); > + return -1; > + } > + > + if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask, > + VIR_DOMAIN_CPUMASK_LEN) < 0) { > + VIR_FREE(tmp_cpumask); > + VIR_FREE(nodeset); > + return -1; > + } > + > + for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) { > + if (tmp_cpumask[i]) > VIR_USE_CPU(cpumap, i); > + } > + > + /* Update def->cpumask */ > + VIR_FREE(def->cpumask); > + def->cpumask = tmp_cpumask; > + VIR_FREE(nodeset); > } else { > - /* You may think this is redundant, but we can't assume libvirtd > - * itself is running on all pCPUs, so we need to explicitly set > - * the spawned LXC instance to all pCPUs if no map is given in > - * its config file */ > - for (i = 0 ; i < maxcpu ; i++) > - VIR_USE_CPU(cpumap, i); > + if (def->cpumask) { > + /* XXX why don't we keep 'cpumask' in the libvirt cpumap > + * format to start with ?!?! 
*/ > + for (i = 0 ; i < maxcpu && i < def->cpumasklen ; i++) > + if (def->cpumask[i]) > + VIR_USE_CPU(cpumap, i); > + } else { > + /* You may think this is redundant, but we can't assume libvirtd > + * itself is running on all pCPUs, so we need to explicitly set > + * the spawned LXC instance to all pCPUs if no map is given in > + * its config file */ > + for (i = 0 ; i < maxcpu ; i++) > + VIR_USE_CPU(cpumap, i); > + } > } > > /* We are pressuming we are running between fork/exec of LXC > diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c > index 41218de..eb9f8f1 100644 > --- a/src/qemu/qemu_process.c > +++ b/src/qemu/qemu_process.c > @@ -1633,6 +1633,48 @@ qemuProcessInitNumaMemoryPolicy(virDomainObjPtr vm) > } > #endif > > +#if defined(NUMAD) > +static char * > +qemuGetNumadAdvice(unsigned int req_cpus, > + unsigned long req_memory) { > + virCommandPtr cmd = NULL; > + char *reqs = NULL; > + char *output = NULL; > + > + /* numad uses "MB" for memory. */ > + if (req_memory) { > + req_memory = req_memory / 1024; > + if (virAsprintf(&reqs, "%d:%lu", req_cpus, req_memory) < 0) { > + virReportOOMError(); > + goto out; > + } > + > + cmd = virCommandNewArgList(NUMAD, "-w", reqs, NULL); > + } else { > + cmd = virCommandNewArgList(NUMAD, "-w", "%u", req_cpus, NULL); > + } > + > + virCommandSetOutputBuffer(cmd, &output); > + > + if (virCommandRun(cmd, NULL) < 0) > + qemuReportError(VIR_ERR_INTERNAL_ERROR, "%s", > + _("Failed to query numad for the advisory nodeset")); > + > +out: > + VIR_FREE(reqs); > + virCommandFree(cmd); > + return output; > +} > +#else > +static char * > +qemuGetNumadAdvice(unsigned int req_cpus ATTRIBUTE_UNUSED, > + unsigned long req_memory ATTRIBUTE_UNUSED) { > + qemuReportError(VIR_ERR_CONFIG_UNSUPPORTED, "%s", > + _("numad is not available on this host")); > + return NULL; > +} > +#endif > + > /* > * To be run between fork/exec of QEMU only > */ > @@ -1661,19 +1703,54 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm) > return -1; > } > > 
- if (vm->def->cpumask) { > - /* XXX why don't we keep 'cpumask' in the libvirt cpumap > - * format to start with ?!?! */ > - for (i = 0 ; i < maxcpu && i < vm->def->cpumasklen ; i++) > - if (vm->def->cpumask[i]) > + /* vm->def->cpumask will be overridden by the nodeset > + * suggested by numad if it's specified. > + */ > + if (vm->def->numatune.cpu.required_cpus) { > + char *tmp_cpumask = NULL; > + char *nodeset = NULL; > + > + nodeset = qemuGetNumadAdvice(vm->def->numatune.cpu.required_cpus, > + vm->def->numatune.cpu.required_memory); > + if (!nodeset) > + return -1; > + > + if (VIR_ALLOC_N(tmp_cpumask, VIR_DOMAIN_CPUMASK_LEN) < 0) { > + virReportOOMError(); > + return -1; > + } > + > + if (virDomainCpuSetParse(nodeset, 0, tmp_cpumask, > + VIR_DOMAIN_CPUMASK_LEN) < 0) { > + VIR_FREE(tmp_cpumask); > + VIR_FREE(nodeset); > + return -1; > + } > + > + for (i = 0; i < maxcpu && i < VIR_DOMAIN_CPUMASK_LEN; i++) { > + if (tmp_cpumask[i]) > VIR_USE_CPU(cpumap, i); > + } > + > + /* Update vm->def->cpumask */ > + VIR_FREE(vm->def->cpumask); > + vm->def->cpumask = tmp_cpumask; > + VIR_FREE(nodeset); > } else { > - /* You may think this is redundant, but we can't assume libvirtd > - * itself is running on all pCPUs, so we need to explicitly set > - * the spawned QEMU instance to all pCPUs if no map is given in > - * its config file */ > - for (i = 0 ; i < maxcpu ; i++) > - VIR_USE_CPU(cpumap, i); > + if (vm->def->cpumask) { > + /* XXX why don't we keep 'cpumask' in the libvirt cpumap > + * format to start with ?!?! 
*/ > + for (i = 0 ; i < maxcpu && i < vm->def->cpumasklen ; i++) > + if (vm->def->cpumask[i]) > + VIR_USE_CPU(cpumap, i); > + } else { > + /* You may think this is redundant, but we can't assume libvirtd > + * itself is running on all pCPUs, so we need to explicitly set > + * the spawned QEMU instance to all pCPUs if no map is given in > + * its config file */ > + for (i = 0 ; i < maxcpu ; i++) > + VIR_USE_CPU(cpumap, i); > + } > } > > /* We are pressuming we are running between fork/exec of QEMU > -- > 1.7.7.3 > > -- > libvir-list mailing list > libvir-list@xxxxxxxxxx > https://www.redhat.com/mailman/listinfo/libvir-list -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list