On Wed, Jan 30, 2019 at 02:56:46PM +0100, Michal Privoznik wrote: > https://bugzilla.redhat.com/show_bug.cgi?id=1503284 > > The way we currently start qemu from CPU affinity POV is as > follows: > > 1) the child process is set affinity to all online CPUs (unless > some vcpu pinning was given in the domain XML) > > 2) Once qemu is running, cpuset cgroup is configured taking > memory pinning into account > > Problem is that we let qemu allocate its memory just anywhere in > 1) and then rely in 2) to be able to move the memory to > configured NUMA nodes. This might not be always possible (e.g. > qemu might lock some parts of its memory) and is very suboptimal > (copying large memory between NUMA nodes takes significant amount > of time). The solution is to set the affinity correctly from the > beginning and then possibly refine it later via cgroups. > > Signed-off-by: Michal Privoznik <mprivozn@xxxxxxxxxx> > --- > src/qemu/qemu_process.c | 152 ++++++++++++++++++++++------------------ > 1 file changed, 83 insertions(+), 69 deletions(-) > > diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c > index 9ccc3601a2..a4668f6773 100644 > --- a/src/qemu/qemu_process.c > +++ b/src/qemu/qemu_process.c > @@ -2435,6 +2435,44 @@ qemuProcessDetectIOThreadPIDs(virQEMUDriverPtr driver, > } > > > +static int > +qemuProcessGetAllCpuAffinity(pid_t pid, > + virBitmapPtr *cpumapRet) > +{ > + VIR_AUTOPTR(virBitmap) cpumap = NULL; > + VIR_AUTOPTR(virBitmap) hostcpumap = NULL; > + int hostcpus; > + > + *cpumapRet = NULL; > + > + if (!virHostCPUHasBitmap()) > + return 0; > + > + if (!(hostcpumap = virHostCPUGetOnlineBitmap()) || > + !(cpumap = virProcessGetAffinity(pid))) > + return -1; > + > + if (!virBitmapEqual(hostcpumap, cpumap)) { > + /* setaffinity fails if you set bits for CPUs which > + * aren't present, so we have to limit ourselves */ > + if ((hostcpus = virHostCPUGetCount()) < 0) > + return -1; > + > + if (hostcpus > QEMUD_CPUMASK_LEN) > + hostcpus = QEMUD_CPUMASK_LEN; > 
+ > + virBitmapFree(cpumap); > + if (!(cpumap = virBitmapNew(hostcpus))) > + return -1; > + > + virBitmapSetAll(cpumap); > + } > + > + VIR_STEAL_PTR(*cpumapRet, cpumap); IIUC, if the QEMU process affinity matches the current set of online host CPUs, we just return the current QEMU affinity. Otherwise we construct a mask of all host CPUs, but seemingly ignoring whether the host CPUs are online or not. Seems a bit odd to return list of online host CPUs in one case, or a list of all possible host CPUs in the other cases. So I guess I'm wondering why this method is not simply reduced to 1 single line *cpumapRet = virHostCPUGetOnlineBitmap(); ? > @@ -2454,59 +2492,36 @@ qemuProcessInitCpuAffinity(virDomainObjPtr vm) > return -1; > } > > - if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { > - VIR_DEBUG("Set CPU affinity with advisory nodeset from numad"); > - cpumapToSet = priv->autoCpuset; > + /* Here is the deal, we can't set cpuset.mems before qemu is > + * started as it clashes with KVM allocation. Therefore we > + * used to let qemu allocate its memory anywhere as we would > + * then move the memory to desired NUMA node via CGroups. > + * However, that might not be always possible because qemu > + * might lock some parts of its memory (e.g. due to VFIO). > + * Solution is to set some temporary affinity now and then > + * fix it later, once qemu is already running. 
*/ > + if (virDomainNumaGetNodeCount(vm->def->numa) <= 1 && > + virDomainNumatuneGetMode(vm->def->numa, -1, &mem_mode) == 0 && > + mem_mode == VIR_DOMAIN_NUMATUNE_MEM_STRICT) { > + if (virDomainNumatuneMaybeGetNodeset(vm->def->numa, > + priv->autoNodeset, > + &cpumapToSet, > + -1) < 0) > + goto cleanup; > + } else if (vm->def->cputune.emulatorpin) { > + cpumapToSet = vm->def->cputune.emulatorpin; > } else { > - VIR_DEBUG("Set CPU affinity with specified cpuset"); > - if (vm->def->cpumask) { > - cpumapToSet = vm->def->cpumask; > - } else { > - /* You may think this is redundant, but we can't assume libvirtd > - * itself is running on all pCPUs, so we need to explicitly set > - * the spawned QEMU instance to all pCPUs if no map is given in > - * its config file */ > - int hostcpus; > - > - if (virHostCPUHasBitmap()) { > - hostcpumap = virHostCPUGetOnlineBitmap(); > - cpumap = virProcessGetAffinity(vm->pid); > - } > - > - if (hostcpumap && cpumap && virBitmapEqual(hostcpumap, cpumap)) { > - /* we're using all available CPUs, no reason to set > - * mask. 
If libvirtd is running without explicit > - * affinity, we can use hotplugged CPUs for this VM */ > - ret = 0; > - goto cleanup; > - } else { > - /* setaffinity fails if you set bits for CPUs which > - * aren't present, so we have to limit ourselves */ > - if ((hostcpus = virHostCPUGetCount()) < 0) > - goto cleanup; > - > - if (hostcpus > QEMUD_CPUMASK_LEN) > - hostcpus = QEMUD_CPUMASK_LEN; > - > - virBitmapFree(cpumap); > - if (!(cpumap = virBitmapNew(hostcpus))) > - goto cleanup; > - > - virBitmapSetAll(cpumap); > - > - cpumapToSet = cpumap; > - } > - } > + if (qemuProcessGetAllCpuAffinity(vm->pid, &hostcpumap) < 0) > + goto cleanup; > + cpumapToSet = hostcpumap; IIUC, we end up setting affinity to one of (in priority order) - The CPUs associated with NUMA memory affinity mask - The CPUs associated with emulator pinning - All online host CPUs Later (once QEMU has allocated its memory) we then change this again to be simpler - The CPUs associated with emulator pinning - All online host CPUs I think it would be nice to mention this explicitly in the commit message, as it clarifies how we're solving the problem the commit message mentions. > } > > - if (virProcessSetAffinity(vm->pid, cpumapToSet) < 0) > + if (cpumapToSet && > + virProcessSetAffinity(vm->pid, cpumapToSet) < 0) > goto cleanup; > > ret = 0; > - > cleanup: > - virBitmapFree(cpumap); > - virBitmapFree(hostcpumap); > return ret; > } > #else /* !defined(HAVE_SCHED_GETAFFINITY) && !defined(HAVE_BSD_CPU_AFFINITY) */ > @@ -2586,7 +2601,8 @@ qemuProcessSetupPid(virDomainObjPtr vm, > qemuDomainObjPrivatePtr priv = vm->privateData; > virDomainNumatuneMemMode mem_mode; > virCgroupPtr cgroup = NULL; > - virBitmapPtr use_cpumask; > + virBitmapPtr use_cpumask = NULL; > + VIR_AUTOPTR(virBitmap) hostcpumap = NULL; > char *mem_mask = NULL; > int ret = -1; > > @@ -2598,12 +2614,21 @@ qemuProcessSetupPid(virDomainObjPtr vm, > } > > /* Infer which cpumask shall be used. 
*/ > - if (cpumask) > + if (cpumask) { > use_cpumask = cpumask; > - else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) > + } else if (vm->def->placement_mode == VIR_DOMAIN_CPU_PLACEMENT_MODE_AUTO) { > use_cpumask = priv->autoCpuset; > - else > + } else if (vm->def->cpumask) { > use_cpumask = vm->def->cpumask; > + } else if (virHostCPUHasBitmap()) { > + /* You may think this is redundant, but we can't assume libvirtd > + * itself is running on all pCPUs, so we need to explicitly set > + * the spawned QEMU instance to all pCPUs if no map is given in > + * its config file */ > + if (qemuProcessGetAllCpuAffinity(pid, &hostcpumap) < 0) > + goto cleanup; > + use_cpumask = hostcpumap; > + } > > /* > * If CPU cgroup controller is not initialized here, then we need > @@ -2628,13 +2653,7 @@ qemuProcessSetupPid(virDomainObjPtr vm, > qemuSetupCgroupCpusetCpus(cgroup, use_cpumask) < 0) > goto cleanup; > > - /* > - * Don't setup cpuset.mems for the emulator, they need to > - * be set up after initialization in order for kvm > - * allocations to succeed. 
> - */ > - if (nameval != VIR_CGROUP_THREAD_EMULATOR && > - mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0) > + if (mem_mask && virCgroupSetCpusetMems(cgroup, mem_mask) < 0) > goto cleanup; > > } > @@ -6634,12 +6653,7 @@ qemuProcessLaunch(virConnectPtr conn, > > /* This must be done after cgroup placement to avoid resetting CPU > * affinity */ > - if (!vm->def->cputune.emulatorpin && > - qemuProcessInitCpuAffinity(vm) < 0) > - goto cleanup; > - > - VIR_DEBUG("Setting emulator tuning/settings"); > - if (qemuProcessSetupEmulator(vm) < 0) > + if (qemuProcessInitCpuAffinity(vm) < 0) > goto cleanup; > > VIR_DEBUG("Setting cgroup for external devices (if required)"); > @@ -6708,10 +6722,6 @@ qemuProcessLaunch(virConnectPtr conn, > if (qemuProcessUpdateAndVerifyCPU(driver, vm, asyncJob) < 0) > goto cleanup; > > - VIR_DEBUG("Setting up post-init cgroup restrictions"); > - if (qemuSetupCpusetMems(vm) < 0) > - goto cleanup; > - > VIR_DEBUG("setting up hotpluggable cpus"); > if (qemuDomainHasHotpluggableStartupVcpus(vm->def)) { > if (qemuDomainRefreshVcpuInfo(driver, vm, asyncJob, false) < 0) > @@ -6737,6 +6747,10 @@ qemuProcessLaunch(virConnectPtr conn, > if (qemuProcessDetectIOThreadPIDs(driver, vm, asyncJob) < 0) > goto cleanup; > > + VIR_DEBUG("Setting emulator tuning/settings"); > + if (qemuProcessSetupEmulator(vm) < 0) > + goto cleanup; > + > VIR_DEBUG("Setting global CPU cgroup (if required)"); > if (qemuSetupGlobalCpuCgroup(vm) < 0) > goto cleanup; > -- > 2.19.2 > > -- > libvir-list mailing list > libvir-list@xxxxxxxxxx > https://www.redhat.com/mailman/listinfo/libvir-list Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :| -- libvir-list mailing list libvir-list@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/libvir-list