The amount of memory a ppc64 domain might need to lock is different
from that of an equally-sized x86 domain, so we need to check the
domain's architecture and act accordingly.

Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=1273480
---
 src/qemu/qemu_domain.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 79 insertions(+), 1 deletion(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 2c0f5af..1e92b9d 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -3793,7 +3793,7 @@ qemuDomainUpdateCurrentMemorySize(virQEMUDriverPtr driver,
  * @def: domain definition
  *
  * Returns the size of the memory in bytes that needs to be set as
- * RLIMIT_MEMLOCK for purpose of VFIO device passthrough.
+ * RLIMIT_MEMLOCK for the QEMU process.
  * If a mem.hard_limit is set, then that value is preferred; otherwise, the
  * value returned may depend upon the architecture or devices present.
  */
@@ -3808,6 +3808,84 @@ qemuDomainGetMlockLimitBytes(virDomainDefPtr def)
         goto done;
     }
 
+    if (ARCH_IS_PPC64(def->os.arch)) {
+        unsigned long long maxMemory;
+        unsigned long long memory;
+        unsigned long long baseLimit;
+        unsigned long long passthroughLimit;
+        size_t nPCIHostBridges;
+        size_t i;
+        bool usesVFIO = false;
+
+        /* TODO: Detect at runtime once we start using more than just
+         *       the default PCI Host Bridge */
+        nPCIHostBridges = 1;
+
+        for (i = 0; i < def->nhostdevs; i++) {
+            virDomainHostdevDefPtr dev = def->hostdevs[i];
+
+            if (dev->mode == VIR_DOMAIN_HOSTDEV_MODE_SUBSYS &&
+                dev->source.subsys.type == VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI &&
+                dev->source.subsys.u.pci.backend == VIR_DOMAIN_HOSTDEV_PCI_BACKEND_VFIO) {
+                usesVFIO = true;
+                break;
+            }
+        }
+
+        memory = virDomainDefGetMemoryActual(def);
+
+        if (def->mem.max_memory)
+            maxMemory = def->mem.max_memory;
+        else
+            maxMemory = memory;
+
+        /* baseLimit := maxMemory / 128                                  (a)
+         *              + 4 MiB * #PHBs + 8 MiB                          (b)
+         *
+         * (a) is the hash table
+         *
+         * (b) is accounting for the 32-bit DMA window - it could be either the
+         *     KVM accelerated TCE tables for emulated devices, or the VFIO
+         *     userspace view. The 4 MiB per-PHB (including the default one) covers
+         *     a 2GiB DMA window: default is 1GiB, but it's possible it'll be
+         *     increased to help performance. The 8 MiB extra should be plenty for
+         *     the TCE table index for any reasonable number of PHBs and several
+         *     spapr-vlan or spapr-vscsi devices (512kB + a tiny bit each) */
+        baseLimit = maxMemory / 128 +
+                    4096 * nPCIHostBridges +
+                    8192;
+
+        /* passthroughLimit := max( 2 GiB * #PHBs,                       (c)
+         *                          memory                               (d)
+         *                          + memory * 1/512 * #PHBs + 8 MiB )   (e)
+         *
+         * (c) is the pre-DDW VFIO DMA window accounting. We're allowing 2 GiB
+         *     rather than 1 GiB
+         *
+         * (d) is the with-DDW (and memory pre-registration and related
+         *     features) DMA window accounting - assuming that we only account RAM
+         *     once, even if mapped to multiple PHBs
+         *
+         * (e) is the with-DDW userspace view and overhead for the 64-bit DMA
+         *     window. This is based a bit on expected guest behaviour, but there
+         *     really isn't a way to completely avoid that. We assume the guest
+         *     requests a 64-bit DMA window (per PHB) just big enough to map all
+         *     its RAM. 4 kiB page size gives the 1/512; it will be less with 64
+         *     kiB pages, less still if the guest is mapped with hugepages (unlike
+         *     the default 32-bit DMA window, DDW windows can use large IOMMU
+         *     pages). 8 MiB is for second and further level overheads, like (b) */
+        passthroughLimit = MAX(2 * 1024 * 1024 * nPCIHostBridges,
+                               memory +
+                               memory / 512 * nPCIHostBridges + 8192);
+
+        if (usesVFIO)
+            memKB = baseLimit + passthroughLimit;
+        else
+            memKB = baseLimit;
+
+        goto done;
+    }
+
     /* For device passthrough using VFIO the guest memory and MMIO memory
      * regions need to be locked persistent in order to allow DMA.
      *
-- 
2.5.0
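
For reference, a quick way to sanity-check the arithmetic above is the
standalone sketch below. It is not part of the patch and does not use any
libvirt API; it just hardcodes a hypothetical ppc64 guest (16 GiB of RAM, no
<maxMemory>, a single PHB, one VFIO hostdev) and mirrors the KiB-based
computation of baseLimit, passthroughLimit and the resulting memKB. All
figures are purely illustrative.

/* Standalone sketch, not part of the patch: mirrors the ppc64 limit
 * arithmetic for a hypothetical guest (16 GiB RAM, no <maxMemory>,
 * one PHB, one VFIO hostdev). All quantities are in KiB, matching the
 * memKB variable in qemuDomainGetMlockLimitBytes(). */
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
    unsigned long long memory = 16ULL * 1024 * 1024;  /* 16 GiB in KiB */
    unsigned long long maxMemory = memory;            /* no <maxMemory> configured */
    unsigned long long nPCIHostBridges = 1;

    /* (a) hash table + (b) 32-bit DMA window accounting */
    unsigned long long baseLimit = maxMemory / 128 +
                                   4096 * nPCIHostBridges +
                                   8192;

    /* max of (c) pre-DDW window and (d)+(e) with-DDW accounting */
    unsigned long long passthroughLimit = MAX(2ULL * 1024 * 1024 * nPCIHostBridges,
                                              memory +
                                              memory / 512 * nPCIHostBridges + 8192);

    printf("baseLimit        = %llu KiB (~%llu MiB)\n", baseLimit, baseLimit / 1024);
    printf("passthroughLimit = %llu KiB (~%llu MiB)\n", passthroughLimit, passthroughLimit / 1024);
    printf("memKB with VFIO  = %llu KiB (~%llu MiB)\n",
           baseLimit + passthroughLimit, (baseLimit + passthroughLimit) / 1024);

    return 0;
}

For that made-up configuration the sketch prints a baseLimit of 140 MiB and a
passthroughLimit of just over 16 GiB, so the VFIO case ends up locking
slightly more than the guest's RAM, which is the expected order of magnitude.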