[Patch v3 5/9] KVM: x86/mmu: Allocate TDP page table's page on correct NUMA node on split

Vipin Sharma <vipinsh@xxxxxxxxxx> · Wed, 21 Dec 2022 18:34:53 -0800

When dirty log is enabled, huge pages are split. Page table's pages
during the split are allocated based on the current thread NUMA node or
mempolicy. This causes inefficient page table accesses if underlying
page is on a different NUMA node

Allocate page table's pages on the same NUMA node as the underlying huge
page when dirty log is enabled and huge pages are split.

The performance gain during the pre-copy phase of live migrations of a
416 vCPUs and 11 TiB memory VM  on a 8 node host was seen in the range
of 130% to 150%.

Suggested-by: David Matlack <dmatlack@xxxxxxxxxx>
Signed-off-by: Vipin Sharma <vipinsh@xxxxxxxxxx>
---
 arch/x86/kvm/mmu/tdp_mmu.c | 12 ++++++++----
 include/linux/kvm_host.h   | 18 ++++++++++++++++++
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 4974fa96deff..376b8dceb3f9 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1403,7 +1403,7 @@ bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm,
 	return spte_set;
 }
 
-static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
+static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(int nid, gfp_t gfp)
 {
 	struct kvm_mmu_page *sp;
 
@@ -1413,7 +1413,8 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(gfp_t gfp)
 	if (!sp)
 		return NULL;
 
-	sp->spt = (void *)__get_free_page(gfp);
+	sp->spt = kvm_mmu_get_free_page(nid, gfp);
+
 	if (!sp->spt) {
 		kmem_cache_free(mmu_page_header_cache, sp);
 		return NULL;
@@ -1427,6 +1428,9 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
 						       bool shared)
 {
 	struct kvm_mmu_page *sp;
+	int nid;
+
+	nid = kvm_pfn_to_page_table_nid(spte_to_pfn(iter->old_spte));
 
 	/*
 	 * Since we are allocating while under the MMU lock we have to be
@@ -1437,7 +1441,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
 	 * If this allocation fails we drop the lock and retry with reclaim
 	 * allowed.
 	 */
-	sp = __tdp_mmu_alloc_sp_for_split(GFP_NOWAIT | __GFP_ACCOUNT);
+	sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_NOWAIT | __GFP_ACCOUNT);
 	if (sp)
 		return sp;
 
@@ -1449,7 +1453,7 @@ static struct kvm_mmu_page *tdp_mmu_alloc_sp_for_split(struct kvm *kvm,
 		write_unlock(&kvm->mmu_lock);
 
 	iter->yielded = true;
-	sp = __tdp_mmu_alloc_sp_for_split(GFP_KERNEL_ACCOUNT);
+	sp = __tdp_mmu_alloc_sp_for_split(nid, GFP_KERNEL_ACCOUNT);
 
 	if (shared)
 		read_lock(&kvm->mmu_lock);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d48064503b88..a262e15ebd19 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1583,6 +1583,24 @@ void kvm_arch_sync_events(struct kvm *kvm);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 
 struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn);
+
+/*
+ * Tells the appropriate NUMA node location of the page table's page based on
+ * pfn it will point to.
+ *
+ * Return the nid of the page if pfn is valid and backed by a refcounted page,
+ * otherwise, return the nearest memory node for the current CPU.
+ */
+static inline int kvm_pfn_to_page_table_nid(kvm_pfn_t pfn)
+{
+	struct page *page = kvm_pfn_to_refcounted_page(pfn);
+
+	if (page)
+		return page_to_nid(page);
+	else
+		return numa_mem_id();
+}
+
 bool kvm_is_zone_device_page(struct page *page);
 
 struct kvm_irq_ack_notifier {
-- 
2.39.0.314.g84b9a713c41-goog