On Thu, Dec 1, 2022 at 11:57 AM Vipin Sharma <vipinsh@xxxxxxxxxx> wrote:
>
> Page table pages of a VM are currently allocated based on the current
> task's NUMA node or its mempolicy. This can cause suboptimal remote
> accesses by the vCPU if it is accessing physical pages local to its NUMA
> node but the page table pages mapping those physical pages were created
> by some other vCPU which was on a different NUMA node or had a different
> policy.
>
> Allocate page table pages on the same NUMA node where the underlying
> physical page exists. Page tables at levels 5, 4, and 3 might not end up
> on the same NUMA node as they can span multiple NUMA nodes.
>
> Signed-off-by: Vipin Sharma <vipinsh@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |   4 +-
>  arch/x86/kvm/mmu.h              |   1 -
>  arch/x86/kvm/mmu/mmu.c          | 109 ++++++++++++++++++++++----------
>  arch/x86/kvm/mmu/paging_tmpl.h  |   4 +-
>  arch/x86/kvm/mmu/tdp_mmu.c      |  16 +++--
>  include/linux/kvm_host.h        |   2 +
>  include/linux/kvm_types.h       |   2 +
>  virt/kvm/kvm_main.c             |   7 +-
>  8 files changed, 101 insertions(+), 44 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 283cbb83d6ae..8a0293326abc 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -782,7 +782,7 @@ struct kvm_vcpu_arch {
>          struct kvm_mmu *walk_mmu;
>
>          struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
> -        struct kvm_mmu_memory_cache mmu_shadow_page_cache;
> +        struct kvm_mmu_memory_cache mmu_shadow_page_cache[MAX_NUMNODES];
>          struct kvm_mmu_memory_cache mmu_shadowed_info_cache;
>          struct kvm_mmu_memory_cache mmu_page_header_cache;
>
> @@ -1415,7 +1415,7 @@ struct kvm_arch {
>           *
>           * Protected by kvm->slots_lock.
>           */
> -        struct kvm_mmu_memory_cache split_shadow_page_cache;
> +        struct kvm_mmu_memory_cache split_shadow_page_cache[MAX_NUMNODES];
>          struct kvm_mmu_memory_cache split_page_header_cache;
>
>          /*
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index c960fb096e5c..6bdaacb6faa0 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -119,7 +119,6 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu);
>  void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu);
>  void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
>  void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu);
> -void *kvm_mmu_get_free_page(int nid, gfp_t gfp);
>
>  static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
>  {
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 0554dfc55553..ff7b17af8ab8 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -648,31 +648,43 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
>
>  static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
>  {
> -        int r;
> +        int r, i;

Nit: nid, node, or node_id would be more descriptive than just i.

>
>          /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */
>          r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
>                                         1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
>          if (r)
>                  return r;
> -        r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache,
> -                                       PT64_ROOT_MAX_LEVEL);
> -        if (r)
> -                return r;
> +
> +        for (i = 0; i < MAX_NUMNODES; i++) {
> +                if (node_online(i)) {
> +                        r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache[i],
> +                                                       PT64_ROOT_MAX_LEVEL);
> +                        if (r)
> +                                return r;
> +                }
> +        }
> +

Nice. I was worried about wasting cache memory we were never going to
use, but this works great to at least ensure we're not allocating
cached memory for nodes that don't exist. (For what it's worth, a small
for_each_online_node() variant of this loop is sketched at the bottom
of this mail.)

>          if (maybe_indirect) {
>                  r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadowed_info_cache,
>                                                 PT64_ROOT_MAX_LEVEL);
>                  if (r)
>                          return r;
>          }
> +
>          return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
>                                            PT64_ROOT_MAX_LEVEL);
>  }
>
>  static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
>  {
> +        int i;
> +
>          kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache);
> -        kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache);
> +
> +        for (i = 0; i < MAX_NUMNODES; i++)
> +                kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache[i]);
> +
>          kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadowed_info_cache);
>          kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
>  }
> @@ -2203,13 +2215,17 @@ static struct kvm_mmu_page *__kvm_mmu_get_shadow_page(struct kvm *kvm,
>
>  static struct kvm_mmu_page *kvm_mmu_get_shadow_page(struct kvm_vcpu *vcpu,
>                                                      gfn_t gfn,
> -                                                    union kvm_mmu_page_role role)
> +                                                    union kvm_mmu_page_role role,
> +                                                    int nid)
>  {
> -        struct shadow_page_caches caches = {
> -                .page_header_cache = &vcpu->arch.mmu_page_header_cache,
> -                .shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
> -                .shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
> -        };
> +        struct shadow_page_caches caches;
> +
> +        if (nid == NUMA_NO_NODE)
> +                nid = numa_mem_id();
> +
> +        caches.page_header_cache = &vcpu->arch.mmu_page_header_cache;
> +        caches.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache[nid];
> +        caches.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache;
>
>          return __kvm_mmu_get_shadow_page(vcpu->kvm, vcpu, &caches, gfn, role);
>  }
> @@ -2262,15 +2278,19 @@ static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
>
>  static struct kvm_mmu_page *kvm_mmu_get_child_sp(struct kvm_vcpu *vcpu,
>                                                   u64 *sptep, gfn_t gfn,
> -                                                 bool direct, unsigned int access)
> +                                                 bool direct, unsigned int access,
> +                                                 kvm_pfn_t pfn)
>  {
>          union kvm_mmu_page_role role;
> +        int nid;
>
>          if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
>                  return ERR_PTR(-EEXIST);
>
>          role = kvm_mmu_child_role(sptep, direct, access);
> -        return kvm_mmu_get_shadow_page(vcpu, gfn, role);
> +        nid = kvm_pfn_to_refcounted_page_nid(pfn);
> +
> +        return kvm_mmu_get_shadow_page(vcpu, gfn, role, nid);
>  }
>
>  static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
> @@ -3153,7 +3173,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                  if (it.level == fault->goal_level)
>                          break;
>
> -                sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
> +                sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true,
> +                                          ACC_ALL, fault->pfn);
>                  if (sp == ERR_PTR(-EEXIST))
>                          continue;
>
> @@ -3579,7 +3600,7 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, int quadrant,
>          WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
>          WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
>
> -        sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
> +        sp = kvm_mmu_get_shadow_page(vcpu, gfn, role, NUMA_NO_NODE);
>          ++sp->root_count;
>
>          return __pa(sp->spt);
> @@ -5853,15 +5874,20 @@ static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
>
>  int kvm_mmu_create(struct kvm_vcpu *vcpu)
>  {
> -        int ret;
> +        int ret, i;
>
>          vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
>          vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
> +        vcpu->arch.mmu_pte_list_desc_cache.node = NUMA_NO_NODE;
>
>          vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
>          vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
> +        vcpu->arch.mmu_page_header_cache.node = NUMA_NO_NODE;
>
> -        vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
> +        for (i = 0; i < MAX_NUMNODES; i++) {
> +                vcpu->arch.mmu_shadow_page_cache[i].gfp_zero = __GFP_ZERO;
> +                vcpu->arch.mmu_shadow_page_cache[i].node = i;
> +        }
>
>          vcpu->arch.mmu = &vcpu->arch.root_mmu;
>          vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
> @@ -6012,7 +6038,7 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
>  int kvm_mmu_init_vm(struct kvm *kvm)
>  {
>          struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
> -        int r;
> +        int r, i;
>
>          INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
>          INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
> @@ -6029,20 +6055,29 @@ int kvm_mmu_init_vm(struct kvm *kvm)
>
>          kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
>          kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
> +        kvm->arch.split_page_header_cache.node = NUMA_NO_NODE;
> +
> +        for (i = 0; i < MAX_NUMNODES; i++) {
> +                kvm->arch.split_shadow_page_cache[i].gfp_zero = __GFP_ZERO;
> +                kvm->arch.split_shadow_page_cache[i].node = i;
> +        }
>
> -        kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
>
>          kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
>          kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
> +        kvm->arch.split_desc_cache.node = NUMA_NO_NODE;
>
>          return 0;
>  }
>
>  static void mmu_free_vm_memory_caches(struct kvm *kvm)
>  {
> +        int i;
> +
>          kvm_mmu_free_memory_cache(&kvm->arch.split_desc_cache);
>          kvm_mmu_free_memory_cache(&kvm->arch.split_page_header_cache);
> -        kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache);
> +        for (i = 0; i < MAX_NUMNODES; i++)
> +                kvm_mmu_free_memory_cache(&kvm->arch.split_shadow_page_cache[i]);
>  }
>
>  void kvm_mmu_uninit_vm(struct kvm *kvm)
> @@ -6150,7 +6185,7 @@ static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min)
>          return kvm_mmu_memory_cache_nr_free_objects(cache) < min;
>  }
>
> -static bool need_topup_split_caches_or_resched(struct kvm *kvm)
> +static bool need_topup_split_caches_or_resched(struct kvm *kvm, int nid)
>  {
>          if (need_resched() || rwlock_needbreak(&kvm->mmu_lock))
>                  return true;
> @@ -6162,10 +6197,10 @@ static bool need_topup_split_caches_or_resched(struct kvm *kvm)
>           */
>          return need_topup(&kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) ||
>                 need_topup(&kvm->arch.split_page_header_cache, 1) ||
> -               need_topup(&kvm->arch.split_shadow_page_cache, 1);
> +               need_topup(&kvm->arch.split_shadow_page_cache[nid], 1);
>  }
>
> -static int topup_split_caches(struct kvm *kvm)
> +static int topup_split_caches(struct kvm *kvm, int nid)
>  {
>          /*
>           * Allocating rmap list entries when splitting huge pages for nested
> @@ -6195,16 +6230,19 @@ static int topup_split_caches(struct kvm *kvm)
>          if (r)
>                  return r;
>
> -        return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache, 1);
> +        return kvm_mmu_topup_memory_cache(&kvm->arch.split_shadow_page_cache[nid], 1);
>  }
>
> -static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *huge_sptep)
> +static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm,
> +                                                        u64 *huge_sptep,
> +                                                        u64 huge_spte)
>  {
>          struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
>          struct shadow_page_caches caches = {};
>          union kvm_mmu_page_role role;
>          unsigned int access;
>          gfn_t gfn;
> +        int nid;
>
>          gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
>          access = kvm_mmu_page_get_access(huge_sp, spte_index(huge_sptep));
> @@ -6217,9 +6255,13 @@ static struct kvm_mmu_page *shadow_mmu_get_sp_for_split(struct kvm *kvm, u64 *hu
>           */
>          role = kvm_mmu_child_role(huge_sptep, /*direct=*/true, access);
>
> +        nid = kvm_pfn_to_refcounted_page_nid(spte_to_pfn(huge_spte));
> +        if (nid == NUMA_NO_NODE)
> +                nid = numa_mem_id();
> +
>          /* Direct SPs do not require a shadowed_info_cache. */
>          caches.page_header_cache = &kvm->arch.split_page_header_cache;
> -        caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
> +        caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache[nid];
>
>          /* Safe to pass NULL for vCPU since requesting a direct SP. */
>          return __kvm_mmu_get_shadow_page(kvm, NULL, &caches, gfn, role);
> @@ -6238,7 +6280,7 @@ static void shadow_mmu_split_huge_page(struct kvm *kvm,
>          gfn_t gfn;
>          int index;
>
> -        sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
> +        sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep, huge_spte);
>
>          for (index = 0; index < SPTE_ENT_PER_PAGE; index++) {
>                  sptep = &sp->spt[index];
> @@ -6276,7 +6318,7 @@ static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
>                                            u64 *huge_sptep)
>  {
>          struct kvm_mmu_page *huge_sp = sptep_to_sp(huge_sptep);
> -        int level, r = 0;
> +        int level, r = 0, nid;
>          gfn_t gfn;
>          u64 spte;
>
> @@ -6284,13 +6326,16 @@ static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
>          gfn = kvm_mmu_page_get_gfn(huge_sp, spte_index(huge_sptep));
>          level = huge_sp->role.level;
>          spte = *huge_sptep;
> +        nid = kvm_pfn_to_refcounted_page_nid(spte_to_pfn(spte));
> +        if (nid == NUMA_NO_NODE)
> +                nid = numa_mem_id();
>
>          if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
>                  r = -ENOSPC;
>                  goto out;
>          }
>
> -        if (need_topup_split_caches_or_resched(kvm)) {
> +        if (need_topup_split_caches_or_resched(kvm, nid)) {
>                  write_unlock(&kvm->mmu_lock);
>                  cond_resched();
>                  /*
> @@ -6298,7 +6343,7 @@ static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
>                   * rmap iterator should be restarted because the MMU lock was
>                   * dropped.
>                   */
> -                r = topup_split_caches(kvm) ?: -EAGAIN;
> +                r = topup_split_caches(kvm, nid) ?: -EAGAIN;
>                  write_lock(&kvm->mmu_lock);
>                  goto out;
>          }
> @@ -6988,7 +7033,7 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
>                  kthread_stop(kvm->arch.nx_huge_page_recovery_thread);
>  }
>
> -void *kvm_mmu_get_free_page(int nid, gfp_t gfp)
> +void *kvm_arch_mmu_get_free_page(int nid, gfp_t gfp)
>  {
>          struct page *spt_page;
>          void *address = NULL;
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 0f6455072055..1b1039a1b178 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -652,7 +652,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>                  table_gfn = gw->table_gfn[it.level - 2];
>                  access = gw->pt_access[it.level - 2];
>                  sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn,
> -                                          false, access);
> +                                          false, access, fault->pfn);
>
>                  if (sp != ERR_PTR(-EEXIST)) {
>                          /*
> @@ -708,7 +708,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
>                  validate_direct_spte(vcpu, it.sptep, direct_access);
>
>                  sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn,
> -                                          true, direct_access);
> +                                          true, direct_access, fault->pfn);
>                  if (sp == ERR_PTR(-EEXIST))
>                          continue;
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 1607afbfcc0b..be0763e6b058 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -270,12 +270,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
>                           kvm_mmu_page_as_id(_root) != _as_id) {               \
>                  } else
>
> -static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu)
> +static struct kvm_mmu_page *tdp_mmu_alloc_sp(struct kvm_vcpu *vcpu, int nid)
>  {
>          struct kvm_mmu_page *sp;
>
> +        if (nid == NUMA_NO_NODE)
> +                nid = numa_mem_id();
> +
>          sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
> -        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache);
> +        sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache[nid]);
>
>          return sp;
>  }
> @@ -327,7 +330,7 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
>                  goto out;
>          }
>
> -        root = tdp_mmu_alloc_sp(vcpu);
> +        root = tdp_mmu_alloc_sp(vcpu, NUMA_NO_NODE);
>          tdp_mmu_init_sp(root, NULL, 0, role);
>
>          refcount_set(&root->tdp_mmu_root_count, 1);
> @@ -1159,7 +1162,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>          struct kvm *kvm = vcpu->kvm;
>          struct tdp_iter iter;
>          struct kvm_mmu_page *sp;
> -        int ret = RET_PF_RETRY;
> +        int ret = RET_PF_RETRY, nid;
>
>          kvm_mmu_hugepage_adjust(vcpu, fault);
>
> @@ -1188,11 +1191,12 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                      !is_large_pte(iter.old_spte))
>                          continue;
>
> +                nid = kvm_pfn_to_refcounted_page_nid(fault->pfn);
>                  /*
>                   * The SPTE is either non-present or points to a huge page that
>                   * needs to be split.
>                   */
> -                sp = tdp_mmu_alloc_sp(vcpu);
> +                sp = tdp_mmu_alloc_sp(vcpu, nid);
>                  tdp_mmu_init_child_sp(sp, &iter);
>
>                  sp->nx_huge_page_disallowed = fault->huge_page_disallowed;
> @@ -1423,7 +1427,7 @@ static struct kvm_mmu_page *__tdp_mmu_alloc_sp_for_split(int nid, gfp_t gfp)
>          if (!sp)
>                  return NULL;
>
> -        sp->spt = kvm_mmu_get_free_page(nid, gfp);
> +        sp->spt = kvm_arch_mmu_get_free_page(nid, gfp);
>
>          if (!sp->spt) {
>                  kmem_cache_free(mmu_page_header_cache, sp);
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 558ded73f660..07674955460b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1374,6 +1374,8 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible);
>
>  void kvm_flush_remote_tlbs(struct kvm *kvm);
>
> +void *kvm_arch_mmu_get_free_page(int nid, gfp_t gfp);
> +
>  #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
>  int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min);
>  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min);
> diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
> index 3ca3db020e0e..cb627cf1b4e1 100644
> --- a/include/linux/kvm_types.h
> +++ b/include/linux/kvm_types.h
> @@ -96,6 +96,8 @@ struct kvm_mmu_memory_cache {
>          struct kmem_cache *kmem_cache;
>          int capacity;
>          void **objects;
> +        /* Node on which memory should be allocated by default */
> +        int node;
>  };
>  #endif
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 1782c4555d94..4d59c9d48277 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -384,6 +384,11 @@ static void kvm_flush_shadow_all(struct kvm *kvm)
>          kvm_arch_guest_memory_reclaimed(kvm);
>  }
>
> +void * __weak kvm_arch_mmu_get_free_page(int nid, gfp_t gfp_flags)
> +{
> +        return (void *)__get_free_page(gfp_flags);
> +}
> +

Rather than making this __weak, you could use #ifdef CONFIG_NUMA to
just put all the code in the arch-neutral function. (Rough sketch at
the bottom of this mail.)

>  #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
>  static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
>                                                 gfp_t gfp_flags)
> @@ -393,7 +398,7 @@ static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
>          if (mc->kmem_cache)
>                  return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
>          else
> -                return (void *)__get_free_page(gfp_flags);
> +                return kvm_arch_mmu_get_free_page(mc->node, gfp_flags);
>  }
>
>  int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int capacity, int min)
> --
> 2.39.0.rc0.267.gcb52ba06e7-goog
>
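Tying together the two comments above on mmu_topup_memory_caches() (the
s/i/nid/ nit and the node_online() check): for what it's worth, the same
loop could also be written with for_each_online_node(), which drops one
level of indentation. Completely untested, just to show the shape I mean,
and note that for_each_online_node() (linux/nodemask.h) is not something
this patch itself uses:

        /* Fill the per-node shadow page caches, but only for online nodes. */
        for_each_online_node(nid) {
                r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache[nid],
                                               PT64_ROOT_MAX_LEVEL);
                if (r)
                        return r;
        }

Either spelling is fine by me; the node_online() version you have works.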
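And for the #ifdef CONFIG_NUMA comment above, this is roughly what I had
in mind, i.e. drop the __weak hook and keep the node handling entirely in
mmu_memory_cache_alloc_obj(). Untested sketch only: I'm writing the
surrounding context from memory, and assuming alloc_pages_node() and
page_address() are acceptable in this path.

static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
                                               gfp_t gfp_flags)
{
        gfp_flags |= mc->gfp_zero;

        if (mc->kmem_cache)
                return kmem_cache_alloc(mc->kmem_cache, gfp_flags);

#ifdef CONFIG_NUMA
        /* Honor the cache's preferred node when NUMA is enabled. */
        if (mc->node != NUMA_NO_NODE) {
                struct page *page = alloc_pages_node(mc->node, gfp_flags, 0);

                return page ? page_address(page) : NULL;
        }
#endif
        return (void *)__get_free_page(gfp_flags);
}

alloc_pages_node() treats the node as a preference rather than a hard
requirement, so under memory pressure this still falls back to other
nodes instead of failing the topup.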