On Fri, Jul 15, 2022, Sean Christopherson wrote:
> Drop the requirement that a pfn be backed by a refcounted, compound or
> ZONE_DEVICE, struct page, and instead rely solely on the host page
> tables to identify huge pages. The PageCompound() check is a remnant of
> an old implementation that identified (well, attempted to identify) huge
> pages without walking the host page tables. The ZONE_DEVICE check was
> added as an exception to the PageCompound() requirement. In other words,
> neither check is actually a hard requirement: if the primary MMU has a pfn
> backed with a huge page, then KVM can back the pfn with a huge page
> regardless of the backing store.
>
> Dropping the @pfn parameter will also allow KVM to query the max host
> mapping level without having to first get the pfn, which is advantageous
> for use outside of the page fault path where KVM wants to take action if
> and only if a page can be mapped huge, i.e. avoids the pfn lookup for
> gfns that can't be backed with a huge page.

Out of curiosity, when the host maps huge pages under a VMA with VM_PFNMAP,
they are basically out of the control of MM (I presume). So, when the drivers
that own those pages remove them from the host page tables, do we (KVM) get
mmu_notifier callbacks?

>
> Cc: Mingwei Zhang <mizhang@xxxxxxxxxx>
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>

Reviewed-by: Mingwei Zhang <mizhang@xxxxxxxxxx>

> ---
>  arch/x86/kvm/mmu/mmu.c          | 23 +++++------------------
>  arch/x86/kvm/mmu/mmu_internal.h |  2 +-
>  arch/x86/kvm/mmu/tdp_mmu.c      |  8 +-------
>  3 files changed, 7 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 52664c3caaab..bebff1d5acd4 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2919,11 +2919,10 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
>          __direct_pte_prefetch(vcpu, sp, sptep);
>  }
>
> -static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> +static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
>                                    const struct kvm_memory_slot *slot)
>  {
>          int level = PG_LEVEL_4K;
> -        struct page *page;
>          unsigned long hva;
>          unsigned long flags;
>          pgd_t pgd;
> @@ -2931,17 +2930,6 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>          pud_t pud;
>          pmd_t pmd;
>
> -        /*
> -         * Note, @slot must be non-NULL, i.e. the caller is responsible for
> -         * ensuring @pfn isn't garbage and is backed by a memslot.
> -         */
> -        page = kvm_pfn_to_refcounted_page(pfn);
> -        if (!page)
> -                return PG_LEVEL_4K;
> -
> -        if (!PageCompound(page) && !kvm_is_zone_device_page(page))
> -                return PG_LEVEL_4K;
> -
>          /*
>           * Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
>           * is not solely for performance, it's also necessary to avoid the
> @@ -2994,7 +2982,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
>
>  int kvm_mmu_max_mapping_level(struct kvm *kvm,
>                                const struct kvm_memory_slot *slot, gfn_t gfn,
> -                              kvm_pfn_t pfn, int max_level)
> +                              int max_level)
>  {
>          struct kvm_lpage_info *linfo;
>          int host_level;
> @@ -3009,7 +2997,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm,
>          if (max_level == PG_LEVEL_4K)
>                  return PG_LEVEL_4K;
>
> -        host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot);
> +        host_level = host_pfn_mapping_level(kvm, gfn, slot);
>          return min(host_level, max_level);
>  }
>
> @@ -3034,8 +3022,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>           * level, which will be used to do precise, accurate accounting.
>           */
>          fault->req_level = kvm_mmu_max_mapping_level(vcpu->kvm, slot,
> -                                                     fault->gfn, fault->pfn,
> -                                                     fault->max_level);
> +                                                     fault->gfn, fault->max_level);
>          if (fault->req_level == PG_LEVEL_4K || fault->huge_page_disallowed)
>                  return;
>
> @@ -6406,7 +6393,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
>                   */
>                  if (sp->role.direct &&
>                      sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn,
> -                                                               pfn, PG_LEVEL_NUM)) {
> +                                                               PG_LEVEL_NUM)) {
>                          pte_list_remove(kvm, rmap_head, sptep);
>
>                          if (kvm_available_flush_tlb_with_range())
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index ae2d660e2dab..582def531d4d 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -309,7 +309,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
>
>  int kvm_mmu_max_mapping_level(struct kvm *kvm,
>                                const struct kvm_memory_slot *slot, gfn_t gfn,
> -                              kvm_pfn_t pfn, int max_level);
> +                              int max_level);
>  void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>  void disallowed_hugepage_adjust(struct kvm_page_fault *fault, u64 spte, int cur_level);
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f3a430d64975..d75d93edc40a 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1733,7 +1733,6 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>          gfn_t end = start + slot->npages;
>          struct tdp_iter iter;
>          int max_mapping_level;
> -        kvm_pfn_t pfn;
>
>          rcu_read_lock();
>
> @@ -1745,13 +1744,8 @@ static void zap_collapsible_spte_range(struct kvm *kvm,
>                      !is_last_spte(iter.old_spte, iter.level))
>                          continue;
>
> -                /*
> -                 * This is a leaf SPTE. Check if the PFN it maps can
> -                 * be mapped at a higher level.
> -                 */
> -                pfn = spte_to_pfn(iter.old_spte);
>                  max_mapping_level = kvm_mmu_max_mapping_level(kvm, slot,
> -                                                              iter.gfn, pfn, PG_LEVEL_NUM);
> +                                                              iter.gfn, PG_LEVEL_NUM);
>
>                  WARN_ON(max_mapping_level < iter.level);
>
> --
> 2.37.0.170.g444d1eabd0-goog
>
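
For anyone reading along without the file open: after this patch, all that is
left in host_pfn_mapping_level() is a lockless walk of the host page tables,
with the mapping level derived purely from the host PTEs. The sketch below is
a paraphrase of that walk, not a verbatim copy of mmu.c, so exact comments and
details may differ:

static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
                                  const struct kvm_memory_slot *slot)
{
        int level = PG_LEVEL_4K;
        unsigned long hva;
        unsigned long flags;
        pgd_t pgd;
        p4d_t p4d;
        pud_t pud;
        pmd_t pmd;

        /* Translate the gfn to a host virtual address via the memslot. */
        hva = __gfn_to_hva_memslot(slot, gfn);

        /*
         * Disable IRQs so the primary MMU can't tear down the page tables
         * while they are being walked locklessly.
         */
        local_irq_save(flags);

        /* Read each entry exactly once; it can change under us at any time. */
        pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
        if (pgd_none(pgd))
                goto out;

        p4d = READ_ONCE(*p4d_offset(&pgd, hva));
        if (p4d_none(p4d) || !p4d_present(p4d))
                goto out;

        pud = READ_ONCE(*pud_offset(&p4d, hva));
        if (pud_none(pud) || !pud_present(pud))
                goto out;

        if (pud_large(pud)) {
                level = PG_LEVEL_1G;
                goto out;
        }

        pmd = READ_ONCE(*pmd_offset(&pud, hva));
        if (pmd_none(pmd) || !pmd_present(pmd))
                goto out;

        if (pmd_large(pmd))
                level = PG_LEVEL_2M;

out:
        local_irq_restore(flags);
        return level;
}

I.e. with the struct-page checks gone, the only inputs are the gfn->hva
translation from the memslot and the host page table entries themselves, which
is what lets the callers above query kvm_mmu_max_mapping_level() without first
doing a pfn lookup.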