On Fri, Sep 06, 2024, Vipin Sharma wrote:
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 455caaaa04f5..fc597f66aa11 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -7317,8 +7317,8 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel
>  	return err;
>  }
>  
> -void kvm_recover_nx_huge_pages(struct kvm *kvm, struct list_head *pages,
> -			       unsigned long nr_pages)
> +void kvm_recover_nx_huge_pages(struct kvm *kvm, bool shared,
> +			       struct list_head *pages, unsigned long nr_pages)
>  {
>  	struct kvm_memory_slot *slot;
>  	int rcu_idx;
> @@ -7329,7 +7329,10 @@ void kvm_recover_nx_huge_pages(struct kvm *kvm, struct list_head *pages,
>  	ulong to_zap;
>  
>  	rcu_idx = srcu_read_lock(&kvm->srcu);
> -	write_lock(&kvm->mmu_lock);
> +	if (shared)

Hmm, what if we do this?

enum kvm_mmu_types {
	KVM_SHADOW_MMU,
#ifdef CONFIG_X86_64
	KVM_TDP_MMU,
#endif
	KVM_NR_MMU_TYPES,
};

#ifndef CONFIG_X86_64
#define KVM_TDP_MMU -1
#endif

And then this becomes:

	if (mmu_type == KVM_TDP_MMU)

> +		read_lock(&kvm->mmu_lock);
> +	else
> +		write_lock(&kvm->mmu_lock);
>  
>  	/*
>  	 * Zapping TDP MMU shadow pages, including the remote TLB flush, must
> @@ -7341,8 +7344,13 @@ void kvm_recover_nx_huge_pages(struct kvm *kvm, struct list_head *pages,
>  	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
>  	to_zap = ratio ? DIV_ROUND_UP(nr_pages, ratio) : 0;
>  	for ( ; to_zap; --to_zap) {
> -		if (list_empty(pages))
> +		if (tdp_mmu_enabled)

Shouldn't this be?

		if (shared)

Or if we do the above

		if (mmu_type == KVM_TDP_MMU)

Actually, better idea (sans comments):

	if (mmu_type == KVM_TDP_MMU) {
		read_lock(&kvm->mmu_lock);
		kvm_tdp_mmu_pages_lock(kvm);
	} else {
		write_lock(&kvm->mmu_lock);
	}

	rcu_read_lock();

	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
	to_zap = ratio ? DIV_ROUND_UP(possible_nx->nr_pages, ratio) : 0;
	for ( ; to_zap; --to_zap) {
		if (list_empty(possible_nx->pages))
			break;

		...

		/* Blah blah blah. */
		if (mmu_type == KVM_TDP_MMU)
			kvm_tdp_mmu_pages_unlock(kvm);

		...

		/* Blah blah blah. */
		if (mmu_type == KVM_TDP_MMU)
			kvm_tdp_mmu_pages_lock(kvm);
	}
	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);

	rcu_read_unlock();

	if (mmu_type == KVM_TDP_MMU) {
		kvm_tdp_mmu_pages_unlock(kvm);
		read_unlock(&kvm->mmu_lock);
	} else {
		write_unlock(&kvm->mmu_lock);
	}
	srcu_read_unlock(&kvm->srcu, rcu_idx);

> @@ -825,23 +835,51 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
>  	rcu_read_unlock();
>  }
>  
> -bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
> +bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,

This rename, and any refactoring that is associated with said rename, e.g.
comments, belongs in a separate patch.

> +					   struct kvm_mmu_page *sp)
>  {
> -	u64 old_spte;
> +	struct tdp_iter iter = {
> +		.old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
> +		.sptep = sp->ptep,
> +		.level = sp->role.level + 1,
> +		.gfn = sp->gfn,
> +		.as_id = kvm_mmu_page_as_id(sp),
> +	};
> +
> +	lockdep_assert_held_read(&kvm->mmu_lock);

Newline here, to isolate the lockdep assertion from the functional code.

> +	if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
> +		return false;
>  
>  	/*
> -	 * This helper intentionally doesn't allow zapping a root shadow page,
> -	 * which doesn't have a parent page table and thus no associated entry.
> +	 * Root shadow pages don't a parent page table and thus no associated

Missed a word or three.

> +	 * entry, but they can never be possible NX huge pages.
>  	 */
>  	if (WARN_ON_ONCE(!sp->ptep))
>  		return false;
>  
> -	old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
> -	if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
> +	/*
> +	 * Since mmu_lock is held in read mode, it's possible another task has
> +	 * already modified the SPTE. Zap the SPTE if and only if the SPTE
> +	 * points at the SP's page table, as checking shadow-present isn't
> +	 * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
> +	 * another SP. Note, spte_to_child_pt() also checks that the SPTE is
> +	 * shadow-present, i.e. guards against zapping a frozen SPTE.
> +	 */
> +	if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
>  		return false;
>  
> -	tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
> -			 SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
> +	/*
> +	 * If a different task modified the SPTE, then it should be impossible
> +	 * for the SPTE to still be used for the to-be-zapped SP. Non-leaf
> +	 * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
> +	 * creating non-leaf SPTEs, and all other bits are immutable for non-
> +	 * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
> +	 * zapping and replacement.
> +	 */
> +	if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
> +		WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
> +		return false;
> +	}
>  
>  	return true;
>  }
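
For what it's worth, here is a minimal standalone sketch of the mmu_type
dispatch I'm suggesting above.  It is NOT kernel code: the mock_*() helpers
are made-up stand-ins for kvm->mmu_lock and the TDP MMU pages lock so the
thing compiles on its own (with or without -DCONFIG_X86_64); only the enum
and the "mmu_type == KVM_TDP_MMU" branching mirror the actual proposal.

/* Standalone sketch, not kernel code.  mock_*() are invented stand-ins. */
#include <stdio.h>

enum kvm_mmu_types {
	KVM_SHADOW_MMU,
#ifdef CONFIG_X86_64
	KVM_TDP_MMU,
#endif
	KVM_NR_MMU_TYPES,
};

#ifndef CONFIG_X86_64
#define KVM_TDP_MMU -1
#endif

/* Stand-ins for operations on kvm->mmu_lock and the TDP MMU pages lock. */
static void mock_read_lock(void)		{ puts("read_lock(&kvm->mmu_lock)"); }
static void mock_read_unlock(void)		{ puts("read_unlock(&kvm->mmu_lock)"); }
static void mock_write_lock(void)		{ puts("write_lock(&kvm->mmu_lock)"); }
static void mock_write_unlock(void)		{ puts("write_unlock(&kvm->mmu_lock)"); }
static void mock_tdp_mmu_pages_lock(void)	{ puts("kvm_tdp_mmu_pages_lock(kvm)"); }
static void mock_tdp_mmu_pages_unlock(void)	{ puts("kvm_tdp_mmu_pages_unlock(kvm)"); }

static void recover_nx_huge_pages(int mmu_type)
{
	/*
	 * TDP MMU recovery runs with mmu_lock held for read plus the pages
	 * lock to protect the possible NX huge page list; the shadow MMU
	 * needs mmu_lock held for write.
	 */
	if (mmu_type == KVM_TDP_MMU) {
		mock_read_lock();
		mock_tdp_mmu_pages_lock();
	} else {
		mock_write_lock();
	}

	/*
	 * ... zap loop elided; per the snippet above, it would drop and
	 * re-take the pages lock around the actual zap ...
	 */

	if (mmu_type == KVM_TDP_MMU) {
		mock_tdp_mmu_pages_unlock();
		mock_read_unlock();
	} else {
		mock_write_unlock();
	}
}

int main(void)
{
	recover_nx_huge_pages(KVM_SHADOW_MMU);
#ifdef CONFIG_X86_64
	recover_nx_huge_pages(KVM_TDP_MMU);
#endif
	return 0;
}

Compile with or without -DCONFIG_X86_64 to see which locks each path would
take; without it, KVM_TDP_MMU falls back to -1 and the TDP MMU branches
compile away to dead code, which is the whole point of the #define trick.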