On Fri, Sep 25, 2020 at 02:23:00PM -0700, Ben Gardon wrote: > +/* > + * Clear non-leaf SPTEs and free the page tables they point to, if those SPTEs > + * exist in order to allow execute access on a region that would otherwise be > + * mapped as a large page. > + */ > +void kvm_tdp_mmu_recover_nx_lpages(struct kvm *kvm) > +{ > + struct kvm_mmu_page *sp; > + bool flush; > + int rcu_idx; > + unsigned int ratio; > + ulong to_zap; > + u64 old_spte; > + > + rcu_idx = srcu_read_lock(&kvm->srcu); > + spin_lock(&kvm->mmu_lock); > + > + ratio = READ_ONCE(nx_huge_pages_recovery_ratio); > + to_zap = ratio ? DIV_ROUND_UP(kvm->stat.nx_lpage_splits, ratio) : 0; This is broken, and possibly related to Paolo's INIT_LIST_HEAD issue. The TDP MMU never increments nx_lpage_splits; it instead has its own counter, tdp_mmu_lpage_disallowed_page_count. Unless I'm missing something, nx_lpage_splits therefore stays at zero, so to_zap is guaranteed to be zero and thus this is completely untested. I don't see any reason for a separate tdp_mmu_lpage_disallowed_page_count: a single VM can't have both a legacy MMU and a TDP MMU, so it's not like there will be collisions with other code incrementing nx_lpage_splits. And the TDP MMU should be updating stats anyway. > + > + while (to_zap && > + !list_empty(&kvm->arch.tdp_mmu_lpage_disallowed_pages)) { > + /* > + * We use a separate list instead of just using active_mmu_pages > + * because the number of lpage_disallowed pages is expected to > + * be relatively small compared to the total. 
> + */ > + sp = list_first_entry(&kvm->arch.tdp_mmu_lpage_disallowed_pages, > + struct kvm_mmu_page, > + lpage_disallowed_link); > + > + old_spte = *sp->parent_sptep; > + *sp->parent_sptep = 0; > + > + list_del(&sp->lpage_disallowed_link); > + kvm->arch.tdp_mmu_lpage_disallowed_page_count--; > + > + handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), sp->gfn, > + old_spte, 0, sp->role.level + 1); > + > + flush = true; > + > + if (!--to_zap || need_resched() || > + spin_needbreak(&kvm->mmu_lock)) { > + flush = false; > + kvm_flush_remote_tlbs(kvm); > + if (to_zap) > + cond_resched_lock(&kvm->mmu_lock); > + } > + } > + > + if (flush) > + kvm_flush_remote_tlbs(kvm); > + > + spin_unlock(&kvm->mmu_lock); > + srcu_read_unlock(&kvm->srcu, rcu_idx); > +} > + > diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h > index 2ecb047211a6d..45ea2d44545db 100644 > --- a/arch/x86/kvm/mmu/tdp_mmu.h > +++ b/arch/x86/kvm/mmu/tdp_mmu.h > @@ -43,4 +43,6 @@ void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm, > > bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, > struct kvm_memory_slot *slot, gfn_t gfn); > + > +void kvm_tdp_mmu_recover_nx_lpages(struct kvm *kvm); > #endif /* __KVM_X86_MMU_TDP_MMU_H */ > -- > 2.28.0.709.gb0816b6eb0-goog >