On Thu, Apr 7, 2022 at 12:39 PM Sean Christopherson <seanjc@xxxxxxxxxx> wrote:
>
> On Fri, Apr 01, 2022, David Matlack wrote:
> > ---
> >  arch/x86/kvm/mmu/tdp_mmu.c | 37 +++++++++++++++++++++++++------------
> >  1 file changed, 25 insertions(+), 12 deletions(-)
> >
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 9263765c8068..5a2120d85347 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -1131,6 +1131,10 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
> >          return 0;
> >  }
> >
> > +static int tdp_mmu_split_huge_page_atomic(struct kvm_vcpu *vcpu,
> > +                                          struct tdp_iter *iter,
> > +                                          bool account_nx);
> > +
> >  /*
> >   * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
> >   * page tables and SPTEs to translate the faulting guest physical address.
> > @@ -1140,6 +1144,7 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> >          struct kvm_mmu *mmu = vcpu->arch.mmu;
> >          struct tdp_iter iter;
> >          struct kvm_mmu_page *sp;
> > +        bool account_nx;
> >          int ret;
> >
> >          kvm_mmu_hugepage_adjust(vcpu, fault);
> > @@ -1155,28 +1160,22 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> >                  if (iter.level == fault->goal_level)
> >                          break;
> >
> > +                account_nx = fault->huge_page_disallowed &&
> > +                             fault->req_level >= iter.level;
> > +
> >                  /*
> >                   * If there is an SPTE mapping a large page at a higher level
> > -                 * than the target, that SPTE must be cleared and replaced
> > -                 * with a non-leaf SPTE.
> > +                 * than the target, split it down one level.
> >                   */
> >                  if (is_shadow_present_pte(iter.old_spte) &&
> >                      is_large_pte(iter.old_spte)) {
> > -                        if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
> > +                        if (tdp_mmu_split_huge_page_atomic(vcpu, &iter, account_nx))
>
> As Ben brought up in patch 2, this conflicts in nasty ways with Mingwei's series
> to more precisely check sp->lpage_disallowed.  There's apparently a bug in that
> code when using shadow paging, but assuming said bug isn't a blocking issue, I'd
> prefer to land this on top of Mingwei's series.
>
> With a bit of massaging, I think we can make the whole thing a bit more
> straightforward.  This is what I ended up with (compile tested only, your patch 2
> dropped, might split moving the "init" to a prep patch).  I'll give it a spin,
> and assuming it works and Mingwei's bug is resolved, I'll post this and Mingwei's
> series as a single thing.

Sean and I spoke offline. I'll wait for Mingwei to send another version of
his patches and then send a v2 of my series on top of that.

>
> ---
>  arch/x86/kvm/mmu/tdp_mmu.c | 99 ++++++++++++++++++--------------------
>  1 file changed, 48 insertions(+), 51 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f046af20f3d6..b0abf14570ea 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1126,6 +1126,9 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
>          return 0;
>  }
>
> +static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
> +                                   struct kvm_mmu_page *sp, bool shared);
> +
>  /*
>   * Handle a TDP page fault (NPT/EPT violation/misconfiguration) by installing
>   * page tables and SPTEs to translate the faulting guest physical address.
> @@ -1136,7 +1139,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>          struct kvm *kvm = vcpu->kvm;
>          struct tdp_iter iter;
>          struct kvm_mmu_page *sp;
> -        int ret;
> +        bool account_nx;
> +        int ret, r;
>
>          kvm_mmu_hugepage_adjust(vcpu, fault);
>
> @@ -1151,57 +1155,50 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
>                  if (iter.level == fault->goal_level)
>                          break;
>
> -                /*
> -                 * If there is an SPTE mapping a large page at a higher level
> -                 * than the target, that SPTE must be cleared and replaced
> -                 * with a non-leaf SPTE.
> -                 */
> +                /* Nothing to do if there's already a shadow page installed. */
>                  if (is_shadow_present_pte(iter.old_spte) &&
> -                    is_large_pte(iter.old_spte)) {
> -                        if (tdp_mmu_zap_spte_atomic(vcpu->kvm, &iter))
> -                                break;
> -
> -                        /*
> -                         * The iter must explicitly re-read the spte here
> -                         * because the new value informs the !present
> -                         * path below.
> -                         */
> -                        iter.old_spte = kvm_tdp_mmu_read_spte(iter.sptep);
> +                    !is_large_pte(iter.old_spte))
> +                        continue;
> +
> +                /*
> +                 * If the SPTE has been frozen by another thread, just give up
> +                 * and retry to avoid unnecessary page table alloc and free.
> +                 */
> +                if (is_removed_spte(iter.old_spte))
> +                        break;
> +
> +                /*
> +                 * The SPTE is either invalid or points at a huge page that
> +                 * needs to be split.
> +                 */
> +                sp = tdp_mmu_alloc_sp(vcpu);
> +                tdp_mmu_init_child_sp(sp, &iter);
> +
> +                account_nx = fault->huge_page_disallowed &&
> +                             fault->req_level >= iter.level;
> +
> +                sp->lpage_disallowed = account_nx;
> +                /*
> +                 * Ensure lpage_disallowed is visible before the SP is marked
> +                 * present (or not-huge), as mmu_lock is held for read.  Pairs
> +                 * with the smp_rmb() in disallowed_hugepage_adjust().
> +                 */
> +                smp_wmb();
> +
> +                if (!is_shadow_present_pte(iter.old_spte))
> +                        r = tdp_mmu_link_sp(kvm, &iter, sp, true);
> +                else
> +                        r = tdp_mmu_split_huge_page(kvm, &iter, sp, true);
> +
> +                if (r) {
> +                        tdp_mmu_free_sp(sp);
> +                        break;
>                  }
>
> -                if (!is_shadow_present_pte(iter.old_spte)) {
> -                        bool account_nx = fault->huge_page_disallowed &&
> -                                          fault->req_level >= iter.level;
> -
> -                        /*
> -                         * If SPTE has been frozen by another thread, just
> -                         * give up and retry, avoiding unnecessary page table
> -                         * allocation and free.
> -                         */
> -                        if (is_removed_spte(iter.old_spte))
> -                                break;
> -
> -                        sp = tdp_mmu_alloc_sp(vcpu);
> -                        tdp_mmu_init_child_sp(sp, &iter);
> -
> -                        sp->lpage_disallowed = account_nx;
> -                        /*
> -                         * Ensure lpage_disallowed is visible before the SP is
> -                         * marked present, as mmu_lock is held for read.  Pairs
> -                         * with the smp_rmb() in disallowed_hugepage_adjust().
> -                         */
> -                        smp_wmb();
> -
> -                        if (tdp_mmu_link_sp(kvm, &iter, sp, true)) {
> -                                tdp_mmu_free_sp(sp);
> -                                break;
> -                        }
> -
> -                        if (account_nx) {
> -                                spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> -                                __account_huge_nx_page(kvm, sp);
> -                                spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
> -                        }
> +                if (account_nx) {
> +                        spin_lock(&kvm->arch.tdp_mmu_pages_lock);
> +                        __account_huge_nx_page(kvm, sp);
> +                        spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
>                  }
>          }
>
> @@ -1472,8 +1469,6 @@ static int tdp_mmu_split_huge_page(struct kvm *kvm, struct tdp_iter *iter,
>          const int level = iter->level;
>          int ret, i;
>
> -        tdp_mmu_init_child_sp(sp, iter);
> -
>          /*
>           * No need for atomics when writing to sp->spt since the page table has
>           * not been linked in yet and thus is not reachable from any other CPU.
> @@ -1549,6 +1544,8 @@ static int tdp_mmu_split_huge_pages_root(struct kvm *kvm,
>                          continue;
>                  }
>
> +                tdp_mmu_init_child_sp(sp, &iter);
> +
>                  if (tdp_mmu_split_huge_page(kvm, &iter, sp, shared))
>                          goto retry;
>
>
> base-commit: f06d9d4f3d89912c40c57da45d64b9827d8580ac
> --
>
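
P.S. For anyone skimming the thread, the ordering rule in the comment above
the smp_wmb() (lpage_disallowed must be visible before the SP can be observed
as present/not-huge, pairing with the smp_rmb() in
disallowed_hugepage_adjust()) is the usual publish/observe pattern. Below is a
minimal, self-contained userspace sketch of that pattern, using C11
release/acquire atomics in place of the kernel barriers. The struct and field
names are made up for illustration only and are not the real KVM types.

/*
 * Sketch of the publish/observe ordering: the writer makes the
 * "lpage_disallowed" flag visible before the page becomes observable as
 * "present", and the reader only trusts the flag after seeing "present".
 * C11 release/acquire stands in for smp_wmb()/smp_rmb().
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_sp {
        bool lpage_disallowed;  /* plain store, published by "present" */
        atomic_bool present;    /* stands in for the SPTE becoming visible */
};

static struct fake_sp sp;

static void *writer(void *arg)
{
        (void)arg;
        sp.lpage_disallowed = true;
        /* Release store: everything above is visible once present is set. */
        atomic_store_explicit(&sp.present, true, memory_order_release);
        return NULL;
}

static void *reader(void *arg)
{
        (void)arg;
        /* Acquire load: pairs with the release store in writer(). */
        while (!atomic_load_explicit(&sp.present, memory_order_acquire))
                ;
        /* Guaranteed to observe lpage_disallowed == true here. */
        printf("lpage_disallowed = %d\n", sp.lpage_disallowed);
        return NULL;
}

int main(void)
{
        pthread_t w, r;

        pthread_create(&r, NULL, reader, NULL);
        pthread_create(&w, NULL, writer, NULL);
        pthread_join(w, NULL);
        pthread_join(r, NULL);
        return 0;
}

Built with something like "gcc -O2 -pthread sketch.c", the reader can only see
present == true after the writer's store to lpage_disallowed, which is the
same guarantee the smp_wmb()/smp_rmb() pair provides in the patch.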