On Fri, Aug 09, 2024 at 12:03:00PM -0700, Sean Christopherson wrote:
> Trigger KVM's various "unprotect gfn" paths if and only if the page fault
> was a write to a write-protected gfn. To do so, add a new page fault
> return code, RET_PF_WRITE_PROTECTED, to explicitly and precisely track
> such page faults.
> 
> If a page fault requires emulation for any MMIO (or any reason besides
> write-protection), trying to unprotect the gfn is pointless and risks
> putting the vCPU into an infinite loop. E.g. KVM will put the vCPU into
> an infinite loop if the vCPU manages to trigger MMIO on a page table walk.
> 
> Fixes: 147277540bbc ("kvm: svm: Add support for additional SVM NPF error codes")
> Cc: stable@xxxxxxxxxxxxxxx
> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx>
> ---
>  arch/x86/kvm/mmu/mmu.c          | 78 +++++++++++++++++++--------------
>  arch/x86/kvm/mmu/mmu_internal.h |  3 ++
>  arch/x86/kvm/mmu/mmutrace.h     |  1 +
>  arch/x86/kvm/mmu/paging_tmpl.h  |  2 +-
>  arch/x86/kvm/mmu/tdp_mmu.c      |  6 +--
>  5 files changed, 53 insertions(+), 37 deletions(-)
> 
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 901be9e420a4..e3aa04c498ea 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -2914,10 +2914,8 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
>  		trace_kvm_mmu_set_spte(level, gfn, sptep);
>  	}
>  
> -	if (wrprot) {
> -		if (write_fault)
> -			ret = RET_PF_EMULATE;
> -	}
> +	if (wrprot && write_fault)
> +		ret = RET_PF_WRITE_PROTECTED;
>  
>  	if (flush)
>  		kvm_flush_remote_tlbs_gfn(vcpu->kvm, gfn, level);
> @@ -4549,7 +4547,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  		return RET_PF_RETRY;
>  
>  	if (page_fault_handle_page_track(vcpu, fault))
> -		return RET_PF_EMULATE;
> +		return RET_PF_WRITE_PROTECTED;
>  
>  	r = fast_page_fault(vcpu, fault);
>  	if (r != RET_PF_INVALID)
> @@ -4642,7 +4640,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
>  	int r;
>  
>  	if (page_fault_handle_page_track(vcpu, fault))
> -		return RET_PF_EMULATE;
> +		return RET_PF_WRITE_PROTECTED;
>  
>  	r = fast_page_fault(vcpu, fault);
>  	if (r != RET_PF_INVALID)
> @@ -4726,6 +4724,9 @@ static int kvm_tdp_map_page(struct kvm_vcpu *vcpu, gpa_t gpa, u64 error_code,
>  	case RET_PF_EMULATE:
>  		return -ENOENT;
>  
> +	case RET_PF_WRITE_PROTECTED:
> +		return -EPERM;
> +
>  	case RET_PF_RETRY:
>  	case RET_PF_CONTINUE:
>  	case RET_PF_INVALID:
> @@ -5960,6 +5961,41 @@ void kvm_mmu_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new,
>  	write_unlock(&vcpu->kvm->mmu_lock);
>  }
>  
> +static int kvm_mmu_write_protect_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> +				       u64 error_code, int *emulation_type)
> +{
> +	bool direct = vcpu->arch.mmu->root_role.direct;
> +
> +	/*
> +	 * Before emulating the instruction, check if the error code
> +	 * was due to a RO violation while translating the guest page.
> +	 * This can occur when using nested virtualization with nested
> +	 * paging in both guests. If true, we simply unprotect the page
> +	 * and resume the guest.
> +	 */
> +	if (direct &&
> +	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
> +		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
> +		return RET_PF_FIXED;
> +	}
> +
> +	/*
> +	 * The gfn is write-protected, but if emulation fails we can still
> +	 * optimistically try to just unprotect the page and let the processor
> +	 * re-execute the instruction that caused the page fault. Do not allow
> +	 * retrying MMIO emulation, as it's not only pointless but could also
> +	 * cause us to enter an infinite loop because the processor will keep
> +	 * faulting on the non-existent MMIO address. Retrying an instruction
> +	 * from a nested guest is also pointless and dangerous as we are only
> +	 * explicitly shadowing L1's page tables, i.e. unprotecting something
> +	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
> +	 */
> +	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))

Looks like the mmio_info_in_cache() check can be removed here: with
RET_PF_WRITE_PROTECTED introduced, emulation shouldn't come here for
cached MMIO anymore, so maybe WARN_ON_ONCE() it instead?  A rough,
untested sketch of what I mean is at the end of this mail.

> +		*emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
> +
> +	return RET_PF_EMULATE;
> +}
> +
>  int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
>  		       void *insn, int insn_len)
>  {
> @@ -6005,6 +6041,10 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
>  	if (r < 0)
>  		return r;
>  
> +	if (r == RET_PF_WRITE_PROTECTED)
> +		r = kvm_mmu_write_protect_fault(vcpu, cr2_or_gpa, error_code,
> +						&emulation_type);
> +
>  	if (r == RET_PF_FIXED)
>  		vcpu->stat.pf_fixed++;
>  	else if (r == RET_PF_EMULATE)
> @@ -6015,32 +6055,6 @@ int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 err
>  	if (r != RET_PF_EMULATE)
>  		return 1;
>  
> -	/*
> -	 * Before emulating the instruction, check if the error code
> -	 * was due to a RO violation while translating the guest page.
> -	 * This can occur when using nested virtualization with nested
> -	 * paging in both guests. If true, we simply unprotect the page
> -	 * and resume the guest.
> -	 */
> -	if (vcpu->arch.mmu->root_role.direct &&
> -	    (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
> -		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
> -		return 1;
> -	}
> -
> -	/*
> -	 * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
> -	 * optimistically try to just unprotect the page and let the processor
> -	 * re-execute the instruction that caused the page fault. Do not allow
> -	 * retrying MMIO emulation, as it's not only pointless but could also
> -	 * cause us to enter an infinite loop because the processor will keep
> -	 * faulting on the non-existent MMIO address. Retrying an instruction
> -	 * from a nested guest is also pointless and dangerous as we are only
> -	 * explicitly shadowing L1's page tables, i.e. unprotecting something
> -	 * for L1 isn't going to magically fix whatever issue cause L2 to fail.
> -	 */
> -	if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
> -		emulation_type |= EMULTYPE_ALLOW_RETRY_PF;
>  emulate:
>  	return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
>  				       insn_len);
> diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
> index 1721d97743e9..50d2624111f8 100644
> --- a/arch/x86/kvm/mmu/mmu_internal.h
> +++ b/arch/x86/kvm/mmu/mmu_internal.h
> @@ -258,6 +258,8 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
>   * RET_PF_CONTINUE: So far, so good, keep handling the page fault.
>   * RET_PF_RETRY: let CPU fault again on the address.
>   * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
> + * RET_PF_WRITE_PROTECTED: the gfn is write-protected, either unprotected the
> + *                         gfn and retry, or emulate the instruction directly.
>   * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
>   * RET_PF_FIXED: The faulting entry has been fixed.
>   * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU.
> @@ -274,6 +276,7 @@ enum {
>  	RET_PF_CONTINUE = 0,
>  	RET_PF_RETRY,
>  	RET_PF_EMULATE,
> +	RET_PF_WRITE_PROTECTED,
>  	RET_PF_INVALID,
>  	RET_PF_FIXED,
>  	RET_PF_SPURIOUS,
> diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
> index 195d98bc8de8..f35a830ce469 100644
> --- a/arch/x86/kvm/mmu/mmutrace.h
> +++ b/arch/x86/kvm/mmu/mmutrace.h
> @@ -57,6 +57,7 @@
>  TRACE_DEFINE_ENUM(RET_PF_CONTINUE);
>  TRACE_DEFINE_ENUM(RET_PF_RETRY);
>  TRACE_DEFINE_ENUM(RET_PF_EMULATE);
> +TRACE_DEFINE_ENUM(RET_PF_WRITE_PROTECTED);
>  TRACE_DEFINE_ENUM(RET_PF_INVALID);
>  TRACE_DEFINE_ENUM(RET_PF_FIXED);
>  TRACE_DEFINE_ENUM(RET_PF_SPURIOUS);
> diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
> index 69941cebb3a8..a722a3c96af9 100644
> --- a/arch/x86/kvm/mmu/paging_tmpl.h
> +++ b/arch/x86/kvm/mmu/paging_tmpl.h
> @@ -805,7 +805,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
>  
>  	if (page_fault_handle_page_track(vcpu, fault)) {
>  		shadow_page_table_clear_flood(vcpu, fault->addr);
> -		return RET_PF_EMULATE;
> +		return RET_PF_WRITE_PROTECTED;
>  	}
>  
>  	r = mmu_topup_memory_caches(vcpu, true);
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index c7dc49ee7388..8bf44ac9372f 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1046,10 +1046,8 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
>  	 * protected, emulation is needed. If the emulation was skipped,
>  	 * the vCPU would have the same fault again.
>  	 */
> -	if (wrprot) {
> -		if (fault->write)
> -			ret = RET_PF_EMULATE;
> -	}
> +	if (wrprot && fault->write)
> +		ret = RET_PF_WRITE_PROTECTED;
>  
>  	/* If a MMIO SPTE is installed, the MMIO will need to be emulated. */
>  	if (unlikely(is_mmio_spte(vcpu->kvm, new_spte))) {
> -- 
> 2.46.0.76.ge559c4bf1a-goog
> 
> 
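To illustrate the WARN_ON_ONCE() idea above, a rough and completely
untested sketch for the tail of kvm_mmu_write_protect_fault(), reusing
the variables already in scope there; bailing out with RET_PF_EMULATE
when the warning fires is just my assumption about how an unexpected
MMIO fault should be handled:

	/*
	 * MMIO faults are expected to return RET_PF_EMULATE and never
	 * RET_PF_WRITE_PROTECTED, so cached MMIO shouldn't land here.
	 */
	if (WARN_ON_ONCE(mmio_info_in_cache(vcpu, cr2_or_gpa, direct)))
		return RET_PF_EMULATE;

	if (!is_guest_mode(vcpu))
		*emulation_type |= EMULTYPE_ALLOW_RETRY_PF;

	return RET_PF_EMULATE;

That keeps the current behaviour (no retry for MMIO or nested guests)
while making the "shouldn't happen" case explicit.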