Currently, vendor code patches the inject_page_fault callback on nested
VM entry and then, on vmexit, expects kvm_init_mmu to restore it. This
is brittle, as exposed by the fact that SVM's KVM_SET_NESTED_STATE
forgets to do it. Instead, do the check at the time a page fault
actually has to be injected. This does incur the cost of an extra
retpoline for nested vmexits when TDP is disabled, but it is overall
much cleaner. While at it, add a comment that explains why the
different behavior is needed in this case.

Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |  3 +++
 arch/x86/kvm/mmu/mmu.c          |  2 +-
 arch/x86/kvm/svm/nested.c       |  4 +---
 arch/x86/kvm/vmx/nested.c       |  4 +---
 arch/x86/kvm/x86.c              | 17 +++++++++++++++++
 5 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 713e08f62385..92855d3984a7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1508,6 +1508,8 @@ struct kvm_x86_nested_ops {
 	int (*enable_evmcs)(struct kvm_vcpu *vcpu,
 			    uint16_t *vmcs_version);
 	uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+				  struct x86_exception *fault);
 };
 
 struct kvm_x86_init_ops {
@@ -1747,6 +1749,7 @@ void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long pay
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+void kvm_inject_page_fault_shadow(struct kvm_vcpu *vcpu, struct x86_exception *fault);
 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 				    struct x86_exception *fault);
 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0e393506f4df..f3494dcc4e2f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4950,7 +4950,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
 
 	context->get_guest_pgd     = kvm_get_guest_cr3;
 	context->get_pdptr         = kvm_pdptr_read;
-	context->inject_page_fault = kvm_inject_page_fault;
+	context->inject_page_fault = kvm_inject_page_fault_shadow;
 }
 
 static union kvm_mmu_role
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 96bab464967f..ff58c9ebc552 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -680,9 +680,6 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
 	if (ret)
 		return ret;
 
-	if (!npt_enabled)
-		vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
-
 	if (!from_vmrun)
 		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
 
@@ -1571,4 +1568,5 @@ struct kvm_x86_nested_ops svm_nested_ops = {
 	.get_nested_state_pages = svm_get_nested_state_pages,
 	.get_state = svm_get_nested_state,
 	.set_state = svm_set_nested_state,
+	.inject_page_fault = svm_inject_page_fault_nested,
 };
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 1dfe23963a9e..564c60566da7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2615,9 +2615,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
 	}
 
-	if (!enable_ept)
-		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
 	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
 				     vmcs12->guest_ia32_perf_global_ctrl))) {
@@ -6807,4 +6804,5 @@ struct kvm_x86_nested_ops vmx_nested_ops = {
 	.write_log_dirty = nested_vmx_write_pml_buffer,
 	.enable_evmcs = nested_enable_evmcs,
 	.get_evmcs_version = nested_get_evmcs_version,
+	.inject_page_fault = vmx_inject_page_fault_nested,
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index da33d3a88a8d..1546a25a9307 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -746,6 +746,23 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
+void kvm_inject_page_fault_shadow(struct kvm_vcpu *vcpu,
+				  struct x86_exception *fault)
+{
+	/*
+	 * The core exception injection code is not able to combine
+	 * an exception with a vmexit; if a page fault happens while
+	 * a page fault exception is being delivered, the original
+	 * page fault would be changed incorrectly into a double
+	 * fault. To work around this, #PF vmexits are injected
+	 * without going through kvm_queue_exception.
+	 */
+	if (unlikely(is_guest_mode(vcpu)))
+		kvm_x86_ops.nested_ops->inject_page_fault(vcpu, fault);
+	else
+		kvm_inject_page_fault(vcpu, fault);
+}
+
 bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 				    struct x86_exception *fault)
 {
-- 
2.31.1