> On 8 Oct 2018, at 21:29, Jim Mattson <jmattson@xxxxxxxxxx> wrote:
>
> When exception payloads are enabled by userspace (which is not yet
> possible) and a #PF is raised in L2, defer the setting of CR2 until
> the #PF is delivered. This allows the L1 hypervisor to intercept the
> fault before CR2 is modified.
>
> For backwards compatibility, when exception payloads are not enabled
> by userspace, kvm_multiple_exception modifies CR2 when the #PF
> exception is raised.
>
> Reported-by: Jim Mattson <jmattson@xxxxxxxxxx>
> Suggested-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
> Reviewed-by: Peter Shier <pshier@xxxxxxxxxx>
> ---
>  arch/x86/kvm/svm.c | 13 ++++++-------
>  arch/x86/kvm/vmx.c | 19 +++++++++----------
>  arch/x86/kvm/x86.c | 40 ++++++++++++++++++++++++++++++++++++----
>  arch/x86/kvm/x86.h |  2 ++
>  4 files changed, 53 insertions(+), 21 deletions(-)
>
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index d96092b35936..8b05a84cf8f2 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -805,6 +805,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
>  	    nested_svm_check_exception(svm, nr, has_error_code, error_code))
>  		return;
>
> +	kvm_deliver_exception_payload(&svm->vcpu);
> +
>  	if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
>  		unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
>
> @@ -2965,16 +2967,13 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
>  	svm->vmcb->control.exit_info_1 = error_code;
>
>  	/*
> -	 * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
> -	 * The fix is to add the ancillary datum (CR2 or DR6) to structs
> -	 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
> -	 * written only when inject_pending_event runs (DR6 would written here
> -	 * too). This should be conditional on a new capability---if the
> -	 * capability is disabled, kvm_multiple_exception would write the
> -	 * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
> +	 * EXITINFO2 is undefined for all exception intercepts other
> +	 * than #PF.
>  	 */
>  	if (svm->vcpu.arch.exception.nested_apf)
>  		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
> +	else if (svm->vcpu.arch.exception.has_payload)
> +		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
>  	else
>  		svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 06412ba46aa3..fc3f2d27b580 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -3275,27 +3275,24 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
>  {
>  	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
>  	unsigned int nr = vcpu->arch.exception.nr;
> +	bool has_payload = vcpu->arch.exception.has_payload;
> +	unsigned long payload = vcpu->arch.exception.payload;
>
>  	if (nr == PF_VECTOR) {
>  		if (vcpu->arch.exception.nested_apf) {
>  			*exit_qual = vcpu->arch.apf.nested_apf_token;
>  			return 1;
>  		}
> -		/*
> -		 * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
> -		 * The fix is to add the ancillary datum (CR2 or DR6) to structs
> -		 * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
> -		 * can be written only when inject_pending_event runs. This should be
> -		 * conditional on a new capability---if the capability is disabled,
> -		 * kvm_multiple_exception would write the ancillary information to
> -		 * CR2 or DR6, for backwards ABI-compatibility.
> -		 */
>  		if (nested_vmx_is_page_fault_vmexit(vmcs12,
>  						    vcpu->arch.exception.error_code)) {
> -			*exit_qual = vcpu->arch.cr2;
> +			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
>  			return 1;
>  		}
>  	} else {
> +		/*
> +		 * FIXME: we must not write DR6 when L1 intercepts an
> +		 * L2 #DB exception.
> +		 */
>  		if (vmcs12->exception_bitmap & (1u << nr)) {
>  			if (nr == DB_VECTOR)
>  				*exit_qual = vcpu->arch.dr6;
> @@ -3329,6 +3326,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
>  	u32 error_code = vcpu->arch.exception.error_code;
>  	u32 intr_info = nr | INTR_INFO_VALID_MASK;
>
> +	kvm_deliver_exception_payload(vcpu);
> +
>  	if (has_error_code) {
>  		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
>  		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 7c4e6845fe80..974f0784ac99 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -400,6 +400,26 @@ static int exception_type(int vector)
>  	return EXCPT_FAULT;
>  }
>
> +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
> +{
> +	unsigned nr = vcpu->arch.exception.nr;
> +	unsigned long has_payload = vcpu->arch.exception.has_payload;

has_payload should be defined as “bool” (i.e. “bool has_payload = vcpu->arch.exception.has_payload;”), to match the type of the has_payload field it is copied from.

> +	unsigned long payload = vcpu->arch.exception.payload;
> +
> +	if (!has_payload)
> +		return;
> +
> +	switch (nr) {
> +	case PF_VECTOR:
> +		vcpu->arch.cr2 = payload;
> +		break;
> +	}
> +
> +	vcpu->arch.exception.has_payload = false;
> +	vcpu->arch.exception.payload = 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
> +
>  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
>  		unsigned nr, bool has_error, u32 error_code,
>  		bool has_payload, unsigned long payload, bool reinject)
> @@ -433,6 +453,9 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
>  		vcpu->arch.exception.error_code = error_code;
>  		vcpu->arch.exception.has_payload = has_payload;
>  		vcpu->arch.exception.payload = payload;
> +		if (!vcpu->kvm->arch.exception_payload_enabled ||
> +		    !is_guest_mode(vcpu))
> +			kvm_deliver_exception_payload(vcpu);

I would add a comment in the code here explaining why payload delivery is deferred when the vCPU is in guest mode. I don’t think the rationale should be described only in the relevant commit messages. Something along the lines of the sketch below.
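(Just a sketch of the kind of comment I have in mind; the wording paraphrases your commit message, and the exact text is of course up to you.)

	vcpu->arch.exception.payload = payload;
	/*
	 * When exception payloads are enabled by userspace, payload
	 * delivery (e.g. writing CR2 for a #PF) is deferred for an
	 * exception raised in guest mode until the exception is
	 * actually injected, so that L1 can intercept the fault from
	 * L2 before the ancillary register is modified. Otherwise,
	 * deliver the payload immediately, for backwards
	 * ABI-compatibility.
	 */
	if (!vcpu->kvm->arch.exception_payload_enabled ||
	    !is_guest_mode(vcpu))
		kvm_deliver_exception_payload(vcpu);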
>  		return;
>  	}
>
> @@ -478,6 +501,13 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
>  }
>  EXPORT_SYMBOL_GPL(kvm_requeue_exception);
>
> +static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
> +				    u32 error_code, unsigned long payload)
> +{
> +	kvm_multiple_exception(vcpu, nr, true, error_code,
> +			       true, payload, false);
> +}
> +
>  int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
>  {
>  	if (err)
> @@ -494,11 +524,13 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
>  	++vcpu->stat.pf_guest;
>  	vcpu->arch.exception.nested_apf =
>  		is_guest_mode(vcpu) && fault->async_page_fault;
> -	if (vcpu->arch.exception.nested_apf)
> +	if (vcpu->arch.exception.nested_apf) {
>  		vcpu->arch.apf.nested_apf_token = fault->address;
> -	else
> -		vcpu->arch.cr2 = fault->address;
> -	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
> +		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
> +	} else {
> +		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
> +					fault->address);
> +	}
>  }
>  EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
>
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 67b9568613f3..224cd0a47568 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -266,6 +266,8 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
>
>  int handle_ud(struct kvm_vcpu *vcpu);
>
> +void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu);
> +
>  void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
>  u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
>  bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
> --
> 2.19.0.605.g01d371f741-goog
>

Looks good to me.
Reviewed-by: Liran Alon <liran.alon@xxxxxxxxxx>