[PATCH 3/4] KVM: x86: correctly merge pending and injected exception

Maxim Levitsky <mlevitsk@xxxxxxxxxx> · Thu, 1 Apr 2021 17:38:16 +0300

Allow the pending and the injected exceptions to co-exist
when they are raised.

Add a 'kvm_deliver_pending_exception' function which 'merges' the pending
and the injected exception, or delivers a VM exit with both in the case
where L1 intercepts the pending exception.

The latter is done by vendor code using the new nested callback
'deliver_exception_as_vmexit'.

kvm_deliver_pending_exception is called after each VM exit and
prior to each VM entry, which ensures that during userspace VM exits
only an injected exception can be in a raised state.

Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |   9 ++
 arch/x86/kvm/svm/nested.c       |  27 ++--
 arch/x86/kvm/svm/svm.c          |   2 +-
 arch/x86/kvm/vmx/nested.c       |  58 ++++----
 arch/x86/kvm/vmx/vmx.c          |   2 +-
 arch/x86/kvm/x86.c              | 233 ++++++++++++++++++--------------
 6 files changed, 181 insertions(+), 150 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3b2fd276e8d5..a9b9cd030d9a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1346,6 +1346,15 @@ struct kvm_x86_ops {
 
 struct kvm_x86_nested_ops {
 	int (*check_events)(struct kvm_vcpu *vcpu);
+
+	/*
+	 * Deliver a pending exception as a VM exit if L1 intercepts it.
+	 * Returns -EBUSY if L1 does intercept the exception, but it is
+	 * not possible to deliver it right now (for example, when a
+	 * nested run is pending).
+	 */
+	int (*deliver_exception_as_vmexit)(struct kvm_vcpu *vcpu);
+
 	bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
 	void (*triple_fault)(struct kvm_vcpu *vcpu);
 	int (*get_state)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 7adad9b6dcad..ff745d59ffcf 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1061,21 +1061,6 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	if (vcpu->arch.pending_exception.valid) {
-		/*
-		 * Only a pending nested run can block a pending exception.
-		 * Otherwise an injected NMI/interrupt should either be
-		 * lost or delivered to the nested hypervisor in the EXITINTINFO
-		 * vmcb field, while delivering the pending exception.
-		 */
-		if (svm->nested.nested_run_pending)
-                        return -EBUSY;
-		if (!nested_exit_on_exception(svm))
-			return 0;
-		nested_svm_inject_exception_vmexit(svm);
-		return 0;
-	}
-
 	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
@@ -1107,6 +1092,17 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int svm_deliver_nested_exception_as_vmexit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (svm->nested.nested_run_pending)
+		return -EBUSY;
+	if (nested_exit_on_exception(svm))
+		nested_svm_inject_exception_vmexit(svm);
+	return 0;
+}
+
 int nested_svm_exit_special(struct vcpu_svm *svm)
 {
 	u32 exit_code = svm->vmcb->control.exit_code;
@@ -1321,6 +1317,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
 struct kvm_x86_nested_ops svm_nested_ops = {
 	.check_events = svm_check_nested_events,
 	.triple_fault = nested_svm_triple_fault,
+	.deliver_exception_as_vmexit = svm_deliver_nested_exception_as_vmexit,
 	.get_nested_state_pages = svm_get_nested_state_pages,
 	.get_state = svm_get_nested_state,
 	.set_state = svm_set_nested_state,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 90b541138c5a..b89e48574c39 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -363,7 +363,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu)
 	bool has_error_code = vcpu->arch.injected_exception.has_error_code;
 	u32 error_code = vcpu->arch.injected_exception.error_code;
 
-	kvm_deliver_exception_payload(vcpu);
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
 	if (nr == BP_VECTOR && !nrips) {
 		unsigned long rip, old_rip = kvm_rip_read(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5d54fecff9a7..1c09b132c55c 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3768,7 +3768,6 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long exit_qual;
 	bool block_nested_events =
 	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
 	bool mtf_pending = vmx->nested.mtf_pending;
@@ -3804,41 +3803,15 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	/*
-	 * Process any exceptions that are not debug traps before MTF.
-	 *
-	 * Note that only a pending nested run can block a pending exception.
-	 * Otherwise an injected NMI/interrupt should either be
-	 * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
-	 * while delivering the pending exception.
-	 */
-
-	if (vcpu->arch.pending_exception.valid && !vmx_pending_dbg_trap(vcpu)) {
-		if (vmx->nested.nested_run_pending)
-			return -EBUSY;
-		if (!nested_vmx_check_exception(vcpu, &exit_qual))
-			goto no_vmexit;
-		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-		return 0;
-	}
-
 	if (mtf_pending) {
 		if (block_nested_events)
 			return -EBUSY;
+
 		nested_vmx_update_pending_dbg(vcpu);
 		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
 		return 0;
 	}
 
-	if (vcpu->arch.pending_exception.valid) {
-		if (vmx->nested.nested_run_pending)
-			return -EBUSY;
-		if (!nested_vmx_check_exception(vcpu, &exit_qual))
-			goto no_vmexit;
-		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-		return 0;
-	}
-
 	if (nested_vmx_preemption_timer_pending(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
@@ -3884,6 +3857,34 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int nested_vmx_deliver_exception_as_vmexit(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long exit_qual;
+
+	if (vmx->nested.nested_run_pending)
+		return -EBUSY;
+
+	if (vmx->nested.mtf_pending && vmx_pending_dbg_trap(vcpu)) {
+		/*
+		 * A pending monitor trap takes precedence over pending
+		 * debug exception which is 'stashed' into
+		 * 'GUEST_PENDING_DBG_EXCEPTIONS'
+		 */
+
+		nested_vmx_update_pending_dbg(vcpu);
+		vmx->nested.mtf_pending = false;
+		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+		return 0;
+	}
+	if (vcpu->arch.pending_exception.valid) {
+		if (nested_vmx_check_exception(vcpu, &exit_qual))
+			nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+		return 0;
+	}
+	return 0;
+}
+
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 {
 	ktime_t remaining =
@@ -6603,6 +6604,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 
 struct kvm_x86_nested_ops vmx_nested_ops = {
 	.check_events = vmx_check_nested_events,
+	.deliver_exception_as_vmexit = nested_vmx_deliver_exception_as_vmexit,
 	.hv_timer_pending = nested_vmx_preemption_timer_pending,
 	.triple_fault = nested_vmx_triple_fault,
 	.get_state = vmx_get_nested_state,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index a9b241d2b271..fc6bc40d47b0 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1682,7 +1682,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
 	u32 error_code = vcpu->arch.injected_exception.error_code;
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-	kvm_deliver_exception_payload(vcpu);
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 493d87b0c2d5..a363204f37be 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -535,86 +535,30 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
 
 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
-		unsigned nr, bool has_error, u32 error_code,
-	        bool has_payload, unsigned long payload, bool reinject)
+				   unsigned int nr, bool has_error, u32 error_code,
+				   bool has_payload, unsigned long payload,
+				   bool reinject)
 {
-	u32 prev_nr;
-	int class1, class2;
-
+	struct kvm_queued_exception *exc;
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
-	if (!vcpu->arch.pending_exception.valid && !vcpu->arch.injected_exception.valid) {
-	queue:
-		if (reinject) {
-			/*
-			 * On vmentry, vcpu->arch.exception.pending is only
-			 * true if an event injection was blocked by
-			 * nested_run_pending.  In that case, however,
-			 * vcpu_enter_guest requests an immediate exit,
-			 * and the guest shouldn't proceed far enough to
-			 * need reinjection.
-			 */
-			WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
-			if (WARN_ON_ONCE(has_payload)) {
-				/*
-				 * A reinjected event has already
-				 * delivered its payload.
-				 */
-				has_payload = false;
-				payload = 0;
-			}
-
-			vcpu->arch.injected_exception.valid = true;
-			vcpu->arch.injected_exception.has_error_code = has_error;
-			vcpu->arch.injected_exception.nr = nr;
-			vcpu->arch.injected_exception.error_code = error_code;
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
+	WARN_ON_ONCE(reinject && vcpu->arch.injected_exception.valid);
 
-		} else {
-			vcpu->arch.pending_exception.valid = true;
-			vcpu->arch.injected_exception.valid = false;
-			vcpu->arch.pending_exception.has_error_code = has_error;
-			vcpu->arch.pending_exception.nr = nr;
-			vcpu->arch.pending_exception.error_code = error_code;
-		}
-
-		vcpu->arch.exception_payload.valid = has_payload;
-		vcpu->arch.exception_payload.value = payload;
-		if (!is_guest_mode(vcpu))
-			kvm_deliver_exception_payload(vcpu);
-		return;
-	}
-
-	/* to check exception */
-	prev_nr = vcpu->arch.injected_exception.nr;
-	if (prev_nr == DF_VECTOR) {
-		/* triple fault -> shutdown */
-		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-		return;
-	}
-	class1 = exception_class(prev_nr);
-	class2 = exception_class(nr);
-	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
-		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
-		/*
-		 * Generate double fault per SDM Table 5-5.  Set
-		 * exception.pending = true so that the double fault
-		 * can trigger a nested vmexit.
-		 */
-		vcpu->arch.pending_exception.valid = true;
-		vcpu->arch.injected_exception.valid = false;
-		vcpu->arch.pending_exception.has_error_code = true;
-		vcpu->arch.pending_exception.nr = DF_VECTOR;
-		vcpu->arch.pending_exception.error_code = 0;
+	exc = reinject ? &vcpu->arch.injected_exception :
+			 &vcpu->arch.pending_exception;
+	exc->valid = true;
+	exc->nr = nr;
+	exc->has_error_code = has_error;
+	exc->error_code = error_code;
 
-		vcpu->arch.exception_payload.valid = false;
-		vcpu->arch.exception_payload.value = 0;
-	} else
-		/* replace previous exception with a new one in a hope
-		   that instruction re-execution will regenerate lost
-		   exception */
-		goto queue;
+	// re-injected exception has its payload already delivered
+	WARN_ON_ONCE(reinject && has_payload);
+	vcpu->arch.exception_payload.valid = has_payload;
+	vcpu->arch.exception_payload.value = payload;
 }
 
+
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
 {
 	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
@@ -641,6 +585,95 @@ static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
 			       true, payload, false);
 }
 
+static int kvm_do_deliver_pending_exception(struct kvm_vcpu *vcpu)
+{
+	int class1, class2, ret;
+
+	/* try to deliver current pending exception as VM exit */
+	if (is_guest_mode(vcpu)) {
+		ret = kvm_x86_ops.nested_ops->deliver_exception_as_vmexit(vcpu);
+		if (ret || !vcpu->arch.pending_exception.valid)
+			return ret;
+	}
+
+	/* No injected exception, so just deliver the payload and inject it */
+	if (!vcpu->arch.injected_exception.valid) {
+		trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
+					vcpu->arch.pending_exception.has_error_code,
+					vcpu->arch.pending_exception.error_code);
+queue:
+		/* Intel SDM 17.3.1.1 */
+		if (exception_type(vcpu->arch.pending_exception.nr) == EXCPT_FAULT)
+			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+					     X86_EFLAGS_RF);
+
+		kvm_deliver_exception_payload(vcpu);
+
+		/* Intel SDM 17.2.4
+		 * The processor clears the GD flag upon entering to the
+		 * debug exception handler, to allow the handler access
+		 * to the debug registers.
+		 */
+		if (vcpu->arch.pending_exception.nr == DB_VECTOR) {
+			if (vcpu->arch.dr7 & DR7_GD) {
+				vcpu->arch.dr7 &= ~DR7_GD;
+				kvm_update_dr7(vcpu);
+			}
+		}
+
+		if (vcpu->arch.pending_exception.error_code && !is_protmode(vcpu))
+			vcpu->arch.pending_exception.error_code = false;
+
+		vcpu->arch.injected_exception = vcpu->arch.pending_exception;
+		vcpu->arch.pending_exception.valid = false;
+		return 0;
+	}
+
+	/* Convert a pending exception and an injected #DF to a triple fault*/
+	if (vcpu->arch.injected_exception.nr == DF_VECTOR) {
+		/* triple fault -> shutdown */
+		vcpu->arch.injected_exception.valid = false;
+		vcpu->arch.pending_exception.valid = false;
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+		return 0;
+	}
+
+	class1 = exception_class(vcpu->arch.injected_exception.nr);
+	class2 = exception_class(vcpu->arch.pending_exception.nr);
+
+	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+
+		/* Generate double fault per SDM Table 5-5. */
+		vcpu->arch.injected_exception.valid = false;
+		vcpu->arch.pending_exception.valid = true;
+		vcpu->arch.pending_exception.has_error_code = true;
+		vcpu->arch.pending_exception.nr = DF_VECTOR;
+		vcpu->arch.pending_exception.error_code = 0;
+		vcpu->arch.exception_payload.valid = false;
+	} else
+		/* Drop the injected exception and replace it with the pending one*/
+		goto queue;
+
+	return 0;
+}
+
+static int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
+{
+	int ret = 0;
+
+	if (!vcpu->arch.pending_exception.valid)
+		return ret;
+
+	ret = kvm_do_deliver_pending_exception(vcpu);
+
+	if (ret || !vcpu->arch.pending_exception.valid)
+		return ret;
+
+	WARN_ON_ONCE(vcpu->arch.pending_exception.nr != DF_VECTOR);
+	return kvm_do_deliver_pending_exception(vcpu);
+}
+
 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
 {
 	if (err)
@@ -4297,6 +4330,12 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	    vcpu->arch.pending_exception.valid && vcpu->arch.exception_payload.valid)
 		kvm_deliver_exception_payload(vcpu);
 
+	/*
+	 * Currently we merge the pending and the injected exceptions
+	 * after each VM exit, which can fail only when a nested run is pending,
+	 * in which case only an injected exception (from us or L1) is possible.
+	 */
+
 	WARN_ON_ONCE(vcpu->arch.pending_exception.valid &&
 		     vcpu->arch.injected_exception.valid);
 
@@ -8401,8 +8440,6 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
 
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.injected_exception.error_code && !is_protmode(vcpu))
-		vcpu->arch.injected_exception.error_code = false;
 	static_call(kvm_x86_queue_exception)(vcpu);
 }
 
@@ -8411,8 +8448,13 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 	int r;
 	bool can_inject = true;
 
-	/* try to reinject previous events if any */
+	r = kvm_deliver_pending_exception(vcpu);
+	if (r < 0)
+		goto busy;
+
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
+	/* try to reinject previous events if any */
 	if (vcpu->arch.injected_exception.valid) {
 		kvm_inject_exception(vcpu);
 		can_inject = false;
@@ -8431,7 +8473,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 	 * serviced prior to recognizing any new events in order to
 	 * fully complete the previous instruction.
 	 */
-	else if (!vcpu->arch.pending_exception.valid) {
+	else {
 		if (vcpu->arch.nmi_injected) {
 			static_call(kvm_x86_set_nmi)(vcpu);
 			can_inject = false;
@@ -8441,9 +8483,6 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 		}
 	}
 
-	WARN_ON_ONCE(vcpu->arch.pending_exception.valid &&
-		     vcpu->arch.injected_exception.valid);
-
 	/*
 	 * Call check_nested_events() even if we reinjected a previous event
 	 * in order for caller to determine if it should require immediate-exit
@@ -8456,30 +8495,6 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 			goto busy;
 	}
 
-	/* try to inject new event if pending */
-	if (vcpu->arch.pending_exception.valid) {
-		trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
-					vcpu->arch.pending_exception.has_error_code,
-					vcpu->arch.pending_exception.error_code);
-
-		vcpu->arch.injected_exception = vcpu->arch.pending_exception;
-		vcpu->arch.pending_exception.valid = false;
-
-		if (exception_type(vcpu->arch.injected_exception.nr) == EXCPT_FAULT)
-			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
-					     X86_EFLAGS_RF);
-
-		if (vcpu->arch.injected_exception.nr == DB_VECTOR) {
-			kvm_deliver_exception_payload(vcpu);
-			if (vcpu->arch.dr7 & DR7_GD) {
-				vcpu->arch.dr7 &= ~DR7_GD;
-				kvm_update_dr7(vcpu);
-			}
-		}
-
-		kvm_inject_exception(vcpu);
-		can_inject = false;
-	}
 
 	/*
 	 * Finally, inject interrupt events.  If an event cannot be injected
@@ -9270,6 +9285,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		kvm_lapic_sync_from_vapic(vcpu);
 
 	r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
+
+	/*
+	 * Deliver the pending exception so that the state of having a pending
+	 * and an injected exception is not visible to the userspace.
+	 */
+
+	kvm_deliver_pending_exception(vcpu);
+
 	return r;
 
 cancel_injection:
@@ -11014,7 +11037,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.pv.pv_unhalted)
 		return true;
 
-	if (vcpu->arch.pending_exception.valid)
+	if (vcpu->arch.pending_exception.valid || vcpu->arch.injected_exception.valid)
 		return true;
 
 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
-- 
2.26.2