[PATCH 4/4] kvm: WIP separation of injected and pending exception

Maxim Levitsky <mlevitsk@xxxxxxxxxx> · Thu, 25 Feb 2021 17:41:35 +0200

Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
---
 arch/x86/include/asm/kvm_host.h |  23 +-
 arch/x86/include/uapi/asm/kvm.h |  14 +-
 arch/x86/kvm/svm/nested.c       |  62 +++---
 arch/x86/kvm/svm/svm.c          |   8 +-
 arch/x86/kvm/vmx/nested.c       | 114 +++++-----
 arch/x86/kvm/vmx/vmx.c          |  14 +-
 arch/x86/kvm/x86.c              | 370 +++++++++++++++++++-------------
 arch/x86/kvm/x86.h              |   6 +-
 include/uapi/linux/kvm.h        |   1 +
 9 files changed, 367 insertions(+), 245 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4aa48fb55361d..190e245aa6670 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -637,16 +637,22 @@ struct kvm_vcpu_arch {
 
 	u8 event_exit_inst_len;
 
-	struct kvm_queued_exception {
-		bool pending;
-		bool injected;
+	struct kvm_pending_exception {
+		bool valid;
 		bool has_error_code;
 		u8 nr;
 		u32 error_code;
 		unsigned long payload;
 		bool has_payload;
 		u8 nested_apf;
-	} exception;
+	} pending_exception;
+
+	struct kvm_queued_exception {
+		bool valid;
+		bool has_error_code;
+		u8 nr;
+		u32 error_code;
+	} injected_exception;
 
 	struct kvm_queued_interrupt {
 		bool injected;
@@ -1018,6 +1024,7 @@ struct kvm_arch {
 
 	bool guest_can_read_msr_platform_info;
 	bool exception_payload_enabled;
+	bool exception_separate_injected_pending;
 
 	/* Deflect RDMSR and WRMSR to user space when they trigger a #GP */
 	u32 user_space_msr_mask;
@@ -1351,6 +1358,14 @@ struct kvm_x86_ops {
 
 struct kvm_x86_nested_ops {
 	int (*check_events)(struct kvm_vcpu *vcpu);
+
+	/*
+	 * return value: 0 - delivered vm exit, 1 - exception not intercepted,
+	 * negative - failure
+	 * */
+
+	int (*deliver_exception)(struct kvm_vcpu *vcpu);
+
 	bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
 	int (*get_state)(struct kvm_vcpu *vcpu,
 			 struct kvm_nested_state __user *user_kvm_nested_state,
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 5a3022c8af82b..9556e420e8ecb 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -345,9 +345,17 @@ struct kvm_vcpu_events {
 		__u8 smm_inside_nmi;
 		__u8 latched_init;
 	} smi;
-	__u8 reserved[27];
-	__u8 exception_has_payload;
-	__u64 exception_payload;
+
+	__u8 reserved[20];
+
+	struct {
+		__u32 error_code;
+		__u8 nr;
+		__u8 pad;
+		__u8 has_error_code;
+		__u8 has_payload;
+		__u64 payload;
+	} pending_exception;
 };
 
 /* for KVM_GET/SET_DEBUGREGS */
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 4c82abce0ea0c..9df01b6e2e091 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -315,15 +315,16 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
 	u32 exit_int_info = 0;
 	unsigned int nr;
 
-	if (vcpu->arch.exception.injected) {
-		nr = vcpu->arch.exception.nr;
+	if (vcpu->arch.injected_exception.valid) {
+		nr = vcpu->arch.injected_exception.nr;
 		exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
 
-		if (vcpu->arch.exception.has_error_code) {
+		if (vcpu->arch.injected_exception.has_error_code) {
 			exit_int_info |= SVM_EVTINJ_VALID_ERR;
 			vmcb12->control.exit_int_info_err =
-				vcpu->arch.exception.error_code;
+				vcpu->arch.injected_exception.error_code;
 		}
+		vcpu->arch.injected_exception.valid = false;
 
 	} else if (vcpu->arch.nmi_injected) {
 		exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
@@ -923,30 +924,30 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
 
 static bool nested_exit_on_exception(struct vcpu_svm *svm)
 {
-	unsigned int nr = svm->vcpu.arch.exception.nr;
+	unsigned int nr = svm->vcpu.arch.pending_exception.nr;
 
 	return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
 }
 
 static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
 {
-	unsigned int nr = svm->vcpu.arch.exception.nr;
+	unsigned int nr = svm->vcpu.arch.pending_exception.nr;
 
 	svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
 	svm->vmcb->control.exit_code_hi = 0;
 
-	if (svm->vcpu.arch.exception.has_error_code)
-		svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
+	if (svm->vcpu.arch.pending_exception.has_error_code)
+		svm->vmcb->control.exit_info_1 = svm->vcpu.arch.pending_exception.error_code;
 
 	/*
 	 * EXITINFO2 is undefined for all exception intercepts other
 	 * than #PF.
 	 */
 	if (nr == PF_VECTOR) {
-		if (svm->vcpu.arch.exception.nested_apf)
+		if (svm->vcpu.arch.pending_exception.nested_apf)
 			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
-		else if (svm->vcpu.arch.exception.has_payload)
-			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
+		else if (svm->vcpu.arch.pending_exception.has_payload)
+			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.pending_exception.payload;
 		else
 			svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
 	} else if (nr == DB_VECTOR) {
@@ -957,7 +958,7 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
 			kvm_update_dr7(&svm->vcpu);
 		}
 	} else
-		WARN_ON(svm->vcpu.arch.exception.has_payload);
+		WARN_ON(svm->vcpu.arch.pending_exception.has_payload);
 
 	nested_svm_vmexit(svm);
 }
@@ -1023,20 +1024,6 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	if (vcpu->arch.exception.pending) {
-		/*
-		 * Only pending nested run can block an pending exception
-		 * Otherwise an injected NMI/interrupt should either be
-		 * lost or delivered to the nested hypervisor in EXITINTINFO
-		 * */
-		if (svm->nested.nested_run_pending)
-                        return -EBUSY;
-		if (!nested_exit_on_exception(svm))
-			return 0;
-		nested_svm_inject_exception_vmexit(svm);
-		return 0;
-	}
-
 	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
 		if (block_nested_events)
 			return -EBUSY;
@@ -1063,7 +1050,29 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
 		nested_svm_intr(svm);
 		return 0;
 	}
+	return 0;
+}
+
+int svm_deliver_nested_exception(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	/*
+	 * Only pending exception can cause vm exit.
+	 * Injected exception are either already started delivery
+	 * or came from nested EVENTINJ which doesn't check intercepts
+	 */
+
+	if (!vcpu->arch.pending_exception.valid)
+		return 1;
+
+	if(svm->nested.nested_run_pending)
+		return -EBUSY;
+
+	if (!nested_exit_on_exception(svm))
+		return 1;
 
+	nested_svm_inject_exception_vmexit(svm);
 	return 0;
 }
 
@@ -1302,6 +1311,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
 
 struct kvm_x86_nested_ops svm_nested_ops = {
 	.check_events = svm_check_nested_events,
+	.deliver_exception = svm_deliver_nested_exception,
 	.get_nested_state_pages = svm_get_nested_state_pages,
 	.get_state = svm_get_nested_state,
 	.set_state = svm_set_nested_state,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index cdbbda37b9419..0a1857f5fe55e 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -363,11 +363,11 @@ static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 static void svm_queue_exception(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	unsigned nr = vcpu->arch.exception.nr;
-	bool has_error_code = vcpu->arch.exception.has_error_code;
-	u32 error_code = vcpu->arch.exception.error_code;
+	unsigned nr = vcpu->arch.injected_exception.nr;
+	bool has_error_code = vcpu->arch.injected_exception.has_error_code;
+	u32 error_code = vcpu->arch.injected_exception.error_code;
 
-	kvm_deliver_exception_payload(vcpu);
+	WARN_ON(vcpu->arch.pending_exception.valid);
 
 	if (nr == BP_VECTOR && !nrips) {
 		unsigned long rip, old_rip = kvm_rip_read(vcpu);
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 20ed1a351b2d9..be9c4e449aafd 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -388,17 +388,19 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	unsigned int nr = vcpu->arch.exception.nr;
-	bool has_payload = vcpu->arch.exception.has_payload;
-	unsigned long payload = vcpu->arch.exception.payload;
+	unsigned int nr = vcpu->arch.pending_exception.nr;
+	bool has_payload = vcpu->arch.pending_exception.has_payload;
+	unsigned long payload = vcpu->arch.pending_exception.payload;
+
+	/* injected exception doesn't need checking here */
 
 	if (nr == PF_VECTOR) {
-		if (vcpu->arch.exception.nested_apf) {
+		if (vcpu->arch.pending_exception.nested_apf) {
 			*exit_qual = vcpu->arch.apf.nested_apf_token;
 			return 1;
 		}
 		if (nested_vmx_is_page_fault_vmexit(vmcs12,
-						    vcpu->arch.exception.error_code)) {
+						    vcpu->arch.pending_exception.error_code)) {
 			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
 			return 1;
 		}
@@ -3621,8 +3623,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
 	u32 idt_vectoring;
 	unsigned int nr;
 
-	if (vcpu->arch.exception.injected) {
-		nr = vcpu->arch.exception.nr;
+	if (vcpu->arch.injected_exception.valid) {
+		nr = vcpu->arch.injected_exception.nr;
 		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
 
 		if (kvm_exception_is_soft(nr)) {
@@ -3632,10 +3634,10 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
 		} else
 			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
 
-		if (vcpu->arch.exception.has_error_code) {
+		if (vcpu->arch.injected_exception.has_error_code) {
 			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
 			vmcs12->idt_vectoring_error_code =
-				vcpu->arch.exception.error_code;
+				vcpu->arch.injected_exception.error_code;
 		}
 
 		vmcs12->idt_vectoring_info_field = idt_vectoring;
@@ -3716,11 +3718,11 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
 					       unsigned long exit_qual)
 {
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	unsigned int nr = vcpu->arch.exception.nr;
+	unsigned int nr = vcpu->arch.pending_exception.nr;
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-	if (vcpu->arch.exception.has_error_code) {
-		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+	if (vcpu->arch.pending_exception.has_error_code) {
+		vmcs12->vm_exit_intr_error_code = vcpu->arch.pending_exception.error_code;
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
 	}
 
@@ -3744,9 +3746,9 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
  */
 static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
 {
-	return vcpu->arch.exception.pending &&
-			vcpu->arch.exception.nr == DB_VECTOR &&
-			vcpu->arch.exception.payload;
+	return vcpu->arch.pending_exception.valid &&
+			vcpu->arch.pending_exception.nr == DB_VECTOR &&
+			vcpu->arch.pending_exception.payload;
 }
 
 /*
@@ -3760,7 +3762,7 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
 {
 	if (vmx_pending_dbg_trap(vcpu))
 		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
-			    vcpu->arch.exception.payload);
+			    vcpu->arch.pending_exception.payload);
 }
 
 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
@@ -3772,10 +3774,8 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
 static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned long exit_qual;
 	bool block_nested_events =
 	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
-	bool mtf_pending = vmx->nested.mtf_pending;
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	/*
@@ -3808,39 +3808,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	/*
-	 * Process any exceptions that are not debug traps before MTF.
-	 *
-	 * Note that only pending nested run can block an pending exception
-	 * Otherwise an injected NMI/interrupt should either be
-	 * lost or delivered to the nested hypervisor in EXITINTINFO
-	 */
-
-	if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
-		if (vmx->nested.nested_run_pending)
-			return -EBUSY;
-		if (!nested_vmx_check_exception(vcpu, &exit_qual))
-			goto no_vmexit;
-		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-		return 0;
-	}
-
-	if (mtf_pending) {
-		if (block_nested_events)
-			return -EBUSY;
-		nested_vmx_update_pending_dbg(vcpu);
-		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
-		return 0;
-	}
-
-	if (vcpu->arch.exception.pending) {
-		if (vmx->nested.nested_run_pending)
-			return -EBUSY;
-		if (!nested_vmx_check_exception(vcpu, &exit_qual))
-			goto no_vmexit;
-		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
-		return 0;
-	}
 
 	if (nested_vmx_preemption_timer_pending(vcpu)) {
 		if (block_nested_events)
@@ -3887,6 +3854,50 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int nested_vmx_deliver_nested_exception(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long exit_qual;
+
+
+	if (!vcpu->arch.pending_exception.valid && !vmx->nested.mtf_pending)
+		return 1;
+
+	if(vmx->nested.nested_run_pending)
+		return -EBUSY;
+
+	/*
+	 * Process any exceptions that are not debug traps before MTF.
+	 *
+	 * Note that only pending nested run can block an pending exception
+	 * Otherwise an injected NMI/interrupt should either be
+	 * lost or delivered to the nested hypervisor in EXITINTINFO
+	 */
+
+	if (vcpu->arch.pending_exception.valid && !vmx_pending_dbg_trap(vcpu)) {
+		if (!nested_vmx_check_exception(vcpu, &exit_qual))
+			goto no_vmexit;
+		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+		return 0;
+	}
+
+	if (vmx->nested.mtf_pending) {
+		/* TODO: check this */
+		nested_vmx_update_pending_dbg(vcpu);
+		nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+		return 0;
+	}
+
+	if (vcpu->arch.pending_exception.valid) {
+		if (!nested_vmx_check_exception(vcpu, &exit_qual))
+			goto no_vmexit;
+		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+		return 0;
+	}
+no_vmexit:
+	return 1;
+}
+
 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
 {
 	ktime_t remaining =
@@ -6598,6 +6609,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 
 struct kvm_x86_nested_ops vmx_nested_ops = {
 	.check_events = vmx_check_nested_events,
+	.deliver_exception = nested_vmx_deliver_nested_exception,
 	.hv_timer_pending = nested_vmx_preemption_timer_pending,
 	.get_state = vmx_get_nested_state,
 	.set_state = vmx_set_nested_state,
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f2714d22228de..d480bd48d786f 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1630,8 +1630,8 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
 	 * vmx_check_nested_events().
 	 */
 	if (nested_cpu_has_mtf(vmcs12) &&
-	    (!vcpu->arch.exception.pending ||
-	     vcpu->arch.exception.nr == DB_VECTOR))
+	    (!vcpu->arch.pending_exception.valid ||
+	     vcpu->arch.pending_exception.nr == DB_VECTOR))
 		vmx->nested.mtf_pending = true;
 	else
 		vmx->nested.mtf_pending = false;
@@ -1659,12 +1659,12 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
 static void vmx_queue_exception(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	unsigned nr = vcpu->arch.exception.nr;
-	bool has_error_code = vcpu->arch.exception.has_error_code;
-	u32 error_code = vcpu->arch.exception.error_code;
+	unsigned nr = vcpu->arch.injected_exception.nr;
+	bool has_error_code = vcpu->arch.injected_exception.has_error_code;
+	u32 error_code = vcpu->arch.injected_exception.error_code;
 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
 
-	kvm_deliver_exception_payload(vcpu);
+	WARN_ON(vcpu->arch.pending_exception.valid);
 
 	if (has_error_code) {
 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
@@ -5400,7 +5400,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 			return 0;
 
 		if (vmx->emulation_required && !vmx->rmode.vm86_active &&
-		    vcpu->arch.exception.pending) {
+		    vcpu->arch.pending_exception.valid) {
 			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 			vcpu->run->internal.suberror =
 						KVM_INTERNAL_ERROR_EMULATION;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a9d814a0b5e4f..eec62c0dafc36 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -480,9 +480,9 @@ static int exception_type(int vector)
 
 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
 {
-	unsigned nr = vcpu->arch.exception.nr;
-	bool has_payload = vcpu->arch.exception.has_payload;
-	unsigned long payload = vcpu->arch.exception.payload;
+	unsigned nr = vcpu->arch.pending_exception.nr;
+	bool has_payload = vcpu->arch.pending_exception.has_payload;
+	unsigned long payload = vcpu->arch.pending_exception.payload;
 
 	if (!has_payload)
 		return;
@@ -528,83 +528,130 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
 		break;
 	}
 
-	vcpu->arch.exception.has_payload = false;
-	vcpu->arch.exception.payload = 0;
+	vcpu->arch.pending_exception.has_payload = false;
+	vcpu->arch.pending_exception.payload = 0;
 }
 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
 
-static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
-		unsigned nr, bool has_error, u32 error_code,
-	        bool has_payload, unsigned long payload, bool reinject)
+
+/*
+ * Delivers exception payload and converts/merges current pending
+ * exception with injected exception (if any) and writes
+ * result to injected exception
+ */
+int kvm_deliver_pending_exception(struct kvm_vcpu *vcpu)
 {
-	u32 prev_nr;
-	int class1, class2;
+	while (vcpu->arch.pending_exception.valid) {
+		u32 prev_nr;
+		int class1, class2;
 
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
+		/* try to deliver current pending exception as VM exit */
+		if (is_guest_mode(vcpu)) {
+			int ret = kvm_x86_ops.nested_ops->deliver_exception(vcpu);
+			if (ret <= 0)
+				return ret;
+		}
 
-	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
-	queue:
-		if (reinject) {
-			/*
-			 * On vmentry, vcpu->arch.exception.pending is only
-			 * true if an event injection was blocked by
-			 * nested_run_pending.  In that case, however,
-			 * vcpu_enter_guest requests an immediate exit,
-			 * and the guest shouldn't proceed far enough to
-			 * need reinjection.
+		/* No injected exception, so just deliver the payload and inject it */
+		if (!vcpu->arch.injected_exception.valid) {
+
+			trace_kvm_inj_exception(vcpu->arch.pending_exception.nr,
+						vcpu->arch.pending_exception.has_error_code,
+						vcpu->arch.pending_exception.error_code);
+
+			/* Intel SDM 17.3.1.1 */
+			if (exception_type(vcpu->arch.pending_exception.nr) == EXCPT_FAULT)
+				__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
+						     X86_EFLAGS_RF);
+
+			kvm_deliver_exception_payload(vcpu);
+
+			/* Intel SDM 17.2.4
+			 * The processor clears the GD flag upon entering to the
+			 * debug exception handler, to allow the handler access
+			 * to the debug registers.
 			 */
-			WARN_ON_ONCE(vcpu->arch.exception.pending);
-			vcpu->arch.exception.injected = true;
-			if (WARN_ON_ONCE(has_payload)) {
-				/*
-				 * A reinjected event has already
-				 * delivered its payload.
-				 */
-				has_payload = false;
-				payload = 0;
+			if (vcpu->arch.pending_exception.nr == DB_VECTOR) {
+				if (vcpu->arch.dr7 & DR7_GD) {
+					vcpu->arch.dr7 &= ~DR7_GD;
+					kvm_update_dr7(vcpu);
+				}
 			}
-		} else {
-			vcpu->arch.exception.pending = true;
-			vcpu->arch.exception.injected = false;
+
+			if (vcpu->arch.pending_exception.error_code && !is_protmode(vcpu))
+				vcpu->arch.pending_exception.error_code = false;
+
+			vcpu->arch.pending_exception.valid = false;
+			vcpu->arch.injected_exception.valid = true;
+			vcpu->arch.injected_exception.has_error_code = vcpu->arch.pending_exception.has_error_code;
+			vcpu->arch.injected_exception.nr = vcpu->arch.pending_exception.nr;
+			vcpu->arch.injected_exception.error_code = vcpu->arch.pending_exception.error_code;
+			return 0;
+		}
+
+		/* Convert both pending and injected exception to triple fault*/
+		prev_nr = vcpu->arch.injected_exception.nr;
+		if (prev_nr == DF_VECTOR) {
+			/* triple fault -> shutdown */
+			vcpu->arch.injected_exception.valid = false;
+			vcpu->arch.pending_exception.valid = false;
+
+			/* TODO - make KVM_REQ_TRIPLE_FAULT inject vmexit when guest intercepts it */
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+			break;
+		}
+
+		class1 = exception_class(prev_nr);
+		class2 = exception_class(vcpu->arch.pending_exception.nr);
+
+		vcpu->arch.injected_exception.valid = false;
+
+		if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
+			|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+			/* Generate double fault per SDM Table 5-5. */
+			vcpu->arch.pending_exception.has_error_code = true;
+			vcpu->arch.pending_exception.nr = DF_VECTOR;
+			vcpu->arch.pending_exception.error_code = 0;
+			vcpu->arch.pending_exception.has_payload = false;
 		}
-		vcpu->arch.exception.has_error_code = has_error;
-		vcpu->arch.exception.nr = nr;
-		vcpu->arch.exception.error_code = error_code;
-		vcpu->arch.exception.has_payload = has_payload;
-		vcpu->arch.exception.payload = payload;
-		if (!is_guest_mode(vcpu))
-			kvm_deliver_exception_payload(vcpu);
-		return;
 	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_deliver_pending_exception);
 
-	/* to check exception */
-	prev_nr = vcpu->arch.exception.nr;
-	if (prev_nr == DF_VECTOR) {
-		/* triple fault -> shutdown */
-		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
-		return;
+static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+	        unsigned nr, bool has_error, u32 error_code,
+	        bool has_payload, unsigned long payload, bool reinject)
+{
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
+
+	if (reinject) {
+		// exceptions are re-injected right after VM exit,
+		// before we are able to generate another
+		// pending exception
+		if (WARN_ON_ONCE(vcpu->arch.pending_exception.valid))
+			return;
+		// it is not possible to inject more that one exception
+		if (WARN_ON_ONCE(vcpu->arch.injected_exception.valid))
+			return;
+		vcpu->arch.injected_exception.valid = true;
+		vcpu->arch.injected_exception.nr = nr;
+		vcpu->arch.injected_exception.has_error_code = has_error;
+		vcpu->arch.injected_exception.error_code = error_code;
+
+		// re-injected exception has its payload already delivered
+		WARN_ON_ONCE(has_payload);
+	} else {
+		// can't have more that one pending exception
+		if (WARN_ON_ONCE(vcpu->arch.pending_exception.valid))
+			return;
+		vcpu->arch.pending_exception.valid = true;
+		vcpu->arch.pending_exception.nr = nr;
+		vcpu->arch.pending_exception.has_error_code = has_error;
+		vcpu->arch.pending_exception.error_code = error_code;
+		vcpu->arch.pending_exception.has_payload = has_payload;
+		vcpu->arch.pending_exception.payload = payload;
 	}
-	class1 = exception_class(prev_nr);
-	class2 = exception_class(nr);
-	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
-		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
-		/*
-		 * Generate double fault per SDM Table 5-5.  Set
-		 * exception.pending = true so that the double fault
-		 * can trigger a nested vmexit.
-		 */
-		vcpu->arch.exception.pending = true;
-		vcpu->arch.exception.injected = false;
-		vcpu->arch.exception.has_error_code = true;
-		vcpu->arch.exception.nr = DF_VECTOR;
-		vcpu->arch.exception.error_code = 0;
-		vcpu->arch.exception.has_payload = false;
-		vcpu->arch.exception.payload = 0;
-	} else
-		/* replace previous exception with a new one in a hope
-		   that instruction re-execution will regenerate lost
-		   exception */
-		goto queue;
 }
 
 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
@@ -647,9 +694,9 @@ EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
 {
 	++vcpu->stat.pf_guest;
-	vcpu->arch.exception.nested_apf =
+	vcpu->arch.pending_exception.nested_apf =
 		is_guest_mode(vcpu) && fault->async_page_fault;
-	if (vcpu->arch.exception.nested_apf) {
+	if (vcpu->arch.pending_exception.nested_apf) {
 		vcpu->arch.apf.nested_apf_token = fault->address;
 		kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
 	} else {
@@ -4267,47 +4314,69 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	if (kvm_check_request(KVM_REQ_SMI, vcpu))
 		process_smi(vcpu);
 
+	events->exception.injected = 0;
+	events->exception.pending = 0;
+	events->pending_exception.has_payload = 0;
+	events->pending_exception.payload = 0;
+
+	/* In unlikely case when we have both pending and injected exception and userspace didn't enable
+	 * KVM_CAP_EXCEPTION_INJECTED_PENDING deliver the pending exception now
+	 */
+	if (!vcpu->kvm->arch.exception_separate_injected_pending) {
+		if (vcpu->arch.pending_exception.valid && vcpu->arch.injected_exception.valid)
+			if (kvm_deliver_pending_exception(vcpu) < 0)
+				/* in case the delivery fails, we
+				 * forget about the injected exception */
+				vcpu->arch.injected_exception.valid = false;
+	}
+
 	/*
-	 * In guest mode, payload delivery should be deferred,
-	 * so that the L1 hypervisor can intercept #PF before
-	 * CR2 is modified (or intercept #DB before DR6 is
-	 * modified under nVMX). Unless the per-VM capability,
+	 * Unless the per-VM capability,
 	 * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
 	 * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
 	 * opportunistically defer the exception payload, deliver it if the
 	 * capability hasn't been requested before processing a
 	 * KVM_GET_VCPU_EVENTS.
 	 */
+
 	if (!vcpu->kvm->arch.exception_payload_enabled &&
-	    vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
+	    vcpu->arch.pending_exception.valid && vcpu->arch.pending_exception.valid)
 		kvm_deliver_exception_payload(vcpu);
 
+	if (vcpu->arch.pending_exception.valid &&
+	    !kvm_exception_is_soft(vcpu->arch.pending_exception.nr)) {
+
+		events->exception.pending = true;
+		events->pending_exception.has_payload = vcpu->arch.pending_exception.has_payload;
+		events->pending_exception.payload = vcpu->arch.pending_exception.payload;
+
+		/* TODO: this code looks ugly */
+		if (vcpu->kvm->arch.exception_separate_injected_pending) {
+			events->pending_exception.has_error_code = vcpu->arch.pending_exception.has_error_code;
+			events->pending_exception.error_code = vcpu->arch.pending_exception.error_code;
+			events->pending_exception.nr = vcpu->arch.pending_exception.nr;
+		} else {
+			events->exception.has_error_code = vcpu->arch.pending_exception.has_error_code;
+			events->exception.error_code = vcpu->arch.pending_exception.error_code;
+			events->exception.nr = vcpu->arch.pending_exception.nr;
+		}
+	}
+
+	if (vcpu->arch.injected_exception.valid &&
+	    !kvm_exception_is_soft(vcpu->arch.injected_exception.nr)) {
+		events->exception.injected = true;
+		events->exception.nr = vcpu->arch.injected_exception.nr;
+		events->exception.has_error_code = vcpu->arch.injected_exception.has_error_code;
+		events->exception.error_code = vcpu->arch.injected_exception.error_code;
+	}
+
 	/*
-	 * The API doesn't provide the instruction length for software
-	 * exceptions, so don't report them. As long as the guest RIP
-	 * isn't advanced, we should expect to encounter the exception
-	 * again.
+	 * For ABI compatibility, deliberately conflate
+	 * pending and injected exceptions when
+	 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
 	 */
-	if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
-		events->exception.injected = 0;
-		events->exception.pending = 0;
-	} else {
-		events->exception.injected = vcpu->arch.exception.injected;
-		events->exception.pending = vcpu->arch.exception.pending;
-		/*
-		 * For ABI compatibility, deliberately conflate
-		 * pending and injected exceptions when
-		 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
-		 */
-		if (!vcpu->kvm->arch.exception_payload_enabled)
-			events->exception.injected |=
-				vcpu->arch.exception.pending;
-	}
-	events->exception.nr = vcpu->arch.exception.nr;
-	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
-	events->exception.error_code = vcpu->arch.exception.error_code;
-	events->exception_has_payload = vcpu->arch.exception.has_payload;
-	events->exception_payload = vcpu->arch.exception.payload;
+	if (!vcpu->kvm->arch.exception_payload_enabled)
+		events->exception.injected |= vcpu->arch.pending_exception.valid;
 
 	events->interrupt.injected =
 		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
@@ -4339,6 +4408,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 
 static void kvm_smm_changed(struct kvm_vcpu *vcpu);
 
+static bool is_valid_exception(int nr)
+{
+	return nr < 32 && nr != NMI_VECTOR;
+}
+
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 					      struct kvm_vcpu_events *events)
 {
@@ -4355,16 +4429,21 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 		if (events->exception.pending)
 			events->exception.injected = 0;
 		else
-			events->exception_has_payload = 0;
+			events->pending_exception.has_payload = 0;
 	} else {
 		events->exception.pending = 0;
-		events->exception_has_payload = 0;
+		events->pending_exception.has_payload = 0;
 	}
 
 	if ((events->exception.injected || events->exception.pending) &&
-	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+	    (!is_valid_exception(events->exception.nr)))
 		return -EINVAL;
 
+	if (vcpu->kvm->arch.exception_separate_injected_pending)
+		if (events->exception.pending &&
+		    !is_valid_exception(events->pending_exception.nr))
+			return -EINVAL;
+
 	/* INITs are latched while in SMM */
 	if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
 	    (events->smi.smm || events->smi.pending) &&
@@ -4372,13 +4451,30 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 
 	process_nmi(vcpu);
-	vcpu->arch.exception.injected = events->exception.injected;
-	vcpu->arch.exception.pending = events->exception.pending;
-	vcpu->arch.exception.nr = events->exception.nr;
-	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
-	vcpu->arch.exception.error_code = events->exception.error_code;
-	vcpu->arch.exception.has_payload = events->exception_has_payload;
-	vcpu->arch.exception.payload = events->exception_payload;
+
+	if (events->exception.injected) {
+		vcpu->arch.injected_exception.valid = true;
+		vcpu->arch.injected_exception.nr = events->exception.nr;
+		vcpu->arch.injected_exception.has_error_code = events->exception.has_error_code;
+		vcpu->arch.injected_exception.error_code = events->exception.error_code;
+	}
+
+	if (events->exception.pending) {
+		vcpu->arch.pending_exception.valid = true;
+
+		if (vcpu->kvm->arch.exception_separate_injected_pending) {
+			vcpu->arch.pending_exception.nr = events->pending_exception.nr;
+			vcpu->arch.pending_exception.has_error_code = events->pending_exception.has_error_code;
+			vcpu->arch.pending_exception.error_code = events->pending_exception.error_code;
+		} else {
+			vcpu->arch.pending_exception.nr = events->exception.nr;
+			vcpu->arch.pending_exception.has_error_code = events->exception.has_error_code;
+			vcpu->arch.pending_exception.error_code = events->exception.error_code;
+		}
+
+		vcpu->arch.pending_exception.has_payload = events->pending_exception.has_payload;
+		vcpu->arch.pending_exception.payload = events->pending_exception.payload;
+	}
 
 	vcpu->arch.interrupt.injected = events->interrupt.injected;
 	vcpu->arch.interrupt.nr = events->interrupt.nr;
@@ -5347,6 +5443,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		kvm->arch.exception_payload_enabled = cap->args[0];
 		r = 0;
 		break;
+	case KVM_CAP_EXCEPTION_INJECTED_PENDING:
+		kvm->arch.exception_separate_injected_pending = cap->args[0];
+		r = 0;
+		break;
+
 	case KVM_CAP_X86_USER_SPACE_MSR:
 		kvm->arch.user_space_msr_mask = cap->args[0];
 		r = 0;
@@ -8345,8 +8446,6 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 
 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
 {
-	if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
-		vcpu->arch.exception.error_code = false;
 	static_call(kvm_x86_queue_exception)(vcpu);
 }
 
@@ -8355,9 +8454,14 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 	int r;
 	bool can_inject = true;
 
-	/* try to reinject previous events if any */
+	r = kvm_deliver_pending_exception(vcpu);
+	if (r < 0)
+		goto busy;
+
+	WARN_ON_ONCE(vcpu->arch.pending_exception.valid);
 
-	if (vcpu->arch.exception.injected) {
+	/* try to reinject previous events if any */
+	if (vcpu->arch.injected_exception.valid) {
 		kvm_inject_exception(vcpu);
 		can_inject = false;
 	}
@@ -8375,7 +8479,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 	 * serviced prior to recognizing any new events in order to
 	 * fully complete the previous instruction.
 	 */
-	else if (!vcpu->arch.exception.pending) {
+	else {
 		if (vcpu->arch.nmi_injected) {
 			static_call(kvm_x86_set_nmi)(vcpu);
 			can_inject = false;
@@ -8385,9 +8489,6 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 		}
 	}
 
-	WARN_ON_ONCE(vcpu->arch.exception.injected &&
-		     vcpu->arch.exception.pending);
-
 	/*
 	 * Call check_nested_events() even if we reinjected a previous event
 	 * in order for caller to determine if it should require immediate-exit
@@ -8400,31 +8501,6 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 			goto busy;
 	}
 
-	/* try to inject new event if pending */
-	if (vcpu->arch.exception.pending) {
-		trace_kvm_inj_exception(vcpu->arch.exception.nr,
-					vcpu->arch.exception.has_error_code,
-					vcpu->arch.exception.error_code);
-
-		vcpu->arch.exception.pending = false;
-		vcpu->arch.exception.injected = true;
-
-		if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
-			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
-					     X86_EFLAGS_RF);
-
-		if (vcpu->arch.exception.nr == DB_VECTOR) {
-			kvm_deliver_exception_payload(vcpu);
-			if (vcpu->arch.dr7 & DR7_GD) {
-				vcpu->arch.dr7 &= ~DR7_GD;
-				kvm_update_dr7(vcpu);
-			}
-		}
-
-		kvm_inject_exception(vcpu);
-		can_inject = false;
-	}
-
 	/*
 	 * Finally, inject interrupt events.  If an event cannot be injected
 	 * due to architectural conditions (e.g. IF=0) a window-open exit
@@ -8482,7 +8558,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit
 	    kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
 		*req_immediate_exit = true;
 
-	WARN_ON(vcpu->arch.exception.pending);
+	WARN_ON(vcpu->arch.pending_exception.valid);
 	return;
 
 busy:
@@ -9584,7 +9660,7 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	kvm_rip_write(vcpu, regs->rip);
 	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
 
-	vcpu->arch.exception.pending = false;
+	vcpu->arch.pending_exception.valid = false;
 
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
@@ -9870,7 +9946,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
 	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
 		r = -EBUSY;
-		if (vcpu->arch.exception.pending)
+		if (vcpu->arch.pending_exception.valid)
 			goto out;
 		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
 			kvm_queue_exception(vcpu, DB_VECTOR);
@@ -10931,7 +11007,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.pv.pv_unhalted)
 		return true;
 
-	if (vcpu->arch.exception.pending)
+	if (vcpu->arch.pending_exception.valid || vcpu->arch.injected_exception.valid)
 		return true;
 
 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
@@ -11171,7 +11247,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 {
 	if (unlikely(!lapic_in_kernel(vcpu) ||
 		     kvm_event_needs_reinjection(vcpu) ||
-		     vcpu->arch.exception.pending))
+		     vcpu->arch.pending_exception.valid))
 		return false;
 
 	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index ee6e01067884d..e3848072c5bdb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -58,8 +58,8 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
 
 static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.exception.pending = false;
-	vcpu->arch.exception.injected = false;
+	vcpu->arch.pending_exception.valid = false;
+	vcpu->arch.injected_exception.valid = false;
 }
 
 static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
@@ -77,7 +77,7 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
 
 static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
 {
-	return vcpu->arch.exception.injected || vcpu->arch.interrupt.injected ||
+	return vcpu->arch.injected_exception.valid || vcpu->arch.interrupt.injected ||
 		vcpu->arch.nmi_injected;
 }
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 63f8f6e956487..d913a46d36b04 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1077,6 +1077,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_SYS_HYPERV_CPUID 191
 #define KVM_CAP_DIRTY_LOG_RING 192
 #define KVM_CAP_X86_BUS_LOCK_EXIT 193
+#define KVM_CAP_EXCEPTION_INJECTED_PENDING 194
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.26.2