[PATCH 2/2] x86, apicv: Add Posted Interrupt support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Yang Zhang <yang.z.zhang@xxxxxxxxx>

Posted Interrupt allows APIC interrupts to be injected into the guest
directly, without any VM exit.

- When delivering an interrupt to the guest, if the target vcpu is running,
  update the Posted-interrupt requests bitmap and send a notification event
  to the vcpu. The vcpu then handles the interrupt automatically,
  without any software involvement.

- If the target vcpu is not running, or there is already a notification
  event pending in the vcpu, do nothing. The interrupt will be handled
  by the next VM entry.

Signed-off-by: Yang Zhang <yang.z.zhang@xxxxxxxxx>
---
 arch/x86/include/asm/entry_arch.h  |    1 +
 arch/x86/include/asm/hw_irq.h      |    1 +
 arch/x86/include/asm/irq.h         |    1 +
 arch/x86/include/asm/irq_vectors.h |    4 +
 arch/x86/include/asm/kvm_host.h    |    3 +
 arch/x86/include/asm/vmx.h         |    4 +
 arch/x86/kernel/entry_64.S         |    2 +
 arch/x86/kernel/irq.c              |   25 +++++++
 arch/x86/kernel/irqinit.c          |    2 +
 arch/x86/kvm/lapic.c               |   16 +++-
 arch/x86/kvm/lapic.h               |    1 +
 arch/x86/kvm/vmx.c                 |  133 +++++++++++++++++++++++++++++++++---
 12 files changed, 180 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 40afa00..7b0a29e 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -18,6 +18,7 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
 #endif
 
 BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
+BUILD_INTERRUPT(posted_intr_ipi, POSTED_INTR_VECTOR)
 
 /*
  * every pentium local APIC has two 'local interrupts', with a
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index eb92a6e..ee61af3 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -28,6 +28,7 @@
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
 extern void x86_platform_ipi(void);
+extern void posted_intr_ipi(void);
 extern void error_interrupt(void);
 extern void irq_work_interrupt(void);
 
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ba870bb..cff9933 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -30,6 +30,7 @@ extern void irq_force_complete_move(int);
 #endif
 
 extern void (*x86_platform_ipi_callback)(void);
+extern void (*posted_intr_callback)(void);
 extern void native_init_IRQ(void);
 extern bool handle_irq(unsigned irq, struct pt_regs *regs);
 
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 1508e51..8f2e383 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -102,6 +102,10 @@
  */
 #define X86_PLATFORM_IPI_VECTOR		0xf7
 
+#ifdef CONFIG_HAVE_KVM
+#define POSTED_INTR_VECTOR 		0xf2
+#endif
+
 /*
  * IRQ work vector:
  */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7e26d1a..82423a8 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -700,6 +700,9 @@ struct kvm_x86_ops {
 	int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
 	void (*update_irq)(struct kvm_vcpu *vcpu);
 	void (*update_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector, bool set);
+	int (*has_posted_interrupt)(struct kvm_vcpu *vcpu);
+	int (*send_nv)(struct kvm_vcpu *vcpu, int vector);
+	void (*update_irr)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 1003341..7b9e1d0 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -152,6 +152,7 @@
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
 #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
+#define PIN_BASED_POSTED_INTR                   0x00000080
 
 #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
@@ -174,6 +175,7 @@
 /* VMCS Encodings */
 enum vmcs_field {
 	VIRTUAL_PROCESSOR_ID            = 0x00000000,
+	POSTED_INTR_NV                  = 0x00000002,
 	GUEST_ES_SELECTOR               = 0x00000800,
 	GUEST_CS_SELECTOR               = 0x00000802,
 	GUEST_SS_SELECTOR               = 0x00000804,
@@ -208,6 +210,8 @@ enum vmcs_field {
 	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
 	APIC_ACCESS_ADDR		= 0x00002014,
 	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
+	POSTED_INTR_DESC_ADDR           = 0x00002016,
+	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
 	EPT_POINTER                     = 0x0000201a,
 	EPT_POINTER_HIGH                = 0x0000201b,
 	EOI_EXIT_BITMAP0                = 0x0000201c,
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b51b2c7..d06eea1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1160,6 +1160,8 @@ apicinterrupt LOCAL_TIMER_VECTOR \
 	apic_timer_interrupt smp_apic_timer_interrupt
 apicinterrupt X86_PLATFORM_IPI_VECTOR \
 	x86_platform_ipi smp_x86_platform_ipi
+apicinterrupt POSTED_INTR_VECTOR \
+	posted_intr_ipi smp_posted_intr_ipi
 
 apicinterrupt THRESHOLD_APIC_VECTOR \
 	threshold_interrupt smp_threshold_interrupt
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index e4595f1..781d324 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -22,6 +22,9 @@ atomic_t irq_err_count;
 
 /* Function pointer for generic interrupt vector handling */
 void (*x86_platform_ipi_callback)(void) = NULL;
+/* Function pointer for posted interrupt vector handling */
+void (*posted_intr_callback)(void) = NULL;
+EXPORT_SYMBOL_GPL(posted_intr_callback);
 
 /*
  * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -228,6 +231,28 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
 	set_irq_regs(old_regs);
 }
 
+/*
+ * Handler for POSTED_INTR_VECTOR: runs posted_intr_callback if it is set.
+ */
+void smp_posted_intr_ipi(struct pt_regs *regs)
+{
+	struct pt_regs *old_regs = set_irq_regs(regs);
+
+	ack_APIC_irq();
+
+	irq_enter();
+
+	exit_idle();
+
+	if (posted_intr_callback)
+		posted_intr_callback();
+
+	irq_exit();
+
+	set_irq_regs(old_regs);
+}
+
+
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 6e03b0d..d15ca4f 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -205,6 +205,8 @@ static void __init apic_intr_init(void)
 
 	/* IPI for X86 platform specific use */
 	alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
+	/* IPI for posted interrupt use */
+	alloc_intr_gate(POSTED_INTR_VECTOR, posted_intr_ipi);
 
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2109a6a..d660b9d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -350,6 +350,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 	if (!apic->irr_pending)
 		return -1;
 
+	kvm_x86_ops->update_irr(apic->vcpu);
 	result = apic_search_irr(apic);
 	ASSERT(result == -1 || result >= 16);
 
@@ -725,18 +726,25 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (trig_mode) {
 			apic_debug("level trig mode for vector %d", vector);
 			apic_set_vector(vector, apic->regs + APIC_TMR);
-		} else
+		} else {
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
-
+			if (kvm_x86_ops->has_posted_interrupt(vcpu)) {
+				result = 1;
+				apic->irr_pending = true;
+				kvm_x86_ops->send_nv(vcpu, vector);
+				goto out;
+			}
+		}
 		result = !apic_test_and_set_irr(vector, apic);
-		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
-					  trig_mode, vector, !result);
 		if (!result) {
 			if (trig_mode)
 				apic_debug("level trig mode repeatedly for "
 						"vector %d", vector);
 			break;
 		}
+out:
+		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
+					  trig_mode, vector, !result);
 
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
 		kvm_vcpu_kick(vcpu);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 10e3f66..0f8361e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -42,6 +42,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_extint(struct kvm_vcpu *v);
 int kvm_cpu_get_extint(struct kvm_vcpu *v);
 int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
+void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6b6bd03..07dbde6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -31,6 +31,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/slab.h>
 #include <linux/tboot.h>
+#include <linux/interrupt.h>
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
@@ -86,6 +87,8 @@ module_param(fasteoi, bool, S_IRUGO);
 static bool __read_mostly enable_apicv_reg_vid = 1;
 module_param(enable_apicv_reg_vid, bool, S_IRUGO);
 
+static bool __read_mostly enable_apicv_pi = 1;
+module_param(enable_apicv_pi, bool, S_IRUGO);
 /*
  * If nested=1, nested virtualization is supported, i.e., guests may use
  * VMX and be a hypervisor for its own guests. If nested=0, guests may not
@@ -369,6 +372,35 @@ struct nested_vmx {
 	struct page *apic_access_page;
 };
 
+#define POSTED_INTR_ON  0
+/* Posted-Interrupt Descriptor: request bitmap plus a control word (ON bit) */
+struct pi_desc {
+	u32 pir[8];     /* Posted-interrupt requests, one bit per vector */
+	union {
+		struct {
+			u8  on:1,
+			    rsvd:7;
+		} control;
+		u32 rsvd[8];
+	} u;
+} __aligned(64);
+
+static void pi_clear_on(struct pi_desc *pi_desc)
+{
+	clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
+}
+
+static u8 pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+	return test_and_set_bit(POSTED_INTR_ON,
+			(unsigned long *)&pi_desc->u.control);
+}
+
+static void pi_set_pir(int vector, struct pi_desc *pi_desc)
+{
+	set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
@@ -435,6 +467,9 @@ struct vcpu_vmx {
 	u8 eoi_exitmap_changed;
 	u32 eoi_exit_bitmap[8];
 
+	/* Posted interrupt descriptor */
+	struct pi_desc *pi;
+
 	/* Support for a guest hypervisor (nested VMX) */
 	struct nested_vmx nested;
 };
@@ -779,6 +814,11 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
 }
 
+static inline bool cpu_has_vmx_posted_intr(void)
+{
+	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+}
+
 static inline bool cpu_has_vmx_flexpriority(void)
 {
 	return cpu_has_vmx_tpr_shadow() &&
@@ -2475,12 +2515,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	u32 _vmexit_control = 0;
 	u32 _vmentry_control = 0;
 
-	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-	opt = PIN_BASED_VIRTUAL_NMIS;
-	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
-				&_pin_based_exec_control) < 0)
-		return -EIO;
-
 	min = CPU_BASED_HLT_EXITING |
 #ifdef CONFIG_X86_64
 	      CPU_BASED_CR8_LOAD_EXITING |
@@ -2554,6 +2588,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 				&_vmexit_control) < 0)
 		return -EIO;
 
+	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
+				&_pin_based_exec_control) < 0)
+		return -EIO;
+
+	if (!(_cpu_based_2nd_exec_control &
+		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
+		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
+
 	min = 0;
 	opt = VM_ENTRY_LOAD_IA32_PAT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
@@ -2739,6 +2784,9 @@ static __init int hardware_setup(void)
 	if (enable_apicv_reg_vid)
 		kvm_x86_ops->update_cr8_intercept = NULL;
 
+	if (!cpu_has_vmx_posted_intr() || !enable_apicv_reg_vid)
+		enable_apicv_pi = 0;
+
 	if (nested)
 		nested_vmx_setup_ctls_msrs();
 
@@ -3904,6 +3952,77 @@ static void ept_set_mmio_spte_mask(void)
 	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
 }
 
+/* Installed as posted_intr_callback by vmx_init(); does nothing. */
+static void pi_handler(void)
+{
+	;
+}
+
+/* Nonzero when the VM uses an in-kernel irqchip and enable_apicv_pi is set. */
+static int vmx_has_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+	return irqchip_in_kernel(vcpu->kvm) && enable_apicv_pi;
+}
+
+/*
+ * Record @vector in the posted-interrupt request bitmap. If the ON bit was
+ * previously clear and the vcpu is in guest mode, send POSTED_INTR_VECTOR
+ * to the vcpu's CPU.
+ *
+ * Returns 1 if a notification IPI was sent, 0 otherwise.
+ */
+static int vmx_send_nv(struct kvm_vcpu *vcpu,
+		int vector)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	pi_set_pir(vector, vmx->pi);
+	if (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) {
+		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Move pending posted-interrupt requests from the PIR into the vcpu's
+ * APIC IRR. Does nothing when enable_apicv_pi is clear.
+ */
+static void vmx_update_irr(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	unsigned int i, old, new, val, irr_off;
+	u32 pir;
+
+	if (!enable_apicv_pi)
+		return;
+
+	for (i = 0; i <= 7; i++) {
+		/*
+		 * Fetch and clear the word in one atomic step: pi_set_pir()
+		 * may set bits concurrently, and a separate read followed
+		 * by a plain store of 0 would drop any bit set in between.
+		 */
+		pir = xchg(&vmx->pi->pir[i], 0);
+		if (!pir)
+			continue;
+
+		irr_off = APIC_IRR + i * 0x10;
+		do {
+			old = kvm_apic_get_reg(apic, irr_off);
+			new = old | pir;
+			val = cmpxchg((u32 *)(apic->regs + irr_off), old, new);
+		} while (unlikely(val != old));
+	}
+}
+
+static void free_pi(struct vcpu_vmx *vmx)
+{
+	if (enable_apicv_pi)
+		kfree(vmx->pi);
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -3913,6 +4012,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	unsigned long a;
 #endif
 	int i;
+	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
 	u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
 
 	/* I/O */
@@ -3925,8 +4025,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
 	/* Control */
-	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
-		vmcs_config.pin_based_exec_ctrl);
+	if (!enable_apicv_pi)
+		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_exec_ctrl);
 
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
@@ -3944,6 +4046,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmcs_write16(GUEST_INTR_STATUS, 0);
 	}
 
+	if (enable_apicv_pi) {
+		/*
+		 * NOTE(review): GFP_KERNEL may sleep, and kzalloc() is not
+		 * asked for the 64-byte alignment struct pi_desc declares;
+		 * confirm both are acceptable on this path.
+		 */
+		vmx->pi = kzalloc(sizeof(*vmx->pi), GFP_KERNEL);
+		if (!vmx->pi)
+			return -ENOMEM;
+		/* POSTED_INTR_NV is a 16-bit VMCS field (encoding 0x0002). */
+		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
+		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi)));
+	}
+
 	if (ple_gap) {
 		vmcs_write32(PLE_GAP, ple_gap);
 		vmcs_write32(PLE_WINDOW, ple_window);
@@ -6220,6 +6329,8 @@ static void vmx_update_irq(struct kvm_vcpu *vcpu)
 					vmx->eoi_exit_bitmap[index]);
 		vmx->eoi_exitmap_changed = 0;
 	}
+	if (enable_apicv_pi)
+		pi_clear_on(vmx->pi);
 }
 
 static void vmx_update_eoi_exitmap(struct kvm_vcpu *vcpu,
@@ -6626,6 +6737,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 
 	free_vpid(vmx);
 	free_nested(vmx);
+	free_pi(vmx);
 	free_loaded_vmcs(vmx->loaded_vmcs);
 	kfree(vmx->guest_msrs);
 	kvm_vcpu_uninit(vcpu);
@@ -7520,8 +7632,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
 	.has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
+	.has_posted_interrupt = vmx_has_posted_interrupt,
 	.update_irq = vmx_update_irq,
 	.update_eoi_exitmap = vmx_update_eoi_exitmap,
+	.send_nv = vmx_send_nv,
+	.update_irr = vmx_update_irr,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
@@ -7618,7 +7733,7 @@ static int __init vmx_init(void)
 		/* SELF-IPI */
 		vmx_disable_intercept_for_msr_write(0x83f, false);
 	}
-
+	posted_intr_callback = pi_handler;
 	if (enable_ept) {
 		kvm_mmu_set_mask_ptes(0ull,
 			(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]
  Powered by Linux