From: Yang Zhang <yang.z.zhang@xxxxxxxxx> Posted Interrupt allows APIC interrupts to inject into guest directly without any vmexit. - When delivering a interrupt to guest, if target vcpu is running, update Posted-interrupt requests bitmap and send a notification event to the vcpu. Then the vcpu will handle this interrupt automatically, without any software involvemnt. - If target vcpu is not running or there already a notification event pending in the vcpu, do nothing. The interrupt will be handled by next vm entry. Signed-off-by: Yang Zhang <yang.z.zhang@xxxxxxxxx> --- arch/x86/include/asm/entry_arch.h | 1 + arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 4 + arch/x86/include/asm/kvm_host.h | 3 + arch/x86/include/asm/vmx.h | 4 + arch/x86/kernel/entry_64.S | 2 + arch/x86/kernel/irq.c | 25 +++++++ arch/x86/kernel/irqinit.c | 2 + arch/x86/kvm/lapic.c | 16 +++- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/vmx.c | 133 +++++++++++++++++++++++++++++++++--- 12 files changed, 180 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 40afa00..7b0a29e 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -18,6 +18,7 @@ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) #endif BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) +BUILD_INTERRUPT(posted_intr_ipi, POSTED_INTR_VECTOR) /* * every pentium local APIC has two 'local interrupts', with a diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index eb92a6e..ee61af3 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -28,6 +28,7 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); extern void x86_platform_ipi(void); +extern void posted_intr_ipi(void); extern void error_interrupt(void); extern void irq_work_interrupt(void); diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index ba870bb..cff9933 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -30,6 +30,7 @@ extern void irq_force_complete_move(int); #endif extern void (*x86_platform_ipi_callback)(void); +extern void (*posted_intr_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 1508e51..8f2e383 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -102,6 +102,10 @@ */ #define X86_PLATFORM_IPI_VECTOR 0xf7 +#ifdef CONFIG_HAVE_KVM +#define POSTED_INTR_VECTOR 0xf2 +#endif + /* * IRQ work vector: */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7e26d1a..82423a8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -700,6 +700,9 @@ struct kvm_x86_ops { int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu); void (*update_irq)(struct kvm_vcpu *vcpu); void (*update_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector, bool set); + int (*has_posted_interrupt)(struct kvm_vcpu *vcpu); + int (*send_nv)(struct kvm_vcpu *vcpu, int vector); + void (*update_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 1003341..7b9e1d0 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -152,6 +152,7 @@ #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 +#define PIN_BASED_POSTED_INTR 0x00000080 #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 @@ -174,6 +175,7 @@ /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, + POSTED_INTR_NV = 0x00000002, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -208,6 +210,8 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, APIC_ACCESS_ADDR_HIGH = 0x00002015, + POSTED_INTR_DESC_ADDR = 0x00002016, + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, EPT_POINTER = 0x0000201a, EPT_POINTER_HIGH = 0x0000201b, EOI_EXIT_BITMAP0 = 0x0000201c, diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b51b2c7..d06eea1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1160,6 +1160,8 @@ apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi +apicinterrupt POSTED_INTR_VECTOR \ + posted_intr_ipi smp_posted_intr_ipi apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index e4595f1..781d324 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -22,6 +22,9 @@ atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ void (*x86_platform_ipi_callback)(void) = NULL; +/* Function pointer for posted interrupt vector handling */ +void (*posted_intr_callback)(void) = NULL; +EXPORT_SYMBOL_GPL(posted_intr_callback); /* * 'what should we do if we get a hw irq event on an illegal vector'. @@ -228,6 +231,28 @@ void smp_x86_platform_ipi(struct pt_regs *regs) set_irq_regs(old_regs); } +/* + * Handler for POSTED_INTERRUPT_VECTOR. + */ +void smp_posted_intr_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + ack_APIC_irq(); + + irq_enter(); + + exit_idle(); + + if (posted_intr_callback) + posted_intr_callback(); + + irq_exit(); + + set_irq_regs(old_regs); +} + + EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 6e03b0d..d15ca4f 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -205,6 +205,8 @@ static void __init apic_intr_init(void) /* IPI for X86 platform specific use */ alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); + /* IPI for posted interrupt use */ + alloc_intr_gate(POSTED_INTR_VECTOR, posted_intr_ipi); /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 2109a6a..d660b9d 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -350,6 +350,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic) if (!apic->irr_pending) return -1; + kvm_x86_ops->update_irr(apic->vcpu); result = apic_search_irr(apic); ASSERT(result == -1 || result >= 16); @@ -725,18 +726,25 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (trig_mode) { apic_debug("level trig mode for vector %d", vector); apic_set_vector(vector, apic->regs + APIC_TMR); - } else + } else { apic_clear_vector(vector, apic->regs + APIC_TMR); - + if (kvm_x86_ops->has_posted_interrupt(vcpu)) { + result = 1; + apic->irr_pending = true; + kvm_x86_ops->send_nv(vcpu, vector); + goto out; + } + } result = !apic_test_and_set_irr(vector, apic); - trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector, !result); if (!result) { if (trig_mode) apic_debug("level trig mode repeatedly for " "vector %d", vector); break; } +out: + trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector, !result); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 10e3f66..0f8361e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -42,6 +42,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_cpu_get_extint(struct kvm_vcpu *v); int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu); +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir); void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6b6bd03..07dbde6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -31,6 +31,7 @@ #include <linux/ftrace_event.h> #include <linux/slab.h> #include <linux/tboot.h> +#include <linux/interrupt.h> #include "kvm_cache_regs.h" #include "x86.h" @@ -86,6 +87,8 @@ module_param(fasteoi, bool, S_IRUGO); static bool __read_mostly enable_apicv_reg_vid = 1; module_param(enable_apicv_reg_vid, bool, S_IRUGO); +static bool __read_mostly enable_apicv_pi = 1; +module_param(enable_apicv_pi, bool, S_IRUGO); /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -369,6 +372,35 @@ struct nested_vmx { struct page *apic_access_page; }; +#define POSTED_INTR_ON 0 +/* Posted-Interrupt Descriptor */ +struct pi_desc { + u32 pir[8]; /* Posted interrupt requested */ + union { + struct { + u8 on:1, + rsvd:7; + } control; + u32 rsvd[8]; + } u; +} __aligned(64); + +static void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control); +} + +static u8 pi_test_and_set_on(struct pi_desc *pi_desc) +{ + return test_and_set_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->u.control); +} + +static void pi_set_pir(int vector, struct pi_desc *pi_desc) +{ + set_bit(vector, (unsigned long *)pi_desc->pir); +} + struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long host_rsp; @@ -435,6 +467,9 @@ struct vcpu_vmx { u8 eoi_exitmap_changed; u32 eoi_exit_bitmap[8]; + /* Posted interrupt descriptor */ + struct pi_desc *pi; + /* Support for a guest hypervisor (nested VMX) */ struct nested_vmx nested; }; @@ -779,6 +814,11 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; } +static inline bool cpu_has_vmx_posted_intr(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; +} + static inline bool cpu_has_vmx_flexpriority(void) { return cpu_has_vmx_tpr_shadow() && @@ -2475,12 +2515,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmexit_control = 0; u32 _vmentry_control = 0; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS; - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, - &_pin_based_exec_control) < 0) - return -EIO; - min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | @@ -2554,6 +2588,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, + &_pin_based_exec_control) < 0) + return -EIO; + + if (!(_cpu_based_2nd_exec_control & + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; + min = 0; opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2739,6 +2784,9 @@ static __init int hardware_setup(void) if (enable_apicv_reg_vid) kvm_x86_ops->update_cr8_intercept = NULL; + if (!cpu_has_vmx_posted_intr() || !enable_apicv_reg_vid) + enable_apicv_pi = 0; + if (nested) nested_vmx_setup_ctls_msrs(); @@ -3904,6 +3952,57 @@ static void ept_set_mmio_spte_mask(void) kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); } +static void pi_handler(void) +{ + ; +} + +static int vmx_has_posted_interrupt(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) && enable_apicv_pi; +} + +static int vmx_send_nv(struct kvm_vcpu *vcpu, + int vector) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + pi_set_pir(vector, vmx->pi); + if (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) { + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); + return 1; + } + return 0; +} + +static void vmx_update_irr(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_lapic *apic = vcpu->arch.apic; + unsigned int i, old, new, val, irr_off; + + if (!enable_apicv_pi) + return; + + for (i = 0; i <= 7; i++) { + if (vmx->pi->pir[i]) { + irr_off = APIC_IRR + i * 0x10; + do { + old = kvm_apic_get_reg(apic, irr_off); + new = old | vmx->pi->pir[i]; + val = cmpxchg((u32 *)(apic->regs + irr_off), old, new); + } while (unlikely (val != old)); + vmx->pi->pir[i] = 0; + } + } +} + +static void free_pi(struct vcpu_vmx *vmx) +{ + if (enable_apicv_pi) + kfree(vmx->pi); +} + /* * Sets up the vmcs for emulated real mode. */ @@ -3913,6 +4012,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) unsigned long a; #endif int i; + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; /* I/O */ @@ -3925,8 +4025,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, - vmcs_config.pin_based_exec_ctrl); + if (!enable_apicv_pi) + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; + + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_exec_ctrl); vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx)); @@ -3944,6 +4046,13 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(GUEST_INTR_STATUS, 0); } + if (enable_apicv_pi) { + vmx->pi = kmalloc(sizeof(struct pi_desc), + GFP_KERNEL | __GFP_ZERO); + vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR); + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi))); + } + if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); vmcs_write32(PLE_WINDOW, ple_window); @@ -6220,6 +6329,8 @@ static void vmx_update_irq(struct kvm_vcpu *vcpu) vmx->eoi_exit_bitmap[index]); vmx->eoi_exitmap_changed = 0; } + if (enable_apicv_pi) + pi_clear_on(vmx->pi); } static void vmx_update_eoi_exitmap(struct kvm_vcpu *vcpu, @@ -6626,6 +6737,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) free_vpid(vmx); free_nested(vmx); + free_pi(vmx); free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); @@ -7520,8 +7632,11 @@ static struct kvm_x86_ops vmx_x86_ops = { .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, .has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery, + .has_posted_interrupt = vmx_has_posted_interrupt, .update_irq = vmx_update_irq, .update_eoi_exitmap = vmx_update_eoi_exitmap, + .send_nv = vmx_send_nv, + .update_irr = vmx_update_irr, .set_tss_addr = vmx_set_tss_addr, .get_tdp_level = get_ept_level, @@ -7618,7 +7733,7 @@ static int __init vmx_init(void) /* SELF-IPI */ vmx_disable_intercept_for_msr_write(0x83f, false); } - + posted_intr_callback = pi_handler; if (enable_ept) { kvm_mmu_set_mask_ptes(0ull, (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html