Gleb Natapov wrote on 2012-11-25: > On Wed, Nov 21, 2012 at 04:09:39PM +0800, Yang Zhang wrote: >> Posted Interrupt allows vAPICV interrupts to inject into guest directly >> without any vmexit. >> >> - When delivering a interrupt to guest, if target vcpu is running, >> update Posted-interrupt requests bitmap and send a notification event >> to the vcpu. Then the vcpu will handle this interrupt automatically, >> without any software involvemnt. > Looks like you allocating one irq vector per vcpu per pcpu and then > migrate it or reallocate when vcpu move from one pcpu to another. > This is not scalable and migrating irq migration slows things down. > What's wrong with allocating one global vector for posted interrupt > during vmx initialization and use it for all vcpus? Consider the following situation: if vcpu A is running when a notification event that belongs to vcpu B arrives, then since the vector matches vcpu A's notification vector, this event will be consumed by vcpu A (even though it does nothing) and the interrupt cannot be handled in time. >> - If target vcpu is not running or there already a notification event >> pending in the vcpu, do nothing. The interrupt will be handled by old >> way. 
>> Signed-off-by: Yang Zhang <yang.z.zhang@xxxxxxxxx> >> --- >> arch/x86/include/asm/kvm_host.h | 3 + arch/x86/include/asm/vmx.h >> | 4 + arch/x86/kernel/apic/io_apic.c | 138 >> ++++++++++++++++++++++++++++ arch/x86/kvm/lapic.c | 31 >> ++++++- arch/x86/kvm/lapic.h | 8 ++ arch/x86/kvm/vmx.c >> | 192 +++++++++++++++++++++++++++++++++++++-- >> arch/x86/kvm/x86.c | 2 + include/linux/kvm_host.h >> | 1 + virt/kvm/kvm_main.c | 2 + 9 files changed, >> 372 insertions(+), 9 deletions(-) >> diff --git a/arch/x86/include/asm/kvm_host.h >> b/arch/x86/include/asm/kvm_host.h index 8e07a86..1145894 100644 --- >> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h >> @@ -683,9 +683,12 @@ struct kvm_x86_ops { >> void (*enable_irq_window)(struct kvm_vcpu *vcpu); void >> (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); int >> (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu); + int >> (*has_posted_interrupt)(struct kvm_vcpu *vcpu); void >> (*update_irq)(struct kvm_vcpu *vcpu); void (*set_eoi_exitmap)(struct >> kvm_vcpu *vcpu, int vector, int need_eoi, int global); >> + int (*send_nv)(struct kvm_vcpu *vcpu, int vector); >> + void (*pi_migrate)(struct kvm_vcpu *vcpu); >> int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); >> int (*get_tdp_level)(void); >> u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); >> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h >> index 1003341..7b9e1d0 100644 >> --- a/arch/x86/include/asm/vmx.h >> +++ b/arch/x86/include/asm/vmx.h >> @@ -152,6 +152,7 @@ >> #define PIN_BASED_EXT_INTR_MASK 0x00000001 >> #define PIN_BASED_NMI_EXITING 0x00000008 >> #define PIN_BASED_VIRTUAL_NMIS 0x00000020 >> +#define PIN_BASED_POSTED_INTR 0x00000080 >> >> #define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 #define >> VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 @@ -174,6 +175,7 @@ >> /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID >> = 0x00000000, + POSTED_INTR_NV = 0x00000002, >> 
GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR >> = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, >> @@ -208,6 +210,8 @@ enum vmcs_field { VIRTUAL_APIC_PAGE_ADDR_HIGH >> = 0x00002013, APIC_ACCESS_ADDR = 0x00002014, >> APIC_ACCESS_ADDR_HIGH = 0x00002015, >> + POSTED_INTR_DESC_ADDR = 0x00002016, >> + POSTED_INTR_DESC_ADDR_HIGH = 0x00002017, >> EPT_POINTER = 0x0000201a, >> EPT_POINTER_HIGH = 0x0000201b, >> EOI_EXIT_BITMAP0 = 0x0000201c, >> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c >> index 1817fa9..97cb8ee 100644 >> --- a/arch/x86/kernel/apic/io_apic.c >> +++ b/arch/x86/kernel/apic/io_apic.c >> @@ -3277,6 +3277,144 @@ int arch_setup_dmar_msi(unsigned int irq) >> } >> #endif >> +static int >> +pi_set_affinity(struct irq_data *data, const struct cpumask *mask, >> + bool force) >> +{ >> + unsigned int dest; >> + struct irq_cfg *cfg = (struct irq_cfg *)data->chip_data; >> + if (cpumask_equal(cfg->domain, mask)) >> + return IRQ_SET_MASK_OK; >> + >> + if (__ioapic_set_affinity(data, mask, &dest)) >> + return -1; >> + >> + return IRQ_SET_MASK_OK; >> +} >> + >> +static void pi_mask(struct irq_data *data) >> +{ >> + ; >> +} >> + >> +static void pi_unmask(struct irq_data *data) >> +{ >> + ; >> +} >> + >> +static struct irq_chip pi_chip = { >> + .name = "POSTED-INTR", >> + .irq_ack = ack_apic_edge, >> + .irq_unmask = pi_unmask, >> + .irq_mask = pi_mask, >> + .irq_set_affinity = pi_set_affinity, >> +}; >> + >> +int arch_pi_migrate(int irq, int cpu) >> +{ >> + struct irq_data *data = irq_get_irq_data(irq); >> + struct irq_cfg *cfg; >> + struct irq_desc *desc = irq_to_desc(irq); >> + unsigned long flags; >> + >> + if (!desc) >> + return -EINVAL; >> + >> + cfg = irq_cfg(irq); >> + if (cpumask_equal(cfg->domain, cpumask_of(cpu))) >> + return cfg->vector; >> + >> + irq_set_affinity(irq, cpumask_of(cpu)); >> + raw_spin_lock_irqsave(&desc->lock, flags); >> + irq_move_irq(data); >> + raw_spin_unlock_irqrestore(&desc->lock, flags); >> + >> + if 
(cfg->move_in_progress) >> + send_cleanup_vector(cfg); >> + return cfg->vector; >> +} >> +EXPORT_SYMBOL_GPL(arch_pi_migrate); >> + >> +static int arch_pi_create_irq(const struct cpumask *mask) >> +{ >> + int node = cpu_to_node(0); >> + unsigned int irq_want; >> + struct irq_cfg *cfg; >> + unsigned long flags; >> + unsigned int ret = 0; >> + int irq; >> + >> + irq_want = nr_irqs_gsi; >> + >> + irq = alloc_irq_from(irq_want, node); >> + if (irq < 0) >> + return 0; >> + cfg = alloc_irq_cfg(irq_want, node); > s/irq_want/irq. > >> + if (!cfg) { >> + free_irq_at(irq, NULL); >> + return 0; >> + } >> + >> + raw_spin_lock_irqsave(&vector_lock, flags); >> + if (!__assign_irq_vector(irq, cfg, mask)) >> + ret = irq; >> + raw_spin_unlock_irqrestore(&vector_lock, flags); >> + >> + if (ret) { >> + irq_set_chip_data(irq, cfg); >> + irq_clear_status_flags(irq, IRQ_NOREQUEST); >> + } else { >> + free_irq_at(irq, cfg); >> + } >> + return ret; >> +} > > This function is mostly cut&paste of create_irq_nr(). Yes, but this function allows allocating a vector from a specified cpu. 
>> + >> +int arch_pi_alloc_irq(void *vmx) >> +{ >> + int irq, cpu = smp_processor_id(); >> + struct irq_cfg *cfg; >> + >> + irq = arch_pi_create_irq(cpumask_of(cpu)); >> + if (!irq) { >> + pr_err("Posted Interrupt: no free irq\n"); >> + return -EINVAL; >> + } >> + irq_set_handler_data(irq, vmx); >> + irq_set_chip_and_handler_name(irq, &pi_chip, handle_edge_irq, "edge"); >> + irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); >> + irq_set_affinity(irq, cpumask_of(cpu)); >> + >> + cfg = irq_cfg(irq); >> + if (cfg->move_in_progress) >> + send_cleanup_vector(cfg); >> + >> + return irq; >> +} >> +EXPORT_SYMBOL_GPL(arch_pi_alloc_irq); >> + >> +void arch_pi_free_irq(unsigned int irq, void *vmx) >> +{ >> + if (irq) { >> + irq_set_handler_data(irq, NULL); >> + /* This will mask the irq */ >> + free_irq(irq, vmx); >> + destroy_irq(irq); >> + } >> +} >> +EXPORT_SYMBOL_GPL(arch_pi_free_irq); >> + >> +int arch_pi_get_vector(unsigned int irq) >> +{ >> + struct irq_cfg *cfg; >> + >> + if (!irq) >> + return -EINVAL; >> + >> + cfg = irq_cfg(irq); >> + return cfg->vector; >> +} >> +EXPORT_SYMBOL_GPL(arch_pi_get_vector); >> + >> #ifdef CONFIG_HPET_TIMER >> >> static int hpet_msi_set_affinity(struct irq_data *data, >> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c >> index af48361..04220de 100644 >> --- a/arch/x86/kvm/lapic.c >> +++ b/arch/x86/kvm/lapic.c >> @@ -656,7 +656,7 @@ void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int > vector, >> static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, >> int vector, int level, int trig_mode) >> { >> - int result = 0; >> + int result = 0, send; >> struct kvm_vcpu *vcpu = apic->vcpu; >> >> switch (delivery_mode) { >> @@ -674,6 +674,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int > delivery_mode, >> } else { >> apic_clear_vector(vector, apic->regs + APIC_TMR); >> kvm_set_eoi_exitmap(vcpu, vector, 0, 0); >> + if (kvm_apic_pi_enabled(vcpu)) { > Provide send_nv() that returns 0 if pi is disabled. 
> >> + send = kvm_x86_ops->send_nv(vcpu, vector); >> + if (send) { > No need "send" variable here. ok. >> + result = 1; >> + break; >> + } >> + } >> } >> >> result = !apic_test_and_set_irr(vector, apic); >> @@ -1541,6 +1548,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) >> >> if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu)) >> apic->vid_enabled = true; >> + >> + if (kvm_x86_ops->has_posted_interrupt(vcpu)) >> + apic->pi_enabled = true; >> + > This is global state, no need per apic variable. > >> return 0; >> nomem_free_apic: >> kfree(apic); >> @@ -1575,6 +1586,24 @@ int kvm_apic_get_highest_irr(struct kvm_vcpu > *vcpu) >> } >> EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr); >> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir) >> +{ >> + struct kvm_lapic *apic = vcpu->arch.apic; >> + unsigned int *reg; >> + unsigned int i; >> + >> + if (!apic || !apic_enabled(apic)) > Use kvm_vcpu_has_lapic() instead of !apic. ok. >> + return; >> + >> + for (i = 0; i <= 7; i++) { >> + reg = apic->regs + APIC_IRR + i * 0x10; >> + *reg |= pir[i]; > Non atomic access to IRR. Other threads may set bit there concurrently. Ok. >> + pir[i] = 0; >> + } > Should set apic->irr_pending to true when setting irr bit. Right. Will add it in next version. >> + return; >> +} >> +EXPORT_SYMBOL_GPL(kvm_apic_update_irr); >> + >> int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu) >> { >> u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0); >> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h >> index 2503a64..ad35868 100644 >> --- a/arch/x86/kvm/lapic.h >> +++ b/arch/x86/kvm/lapic.h >> @@ -21,6 +21,7 @@ struct kvm_lapic { >> struct kvm_vcpu *vcpu; bool irr_pending; bool vid_enabled; + bool >> pi_enabled; /* Number of bits set in ISR. */ s16 isr_count; /* The >> highest vector set in ISR; if -1 - invalid, must scan ISR. 
*/ @@ -43,6 >> +44,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); int >> kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_cpu_get_extint(struct >> kvm_vcpu *v); int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu); >> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir); >> void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64 >> kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void >> kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); >> @@ -94,6 +96,12 @@ static inline bool kvm_apic_vid_enabled(struct kvm_vcpu > *vcpu) >> return apic->vid_enabled; >> } >> +static inline bool kvm_apic_pi_enabled(struct kvm_vcpu *vcpu) >> +{ >> + struct kvm_lapic *apic = vcpu->arch.apic; >> + return apic->pi_enabled; >> +} >> + >> int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); >> void kvm_lapic_init(void); >> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >> index f6ef090..6448b96 100644 >> --- a/arch/x86/kvm/vmx.c >> +++ b/arch/x86/kvm/vmx.c >> @@ -31,6 +31,7 @@ >> #include <linux/ftrace_event.h> #include <linux/slab.h> #include >> <linux/tboot.h> +#include <linux/interrupt.h> #include >> "kvm_cache_regs.h" #include "x86.h" >> @@ -89,6 +90,8 @@ module_param(enable_apicv_reg, bool, S_IRUGO); >> static bool __read_mostly enable_apicv_vid = 0; >> module_param(enable_apicv_vid, bool, S_IRUGO); >> +static bool __read_mostly enable_apicv_pi = 0; >> +module_param(enable_apicv_pi, bool, S_IRUGO); >> /* >> * If nested=1, nested virtualization is supported, i.e., guests may use >> * VMX and be a hypervisor for its own guests. 
If nested=0, guests may not >> @@ -372,6 +375,44 @@ struct nested_vmx { >> struct page *apic_access_page; >> }; >> +/* Posted-Interrupt Descriptor */ >> +struct pi_desc { >> + u32 pir[8]; /* Posted interrupt requested */ >> + union { >> + struct { >> + u8 on:1, >> + rsvd:7; >> + } control; >> + u32 rsvd[8]; >> + } u; >> +} __aligned(64); >> + >> +#define POSTED_INTR_ON 0 >> +u8 pi_test_on(struct pi_desc *pi_desc) >> +{ >> + return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control); >> +} >> +void pi_set_on(struct pi_desc *pi_desc) >> +{ >> + set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control); >> +} >> + >> +void pi_clear_on(struct pi_desc *pi_desc) >> +{ >> + clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control); >> +} >> + >> +u8 pi_test_and_set_on(struct pi_desc *pi_desc) >> +{ >> + return test_and_set_bit(POSTED_INTR_ON, >> + (unsigned long *)&pi_desc->u.control); >> +} >> + >> +void pi_set_pir(int vector, struct pi_desc *pi_desc) >> +{ >> + set_bit(vector, (unsigned long *)pi_desc->pir); >> +} >> + >> struct vcpu_vmx { struct kvm_vcpu vcpu; unsigned long >> host_rsp; @@ -439,6 +480,11 @@ struct vcpu_vmx { u64 >> eoi_exit_bitmap[4]; u64 eoi_exit_bitmap_global[4]; >> + /* Posted interrupt descriptor */ >> + struct pi_desc *pi; >> + u32 irq; >> + u32 vector; >> + >> /* Support for a guest hypervisor (nested VMX) */ >> struct nested_vmx nested; >> }; >> @@ -698,6 +744,11 @@ static u64 host_efer; >> >> static void ept_save_pdptrs(struct kvm_vcpu *vcpu); >> +int arch_pi_get_vector(unsigned int irq); >> +int arch_pi_alloc_irq(struct vcpu_vmx *vmx); >> +void arch_pi_free_irq(unsigned int irq, struct vcpu_vmx *vmx); >> +int arch_pi_migrate(int irq, int cpu); >> + >> /* >> * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it >> * away by decrementing the array size. 
>> @@ -783,6 +834,11 @@ static inline bool > cpu_has_vmx_virtual_intr_delivery(void) >> SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY; >> } >> +static inline bool cpu_has_vmx_posted_intr(void) >> +{ >> + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR; >> +} >> + >> static inline bool cpu_has_vmx_flexpriority(void) >> { >> return cpu_has_vmx_tpr_shadow() && >> @@ -1555,6 +1611,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, > int cpu) >> struct desc_ptr *gdt = &__get_cpu_var(host_gdt); >> unsigned long sysenter_esp; >> + if (enable_apicv_pi && to_vmx(vcpu)->pi) >> + pi_set_on(to_vmx(vcpu)->pi); >> + > Why? This point means the vcpu is starting migration, so we should suppress the notification event until the migration ends. >> + kvm_make_request(KVM_REQ_POSTED_INTR, vcpu); >> + >> kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); local_irq_disable(); >> list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, @@ -1582,6 >> +1643,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) vcpu->cpu >> = -1; kvm_cpu_vmxoff(); } >> + if (enable_apicv_pi && to_vmx(vcpu)->pi) >> + pi_set_on(to_vmx(vcpu)->pi); > Why? When the vcpu is scheduled out, there is no need to send a notification event to it; just setting the PIR and waking it up is enough. 
>> } >> >> static void vmx_fpu_activate(struct kvm_vcpu *vcpu) >> @@ -2451,12 -2514,6 @@ static __init int setup_vmcs_config(struct > vmcs_config *vmcs_conf) >> u32 _vmexit_control = 0; >> u32 _vmentry_control = 0; >> - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; >> - opt = PIN_BASED_VIRTUAL_NMIS; >> - if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, >> - &_pin_based_exec_control) < 0) >> - return -EIO; >> - >> min = CPU_BASED_HLT_EXITING | >> #ifdef CONFIG_X86_64 >> CPU_BASED_CR8_LOAD_EXITING | >> @@ -2531,6 +2588,17 @@ static __init int setup_vmcs_config(struct > vmcs_config *vmcs_conf) >> &_vmexit_control) < 0) >> return -EIO; >> + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; >> + opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR; >> + if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, >> + &_pin_based_exec_control) < 0) >> + return -EIO; >> + >> + if (!(_cpu_based_2nd_exec_control & >> + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) || >> + !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT)) >> + _pin_based_exec_control &= ~PIN_BASED_POSTED_INTR; >> + >> min = 0; opt = VM_ENTRY_LOAD_IA32_PAT; if (adjust_vmx_controls(min, >> opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2715,6 +2783,9 @@ static __init int >> hardware_setup(void) if (!cpu_has_vmx_virtual_intr_delivery()) >> enable_apicv_vid = 0; >> + if (!cpu_has_vmx_posted_intr() || !x2apic_enabled()) > In nested guest x2apic may be enabled without irq remapping. Check for > irq remapping here. Posted interrupts are not available in the nested case, so we don't need to check IR here. 
> >> + enable_apicv_pi = 0; >> + >> if (nested) >> nested_vmx_setup_ctls_msrs(); >> @@ -3881,6 +3952,93 @@ static void ept_set_mmio_spte_mask(void) >> kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull); >> } >> +irqreturn_t pi_handler(int irq, void *data) >> +{ >> + struct vcpu_vmx *vmx = data; >> + >> + kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); >> + kvm_vcpu_kick(&vmx->vcpu); >> + >> + return IRQ_HANDLED; >> +} >> + >> +static int vmx_has_posted_interrupt(struct kvm_vcpu *vcpu) >> +{ >> + return irqchip_in_kernel(vcpu->kvm) && enable_apicv_pi; >> +} >> + >> +static void vmx_pi_migrate(struct kvm_vcpu *vcpu) >> +{ >> + int ret = 0; >> + struct vcpu_vmx *vmx = to_vmx(vcpu); >> + >> + if (!enable_apicv_pi) >> + return ; >> + >> + preempt_disable(); >> + local_irq_disable(); >> + if (!vmx->irq) { >> + ret = arch_pi_alloc_irq(vmx); >> + if (ret < 0) { >> + vmx->irq = -1; >> + goto out; >> + } >> + vmx->irq = ret; >> + >> + ret = request_irq(vmx->irq, pi_handler, IRQF_NO_THREAD, >> + "Posted Interrupt", vmx); >> + if (ret) { >> + vmx->irq = -1; >> + goto out; >> + } >> + >> + ret = arch_pi_get_vector(vmx->irq); >> + } else >> + ret = arch_pi_migrate(vmx->irq, smp_processor_id()); >> + >> + if (ret < 0) { >> + vmx->irq = -1; >> + goto out; >> + } else { >> + vmx->vector = ret; >> + vmcs_write16(POSTED_INTR_NV, vmx->vector); >> + pi_clear_on(vmx->pi); >> + } >> +out: >> + local_irq_enable(); >> + preempt_enable(); >> + return ; >> +} >> + >> +static int vmx_send_nv(struct kvm_vcpu *vcpu, >> + int vector) >> +{ >> + struct vcpu_vmx *vmx = to_vmx(vcpu); >> + >> + if (unlikely(vmx->irq == -1)) >> + return 0; >> + >> + if (vcpu->cpu == smp_processor_id()) { >> + pi_set_on(vmx->pi); > Why? You clear this bit anyway in vmx_update_irq() during guest entry. This means the target vcpu is already in vmx non-root mode. It will consume the interrupt on the next vm entry, so we don't need to send the notification event from another cpu; just updating the PIR is enough. 
>> + return 0; + } + + pi_set_pir(vector, vmx->pi); + if >> (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) { >> + apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), vmx->vector); + return >> 1; + } + return 0; +} + +static void free_pi(struct vcpu_vmx *vmx) +{ >> + if (enable_apicv_pi) { + kfree(vmx->pi); >> + arch_pi_free_irq(vmx->irq, vmx); + } +} + >> /* >> * Sets up the vmcs for emulated real mode. >> */ >> @@ -3890,6 +4048,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) >> unsigned long a; >> #endif >> int i; >> + u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; >> >> /* I/O */ vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); @@ >> -3901,8 +4060,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) >> vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ >> >> /* Control */ >> - vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, >> - vmcs_config.pin_based_exec_ctrl); >> + if (!enable_apicv_pi) >> + pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR; >> + >> + vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_exec_ctrl); >> >> vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, > vmx_exec_control(vmx)); >> >> @@ -3920,6 +4081,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) >> vmcs_write16(GUEST_INTR_STATUS, 0); >> } >> + if (enable_apicv_pi) { >> + vmx->pi = kmalloc(sizeof(struct pi_desc), >> + GFP_KERNEL | __GFP_ZERO); >> + vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi))); >> + } >> + >> if (ple_gap) { vmcs_write32(PLE_GAP, ple_gap); >> vmcs_write32(PLE_WINDOW, ple_window); @@ -6161,6 +6328,11 @@ static >> void vmx_update_irq(struct kvm_vcpu *vcpu) if (!enable_apicv_vid) >> return ; >> + if (enable_apicv_pi) { >> + kvm_apic_update_irr(vcpu, (unsigned int *)vmx->pi->pir); >> + pi_clear_on(vmx->pi); > Why do you do that? Isn't VMX process posted interrupts on vmentry if "on" bit > is set? 
> >> + } >> + >> vector = kvm_apic_get_highest_irr(vcpu); >> if (vector == -1) >> return; >> @@ -6586,6 +6758,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) >> >> free_vpid(vmx); free_nested(vmx); + free_pi(vmx); >> free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); >> kvm_vcpu_uninit(vcpu); @@ -7483,8 +7656,11 @@ static struct >> kvm_x86_ops vmx_x86_ops = { .enable_irq_window = enable_irq_window, >> .update_cr8_intercept = update_cr8_intercept, >> .has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery, >> + .has_posted_interrupt = vmx_has_posted_interrupt, .update_irq = >> vmx_update_irq, .set_eoi_exitmap = vmx_set_eoi_exitmap, >> + .send_nv = vmx_send_nv, >> + .pi_migrate = vmx_pi_migrate, >> >> .set_tss_addr = vmx_set_tss_addr, >> .get_tdp_level = get_ept_level, >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c >> index 8b8de3b..f035267 100644 >> --- a/arch/x86/kvm/x86.c >> +++ b/arch/x86/kvm/x86.c >> @@ -5250,6 +5250,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) >> bool req_immediate_exit = 0; >> >> if (vcpu->requests) { >> + if (kvm_check_request(KVM_REQ_POSTED_INTR, vcpu)) >> + kvm_x86_ops->pi_migrate(vcpu); >> if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) >> kvm_mmu_unload(vcpu); >> if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) >> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h >> index ecc5543..f8d8d34 100644 >> --- a/include/linux/kvm_host.h >> +++ b/include/linux/kvm_host.h >> @@ -107,6 +107,7 @@ static inline bool is_error_page(struct page *page) >> #define KVM_REQ_IMMEDIATE_EXIT 15 >> #define KVM_REQ_PMU 16 >> #define KVM_REQ_PMI 17 >> +#define KVM_REQ_POSTED_INTR 18 >> >> #define KVM_USERSPACE_IRQ_SOURCE_ID 0 >> #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 >> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c >> index be70035..05baf1c 100644 >> --- a/virt/kvm/kvm_main.c >> +++ b/virt/kvm/kvm_main.c >> @@ -1625,6 +1625,8 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) >> 
smp_send_reschedule(cpu); >> put_cpu(); >> } >> +EXPORT_SYMBOL_GPL(kvm_vcpu_kick); >> + >> #endif /* !CONFIG_S390 */ >> >> void kvm_resched(struct kvm_vcpu *vcpu) >> -- >> 1.7.1 > > -- > Gleb. > -- > To unsubscribe from this list: send the line "unsubscribe kvm" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html Best regards, Yang -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html