RE: [PATCH v2 6/6] x86, apicv: Add Posted Interrupt supporting

"Zhang, Yang Z" <yang.z.zhang@xxxxxxxxx> · Mon, 26 Nov 2012 03:51:04 +0000

Gleb Natapov wrote on 2012-11-25:
> On Wed, Nov 21, 2012 at 04:09:39PM +0800, Yang Zhang wrote:
>> Posted Interrupt allows vAPICV interrupts to inject into guest directly
>> without any vmexit.
>> 
>> - When delivering a interrupt to guest, if target vcpu is running,
>>   update Posted-interrupt requests bitmap and send a notification event
>>   to the vcpu. Then the vcpu will handle this interrupt automatically,
>>   without any software involvemnt.
> Looks like you allocating one irq vector per vcpu per pcpu and then
> migrate it or reallocate when vcpu move from one pcpu to another.
> This is not scalable and migrating irq migration slows things down.
> What's wrong with allocating one global vector for posted interrupt
> during vmx initialization and use it for all vcpus?

Consider the following situation: 
If vcpu A is running when notification event which belong to vcpu B is arrived, since the vector match the vcpu A's notification vector, then this event will be consumed by vcpu A(even it do nothing) and the interrupt cannot be handled in time.

>> - If target vcpu is not running or there already a notification event
>>   pending in the vcpu, do nothing. The interrupt will be handled by old
>>   way.
>> Signed-off-by: Yang Zhang <yang.z.zhang@xxxxxxxxx>
>> ---
>>  arch/x86/include/asm/kvm_host.h |    3 + arch/x86/include/asm/vmx.h   
>>    |    4 + arch/x86/kernel/apic/io_apic.c  |  138
>>  ++++++++++++++++++++++++++++ arch/x86/kvm/lapic.c            |   31
>>  ++++++- arch/x86/kvm/lapic.h            |    8 ++ arch/x86/kvm/vmx.c  
>>             |  192 +++++++++++++++++++++++++++++++++++++--
>>  arch/x86/kvm/x86.c              |    2 + include/linux/kvm_host.h     
>>    |    1 + virt/kvm/kvm_main.c             |    2 + 9 files changed,
>>  372 insertions(+), 9 deletions(-)
>> diff --git a/arch/x86/include/asm/kvm_host.h
>> b/arch/x86/include/asm/kvm_host.h index 8e07a86..1145894 100644 ---
>> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -683,9 +683,12 @@ struct kvm_x86_ops {
>>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu); 	void
>>  (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 	int
>>  (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu); +	int
>>  (*has_posted_interrupt)(struct kvm_vcpu *vcpu); 	void
>>  (*update_irq)(struct kvm_vcpu *vcpu); 	void (*set_eoi_exitmap)(struct
>>  kvm_vcpu *vcpu, int vector, 			int need_eoi, int global);
>> +	int (*send_nv)(struct kvm_vcpu *vcpu, int vector);
>> +	void (*pi_migrate)(struct kvm_vcpu *vcpu);
>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>>  	int (*get_tdp_level)(void);
>>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index 1003341..7b9e1d0 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -152,6 +152,7 @@
>>  #define PIN_BASED_EXT_INTR_MASK                 0x00000001
>>  #define PIN_BASED_NMI_EXITING                   0x00000008
>>  #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
>> +#define PIN_BASED_POSTED_INTR                   0x00000080
>> 
>>  #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002 #define
>>  VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200 @@ -174,6 +175,7 @@
>>  /* VMCS Encodings */ enum vmcs_field { 	VIRTUAL_PROCESSOR_ID          
>>   = 0x00000000, +	POSTED_INTR_NV                  = 0x00000002,
>>  	GUEST_ES_SELECTOR               = 0x00000800, 	GUEST_CS_SELECTOR     
>>           = 0x00000802, 	GUEST_SS_SELECTOR               = 0x00000804,
>>  @@ -208,6 +210,8 @@ enum vmcs_field { 	VIRTUAL_APIC_PAGE_ADDR_HIGH    
>>  = 0x00002013, 	APIC_ACCESS_ADDR		= 0x00002014,
>>  	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
>> +	POSTED_INTR_DESC_ADDR           = 0x00002016,
>> +	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
>>  	EPT_POINTER                     = 0x0000201a,
>>  	EPT_POINTER_HIGH                = 0x0000201b,
>>  	EOI_EXIT_BITMAP0                = 0x0000201c,
>> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
>> index 1817fa9..97cb8ee 100644
>> --- a/arch/x86/kernel/apic/io_apic.c
>> +++ b/arch/x86/kernel/apic/io_apic.c
>> @@ -3277,6 +3277,144 @@ int arch_setup_dmar_msi(unsigned int irq)
>>  }
>>  #endif
>> +static int
>> +pi_set_affinity(struct irq_data *data, const struct cpumask *mask,
>> +		      bool force)
>> +{
>> +	unsigned int dest;
>> +	struct irq_cfg *cfg = (struct irq_cfg *)data->chip_data;
>> +	if (cpumask_equal(cfg->domain, mask))
>> +		return IRQ_SET_MASK_OK;
>> +
>> +	if (__ioapic_set_affinity(data, mask, &dest))
>> +		return -1;
>> +
>> +	return IRQ_SET_MASK_OK;
>> +}
>> +
>> +static void pi_mask(struct irq_data *data)
>> +{
>> +	;
>> +}
>> +
>> +static void pi_unmask(struct irq_data *data)
>> +{
>> +	;
>> +}
>> +
>> +static struct irq_chip pi_chip = {
>> +	.name       = "POSTED-INTR",
>> +	.irq_ack    = ack_apic_edge,
>> +	.irq_unmask = pi_unmask,
>> +	.irq_mask   = pi_mask,
>> +	.irq_set_affinity   = pi_set_affinity,
>> +};
>> +
>> +int arch_pi_migrate(int irq, int cpu)
>> +{
>> +	struct irq_data *data = irq_get_irq_data(irq);
>> +	struct irq_cfg *cfg;
>> +	struct irq_desc *desc = irq_to_desc(irq);
>> +	unsigned long flags;
>> +
>> +	if (!desc)
>> +		return -EINVAL;
>> +
>> +	cfg = irq_cfg(irq);
>> +	if (cpumask_equal(cfg->domain, cpumask_of(cpu)))
>> +		return cfg->vector;
>> +
>> +	irq_set_affinity(irq, cpumask_of(cpu));
>> +	raw_spin_lock_irqsave(&desc->lock, flags);
>> +	irq_move_irq(data);
>> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
>> +
>> +	if (cfg->move_in_progress)
>> +		send_cleanup_vector(cfg);
>> +	return cfg->vector;
>> +}
>> +EXPORT_SYMBOL_GPL(arch_pi_migrate);
>> +
>> +static int arch_pi_create_irq(const struct cpumask *mask)
>> +{
>> +	int node = cpu_to_node(0);
>> +	unsigned int irq_want;
>> +	struct irq_cfg *cfg;
>> +	unsigned long flags;
>> +	unsigned int ret = 0;
>> +	int irq;
>> +
>> +	irq_want = nr_irqs_gsi;
>> +
>> +	irq = alloc_irq_from(irq_want, node);
>> +	if (irq < 0)
>> +		return 0;
>> +	cfg = alloc_irq_cfg(irq_want, node);
> s/irq_want/irq.
> 
>> +	if (!cfg) {
>> +		free_irq_at(irq, NULL);
>> +		return 0;
>> +	}
>> +
>> +	raw_spin_lock_irqsave(&vector_lock, flags);
>> +	if (!__assign_irq_vector(irq, cfg, mask))
>> +		ret = irq;
>> +	raw_spin_unlock_irqrestore(&vector_lock, flags);
>> +
>> +	if (ret) {
>> +		irq_set_chip_data(irq, cfg);
>> +		irq_clear_status_flags(irq, IRQ_NOREQUEST);
>> +	} else {
>> +		free_irq_at(irq, cfg);
>> +	}
>> +	return ret;
>> +}
> 
> This function is mostly cut&paste of create_irq_nr().

Yes, this function allow to allocate vector from specified cpu.

>> +
>> +int arch_pi_alloc_irq(void *vmx)
>> +{
>> +	int irq, cpu = smp_processor_id();
>> +	struct irq_cfg *cfg;
>> +
>> +	irq = arch_pi_create_irq(cpumask_of(cpu));
>> +	if (!irq) {
>> +		pr_err("Posted Interrupt: no free irq\n");
>> +		return -EINVAL;
>> +	}
>> +	irq_set_handler_data(irq, vmx);
>> +	irq_set_chip_and_handler_name(irq, &pi_chip, handle_edge_irq, "edge");
>> +	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
>> +	irq_set_affinity(irq, cpumask_of(cpu));
>> +
>> +	cfg = irq_cfg(irq);
>> +	if (cfg->move_in_progress)
>> +		send_cleanup_vector(cfg);
>> +
>> +	return irq;
>> +}
>> +EXPORT_SYMBOL_GPL(arch_pi_alloc_irq);
>> +
>> +void arch_pi_free_irq(unsigned int irq, void *vmx)
>> +{
>> +	if (irq) {
>> +		irq_set_handler_data(irq, NULL);
>> +		/* This will mask the irq */
>> +		free_irq(irq, vmx);
>> +		destroy_irq(irq);
>> +	}
>> +}
>> +EXPORT_SYMBOL_GPL(arch_pi_free_irq);
>> +
>> +int arch_pi_get_vector(unsigned int irq)
>> +{
>> +	struct irq_cfg *cfg;
>> +
>> +	if (!irq)
>> +		return -EINVAL;
>> +
>> +	cfg = irq_cfg(irq);
>> +	return cfg->vector;
>> +}
>> +EXPORT_SYMBOL_GPL(arch_pi_get_vector);
>> +
>>  #ifdef CONFIG_HPET_TIMER
>>  
>>  static int hpet_msi_set_affinity(struct irq_data *data,
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index af48361..04220de 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -656,7 +656,7 @@ void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int
> vector,
>>  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>>  			     int vector, int level, int trig_mode)
>>  {
>> -	int result = 0;
>> +	int result = 0, send;
>>  	struct kvm_vcpu *vcpu = apic->vcpu;
>>  
>>  	switch (delivery_mode) {
>> @@ -674,6 +674,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int
> delivery_mode,
>>  		} else {
>>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
>>  			kvm_set_eoi_exitmap(vcpu, vector, 0, 0);
>> +			if (kvm_apic_pi_enabled(vcpu)) {
> Provide send_nv() that returns 0 if pi is disabled.
> 
>> +				send = kvm_x86_ops->send_nv(vcpu, vector);
>> +				if (send) {
> No need "send" variable here.

ok.

>> +					result = 1;
>> +					break;
>> +				}
>> +			}
>>  		}
>>  
>>  		result = !apic_test_and_set_irr(vector, apic);
>> @@ -1541,6 +1548,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
>> 
>>  	if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu))
>>  		apic->vid_enabled = true;
>> +
>> +	if (kvm_x86_ops->has_posted_interrupt(vcpu))
>> +		apic->pi_enabled = true;
>> +
> This is global state, no need per apic variable.
> 
>>  	return 0;
>>  nomem_free_apic:
>>  	kfree(apic);
>> @@ -1575,6 +1586,24 @@ int kvm_apic_get_highest_irr(struct kvm_vcpu
> *vcpu)
>>  }
>>  EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr);
>> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir)
>> +{
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +	unsigned int *reg;
>> +	unsigned int i;
>> +
>> +	if (!apic || !apic_enabled(apic))
> Use kvm_vcpu_has_lapic() instead of !apic.

ok.

>> +		return;
>> +
>> +	for (i = 0; i <= 7; i++) {
>> +		reg = apic->regs + APIC_IRR + i * 0x10;
>> +		*reg |= pir[i];
> Non atomic access to IRR. Other threads may set bit there concurrently.
Ok.

>> +		pir[i] = 0;
>> +	}
> Should set apic->irr_pending to true when setting irr bit.
Right. Will add it in next version.

>> +	return;
>> +}
>> +EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
>> +
>>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
>>  {
>>  	u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
>> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
>> index 2503a64..ad35868 100644
>> --- a/arch/x86/kvm/lapic.h
>> +++ b/arch/x86/kvm/lapic.h
>> @@ -21,6 +21,7 @@ struct kvm_lapic {
>>  	struct kvm_vcpu *vcpu; 	bool irr_pending; 	bool vid_enabled; +	bool
>>  pi_enabled; 	/* Number of bits set in ISR. */ 	s16 isr_count; 	/* The
>>  highest vector set in ISR; if -1 - invalid, must scan ISR. */ @@ -43,6
>>  +44,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); int
>>  kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_cpu_get_extint(struct
>>  kvm_vcpu *v); int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
>>  +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir);
>>  void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64
>>  kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void
>>  kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
>> @@ -94,6 +96,12 @@ static inline bool kvm_apic_vid_enabled(struct kvm_vcpu
> *vcpu)
>>  	return apic->vid_enabled;
>>  }
>> +static inline bool kvm_apic_pi_enabled(struct kvm_vcpu *vcpu)
>> +{
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +	return apic->pi_enabled;
>> +}
>> +
>>  int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
>>  void kvm_lapic_init(void);
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index f6ef090..6448b96 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -31,6 +31,7 @@
>>  #include <linux/ftrace_event.h> #include <linux/slab.h> #include
>>  <linux/tboot.h> +#include <linux/interrupt.h> #include
>>  "kvm_cache_regs.h" #include "x86.h"
>> @@ -89,6 +90,8 @@ module_param(enable_apicv_reg, bool, S_IRUGO);
>>  static bool __read_mostly enable_apicv_vid = 0;
>>  module_param(enable_apicv_vid, bool, S_IRUGO);
>> +static bool __read_mostly enable_apicv_pi = 0;
>> +module_param(enable_apicv_pi, bool, S_IRUGO);
>>  /*
>>   * If nested=1, nested virtualization is supported, i.e., guests may use
>>   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
>> @@ -372,6 +375,44 @@ struct nested_vmx {
>>  	struct page *apic_access_page;
>>  };
>> +/* Posted-Interrupt Descriptor */
>> +struct pi_desc {
>> +	u32 pir[8];     /* Posted interrupt requested */
>> +	union {
>> +		struct {
>> +			u8  on:1,
>> +			    rsvd:7;
>> +		} control;
>> +		u32 rsvd[8];
>> +	} u;
>> +} __aligned(64);
>> +
>> +#define POSTED_INTR_ON  0
>> +u8 pi_test_on(struct pi_desc *pi_desc)
>> +{
>> +	return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
>> +}
>> +void pi_set_on(struct pi_desc *pi_desc)
>> +{
>> +	set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
>> +}
>> +
>> +void pi_clear_on(struct pi_desc *pi_desc)
>> +{
>> +	clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
>> +}
>> +
>> +u8 pi_test_and_set_on(struct pi_desc *pi_desc)
>> +{
>> +	return test_and_set_bit(POSTED_INTR_ON,
>> +			(unsigned long *)&pi_desc->u.control);
>> +}
>> +
>> +void pi_set_pir(int vector, struct pi_desc *pi_desc)
>> +{
>> +	set_bit(vector, (unsigned long *)pi_desc->pir);
>> +}
>> +
>>  struct vcpu_vmx { 	struct kvm_vcpu       vcpu; 	unsigned long        
>>  host_rsp; @@ -439,6 +480,11 @@ struct vcpu_vmx { 	u64
>>  eoi_exit_bitmap[4]; 	u64 eoi_exit_bitmap_global[4];
>> +	/* Posted interrupt descriptor */
>> +	struct pi_desc *pi;
>> +	u32 irq;
>> +	u32 vector;
>> +
>>  	/* Support for a guest hypervisor (nested VMX) */
>>  	struct nested_vmx nested;
>>  };
>> @@ -698,6 +744,11 @@ static u64 host_efer;
>> 
>>  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
>> +int arch_pi_get_vector(unsigned int irq);
>> +int arch_pi_alloc_irq(struct vcpu_vmx *vmx);
>> +void arch_pi_free_irq(unsigned int irq, struct vcpu_vmx *vmx);
>> +int arch_pi_migrate(int irq, int cpu);
>> +
>>  /*
>>   * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
>>   * away by decrementing the array size.
>> @@ -783,6 +834,11 @@ static inline bool
> cpu_has_vmx_virtual_intr_delivery(void)
>>  		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>>  }
>> +static inline bool cpu_has_vmx_posted_intr(void)
>> +{
>> +	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
>> +}
>> +
>>  static inline bool cpu_has_vmx_flexpriority(void)
>>  {
>>  	return cpu_has_vmx_tpr_shadow() &&
>> @@ -1555,6 +1611,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu,
> int cpu)
>>  		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
>>  		unsigned long sysenter_esp;
>> +		if (enable_apicv_pi && to_vmx(vcpu)->pi)
>> +			pi_set_on(to_vmx(vcpu)->pi);
>> +
> Why?

Here means the vcpu start migration. So we should prevent the notification event until migration end.

>> +		kvm_make_request(KVM_REQ_POSTED_INTR, vcpu);
>> +
>>  		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 		local_irq_disable();
>>  		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, @@ -1582,6
>>  +1643,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 		vcpu->cpu
>>  = -1; 		kvm_cpu_vmxoff(); 	}
>> +	if (enable_apicv_pi && to_vmx(vcpu)->pi)
>> +		pi_set_on(to_vmx(vcpu)->pi);
> Why?

When vcpu schedule out, no need to send notification event to it, just set the PIR and wakeup it is enough.

>>  }
>>  
>>  static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
>> @@ -2451,12 +2514,6 @@ static __init int setup_vmcs_config(struct
> vmcs_config *vmcs_conf)
>>  	u32 _vmexit_control = 0;
>>  	u32 _vmentry_control = 0;
>> -	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
>> -	opt = PIN_BASED_VIRTUAL_NMIS;
>> -	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
>> -				&_pin_based_exec_control) < 0)
>> -		return -EIO;
>> -
>>  	min = CPU_BASED_HLT_EXITING |
>>  #ifdef CONFIG_X86_64
>>  	      CPU_BASED_CR8_LOAD_EXITING |
>> @@ -2531,6 +2588,17 @@ static __init int setup_vmcs_config(struct
> vmcs_config *vmcs_conf)
>>  				&_vmexit_control) < 0)
>>  		return -EIO;
>> +	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
>> +	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
>> +	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
>> +				&_pin_based_exec_control) < 0)
>> +		return -EIO;
>> +
>> +	if (!(_cpu_based_2nd_exec_control &
>> +		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
>> +		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
>> +		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
>> +
>>  	min = 0; 	opt = VM_ENTRY_LOAD_IA32_PAT; 	if (adjust_vmx_controls(min,
>>  opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2715,6 +2783,9 @@ static __init int
>>  hardware_setup(void) 	if (!cpu_has_vmx_virtual_intr_delivery())
>>  		enable_apicv_vid = 0;
>> +	if (!cpu_has_vmx_posted_intr() || !x2apic_enabled())
> In nested guest x2apic may be enabled without irq remapping. Check for
> irq remapping here.

There are no posted interrupt available in nested case. We don't need to check IR here.

> 
>> +		enable_apicv_pi = 0;
>> +
>>  	if (nested)
>>  		nested_vmx_setup_ctls_msrs();
>> @@ -3881,6 +3952,93 @@ static void ept_set_mmio_spte_mask(void)
>>  	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
>>  }
>> +irqreturn_t pi_handler(int irq, void *data)
>> +{
>> +	struct vcpu_vmx *vmx = data;
>> +
>> +	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
>> +	kvm_vcpu_kick(&vmx->vcpu);
>> +
>> +	return IRQ_HANDLED;
>> +}
>> +
>> +static int vmx_has_posted_interrupt(struct kvm_vcpu *vcpu)
>> +{
>> +	return irqchip_in_kernel(vcpu->kvm) && enable_apicv_pi;
>> +}
>> +
>> +static void vmx_pi_migrate(struct kvm_vcpu *vcpu)
>> +{
>> +	int ret = 0;
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +	if (!enable_apicv_pi)
>> +		return ;
>> +
>> +	preempt_disable();
>> +	local_irq_disable();
>> +	if (!vmx->irq) {
>> +		ret = arch_pi_alloc_irq(vmx);
>> +		if (ret < 0) {
>> +			vmx->irq = -1;
>> +			goto out;
>> +		}
>> +		vmx->irq = ret;
>> +
>> +		ret = request_irq(vmx->irq, pi_handler, IRQF_NO_THREAD,
>> +					"Posted Interrupt", vmx);
>> +		if (ret) {
>> +			vmx->irq = -1;
>> +			goto out;
>> +		}
>> +
>> +		ret = arch_pi_get_vector(vmx->irq);
>> +	} else
>> +		ret = arch_pi_migrate(vmx->irq, smp_processor_id());
>> +
>> +	if (ret < 0) {
>> +		vmx->irq = -1;
>> +		goto out;
>> +	} else {
>> +		vmx->vector = ret;
>> +		vmcs_write16(POSTED_INTR_NV, vmx->vector);
>> +		pi_clear_on(vmx->pi);
>> +	}
>> +out:
>> +	local_irq_enable();
>> +	preempt_enable();
>> +	return ;
>> +}
>> +
>> +static int vmx_send_nv(struct kvm_vcpu *vcpu,
>> +		int vector)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +	if (unlikely(vmx->irq == -1))
>> +		return 0;
>> +
>> +	if (vcpu->cpu == smp_processor_id()) {
>> +		pi_set_on(vmx->pi);
> Why? You clear this bit anyway in vmx_update_irq() during guest entry.
Here means the target vcpu already in vmx non-root mode. Then it will consume the interrupt on next vm entry and we don't need to send the notification event from other cpu, just update PIR is enough.

>> +		return 0; +	} + +	pi_set_pir(vector, vmx->pi); +	if
>> (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) {
>> +		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), vmx->vector); +		return
>> 1; +	} +	return 0; +} + +static void free_pi(struct vcpu_vmx *vmx) +{
>> +	if (enable_apicv_pi) { +		kfree(vmx->pi);
>> +		arch_pi_free_irq(vmx->irq, vmx); +	} +} +
>>  /*
>>   * Sets up the vmcs for emulated real mode.
>>   */
>> @@ -3890,6 +4048,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>>  	unsigned long a;
>>  #endif
>>  	int i;
>> +	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
>> 
>>  	/* I/O */ 	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); @@
>>  -3901,8 +4060,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>>  	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
>>  
>>  	/* Control */
>> -	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
>> -		vmcs_config.pin_based_exec_ctrl);
>> +	if (!enable_apicv_pi)
>> +		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
>> +
>> +	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_exec_ctrl);
>> 
>>  	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
> vmx_exec_control(vmx));
>> 
>> @@ -3920,6 +4081,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>>  		vmcs_write16(GUEST_INTR_STATUS, 0);
>>  	}
>> +	if (enable_apicv_pi) {
>> +		vmx->pi = kmalloc(sizeof(struct pi_desc),
>> +				GFP_KERNEL | __GFP_ZERO);
>> +		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi)));
>> +	}
>> +
>>  	if (ple_gap) { 		vmcs_write32(PLE_GAP, ple_gap);
>>  		vmcs_write32(PLE_WINDOW, ple_window); @@ -6161,6 +6328,11 @@ static
>>  void vmx_update_irq(struct kvm_vcpu *vcpu) 	if (!enable_apicv_vid)
>>  		return ;
>> +	if (enable_apicv_pi) {
>> +		kvm_apic_update_irr(vcpu, (unsigned int *)vmx->pi->pir);
>> +		pi_clear_on(vmx->pi);
> Why do you do that? Isn't VMX process posted interrupts on vmentry if "on" bit
> is set?
> 
>> +	}
>> +
>>  	vector = kvm_apic_get_highest_irr(vcpu);
>>  	if (vector == -1)
>>  		return;
>> @@ -6586,6 +6758,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>> 
>>  	free_vpid(vmx); 	free_nested(vmx); +	free_pi(vmx);
>>  	free_loaded_vmcs(vmx->loaded_vmcs); 	kfree(vmx->guest_msrs);
>>  	kvm_vcpu_uninit(vcpu); @@ -7483,8 +7656,11 @@ static struct
>>  kvm_x86_ops vmx_x86_ops = { 	.enable_irq_window = enable_irq_window,
>>  	.update_cr8_intercept = update_cr8_intercept,
>>  	.has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
>>  +	.has_posted_interrupt = vmx_has_posted_interrupt, 	.update_irq =
>>  vmx_update_irq, 	.set_eoi_exitmap = vmx_set_eoi_exitmap,
>> +	.send_nv = vmx_send_nv,
>> +	.pi_migrate = vmx_pi_migrate,
>> 
>>  	.set_tss_addr = vmx_set_tss_addr,
>>  	.get_tdp_level = get_ept_level,
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 8b8de3b..f035267 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -5250,6 +5250,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>>  	bool req_immediate_exit = 0;
>>  
>>  	if (vcpu->requests) {
>> +		if (kvm_check_request(KVM_REQ_POSTED_INTR, vcpu))
>> +			kvm_x86_ops->pi_migrate(vcpu);
>>  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
>>  			kvm_mmu_unload(vcpu);
>>  		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index ecc5543..f8d8d34 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -107,6 +107,7 @@ static inline bool is_error_page(struct page *page)
>>  #define KVM_REQ_IMMEDIATE_EXIT    15
>>  #define KVM_REQ_PMU               16
>>  #define KVM_REQ_PMI               17
>> +#define KVM_REQ_POSTED_INTR       18
>> 
>>  #define KVM_USERSPACE_IRQ_SOURCE_ID		0
>>  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index be70035..05baf1c 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -1625,6 +1625,8 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
>>  			smp_send_reschedule(cpu);
>>  	put_cpu();
>>  }
>> +EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
>> +
>>  #endif /* !CONFIG_S390 */
>>  
>>  void kvm_resched(struct kvm_vcpu *vcpu)
>> --
>> 1.7.1
> 
> --
> 			Gleb.
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Best regards,
Yang

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html