Re: [PATCH v2 6/6] x86, apicv: Add Posted Interrupt supporting

Gleb Natapov <gleb@xxxxxxxxxx> · Sun, 25 Nov 2012 14:39:17 +0200

On Wed, Nov 21, 2012 at 04:09:39PM +0800, Yang Zhang wrote:
> Posted Interrupt allows vAPICV interrupts to inject into guest directly
> without any vmexit.
> 
> - When delivering a interrupt to guest, if target vcpu is running,
>   update Posted-interrupt requests bitmap and send a notification event
>   to the vcpu. Then the vcpu will handle this interrupt automatically,
>   without any software involvemnt.
> 
Looks like you allocating one irq vector per vcpu per pcpu and then
migrate it or reallocate when vcpu move from one pcpu to another.
This is not scalable and migrating irq migration slows things down.
What's wrong with allocating one global vector for posted interrupt
during vmx initialization and use it for all vcpus?

> - If target vcpu is not running or there already a notification event
>   pending in the vcpu, do nothing. The interrupt will be handled by old
>   way.
> 
> Signed-off-by: Yang Zhang <yang.z.zhang@xxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |    3 +
>  arch/x86/include/asm/vmx.h      |    4 +
>  arch/x86/kernel/apic/io_apic.c  |  138 ++++++++++++++++++++++++++++
>  arch/x86/kvm/lapic.c            |   31 ++++++-
>  arch/x86/kvm/lapic.h            |    8 ++
>  arch/x86/kvm/vmx.c              |  192 +++++++++++++++++++++++++++++++++++++--
>  arch/x86/kvm/x86.c              |    2 +
>  include/linux/kvm_host.h        |    1 +
>  virt/kvm/kvm_main.c             |    2 +
>  9 files changed, 372 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 8e07a86..1145894 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -683,9 +683,12 @@ struct kvm_x86_ops {
>  	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
>  	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
>  	int (*has_virtual_interrupt_delivery)(struct kvm_vcpu *vcpu);
> +	int (*has_posted_interrupt)(struct kvm_vcpu *vcpu);
>  	void (*update_irq)(struct kvm_vcpu *vcpu);
>  	void (*set_eoi_exitmap)(struct kvm_vcpu *vcpu, int vector,
>  			int need_eoi, int global);
> +	int (*send_nv)(struct kvm_vcpu *vcpu, int vector);
> +	void (*pi_migrate)(struct kvm_vcpu *vcpu);
>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>  	int (*get_tdp_level)(void);
>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
> index 1003341..7b9e1d0 100644
> --- a/arch/x86/include/asm/vmx.h
> +++ b/arch/x86/include/asm/vmx.h
> @@ -152,6 +152,7 @@
>  #define PIN_BASED_EXT_INTR_MASK                 0x00000001
>  #define PIN_BASED_NMI_EXITING                   0x00000008
>  #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
> +#define PIN_BASED_POSTED_INTR                   0x00000080
>  
>  #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002
>  #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
> @@ -174,6 +175,7 @@
>  /* VMCS Encodings */
>  enum vmcs_field {
>  	VIRTUAL_PROCESSOR_ID            = 0x00000000,
> +	POSTED_INTR_NV                  = 0x00000002,
>  	GUEST_ES_SELECTOR               = 0x00000800,
>  	GUEST_CS_SELECTOR               = 0x00000802,
>  	GUEST_SS_SELECTOR               = 0x00000804,
> @@ -208,6 +210,8 @@ enum vmcs_field {
>  	VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
>  	APIC_ACCESS_ADDR		= 0x00002014,
>  	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
> +	POSTED_INTR_DESC_ADDR           = 0x00002016,
> +	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
>  	EPT_POINTER                     = 0x0000201a,
>  	EPT_POINTER_HIGH                = 0x0000201b,
>  	EOI_EXIT_BITMAP0                = 0x0000201c,
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index 1817fa9..97cb8ee 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -3277,6 +3277,144 @@ int arch_setup_dmar_msi(unsigned int irq)
>  }
>  #endif
>  
> +static int
> +pi_set_affinity(struct irq_data *data, const struct cpumask *mask,
> +		      bool force)
> +{
> +	unsigned int dest;
> +	struct irq_cfg *cfg = (struct irq_cfg *)data->chip_data;
> +	if (cpumask_equal(cfg->domain, mask))
> +		return IRQ_SET_MASK_OK;
> +
> +	if (__ioapic_set_affinity(data, mask, &dest))
> +		return -1;
> +
> +	return IRQ_SET_MASK_OK;
> +}
> +
> +static void pi_mask(struct irq_data *data)
> +{
> +	;
> +}
> +
> +static void pi_unmask(struct irq_data *data)
> +{
> +	;
> +}
> +
> +static struct irq_chip pi_chip = {
> +	.name       = "POSTED-INTR",
> +	.irq_ack    = ack_apic_edge,
> +	.irq_unmask = pi_unmask,
> +	.irq_mask   = pi_mask,
> +	.irq_set_affinity   = pi_set_affinity,
> +};
> +
> +int arch_pi_migrate(int irq, int cpu)
> +{
> +	struct irq_data *data = irq_get_irq_data(irq);
> +	struct irq_cfg *cfg;
> +	struct irq_desc *desc = irq_to_desc(irq);
> +	unsigned long flags;
> +
> +	if (!desc)
> +		return -EINVAL;
> +
> +	cfg = irq_cfg(irq);
> +	if (cpumask_equal(cfg->domain, cpumask_of(cpu)))
> +		return cfg->vector;
> +
> +	irq_set_affinity(irq, cpumask_of(cpu));
> +	raw_spin_lock_irqsave(&desc->lock, flags);
> +	irq_move_irq(data);
> +	raw_spin_unlock_irqrestore(&desc->lock, flags);
> +
> +	if (cfg->move_in_progress)
> +		send_cleanup_vector(cfg);
> +	return cfg->vector;
> +}
> +EXPORT_SYMBOL_GPL(arch_pi_migrate);
> +
> +static int arch_pi_create_irq(const struct cpumask *mask)
> +{
> +	int node = cpu_to_node(0);
> +	unsigned int irq_want;
> +	struct irq_cfg *cfg;
> +	unsigned long flags;
> +	unsigned int ret = 0;
> +	int irq;
> +
> +	irq_want = nr_irqs_gsi;
> +
> +	irq = alloc_irq_from(irq_want, node);
> +	if (irq < 0)
> +		return 0;
> +	cfg = alloc_irq_cfg(irq_want, node);
s/irq_want/irq.

> +	if (!cfg) {
> +		free_irq_at(irq, NULL);
> +		return 0;
> +	}
> +
> +	raw_spin_lock_irqsave(&vector_lock, flags);
> +	if (!__assign_irq_vector(irq, cfg, mask))
> +		ret = irq;
> +	raw_spin_unlock_irqrestore(&vector_lock, flags);
> +
> +	if (ret) {
> +		irq_set_chip_data(irq, cfg);
> +		irq_clear_status_flags(irq, IRQ_NOREQUEST);
> +	} else {
> +		free_irq_at(irq, cfg);
> +	}
> +	return ret;
> +}

This function is mostly cut&paste of create_irq_nr().

> +
> +int arch_pi_alloc_irq(void *vmx)
> +{
> +	int irq, cpu = smp_processor_id();
> +	struct irq_cfg *cfg;
> +
> +	irq = arch_pi_create_irq(cpumask_of(cpu));
> +	if (!irq) {
> +		pr_err("Posted Interrupt: no free irq\n");
> +		return -EINVAL;
> +	}
> +	irq_set_handler_data(irq, vmx);
> +	irq_set_chip_and_handler_name(irq, &pi_chip, handle_edge_irq, "edge");
> +	irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
> +	irq_set_affinity(irq, cpumask_of(cpu));
> +
> +	cfg = irq_cfg(irq);
> +	if (cfg->move_in_progress)
> +		send_cleanup_vector(cfg);
> +
> +	return irq;
> +}
> +EXPORT_SYMBOL_GPL(arch_pi_alloc_irq);
> +
> +void arch_pi_free_irq(unsigned int irq, void *vmx)
> +{
> +	if (irq) {
> +		irq_set_handler_data(irq, NULL);
> +		/* This will mask the irq */
> +		free_irq(irq, vmx);
> +		destroy_irq(irq);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(arch_pi_free_irq);
> +
> +int arch_pi_get_vector(unsigned int irq)
> +{
> +	struct irq_cfg *cfg;
> +
> +	if (!irq)
> +		return -EINVAL;
> +
> +	cfg = irq_cfg(irq);
> +	return cfg->vector;
> +}
> +EXPORT_SYMBOL_GPL(arch_pi_get_vector);
> +
>  #ifdef CONFIG_HPET_TIMER
>  
>  static int hpet_msi_set_affinity(struct irq_data *data,
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index af48361..04220de 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -656,7 +656,7 @@ void kvm_set_eoi_exitmap(struct kvm_vcpu *vcpu, int vector,
>  static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  			     int vector, int level, int trig_mode)
>  {
> -	int result = 0;
> +	int result = 0, send;
>  	struct kvm_vcpu *vcpu = apic->vcpu;
>  
>  	switch (delivery_mode) {
> @@ -674,6 +674,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
>  		} else {
>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
>  			kvm_set_eoi_exitmap(vcpu, vector, 0, 0);
> +			if (kvm_apic_pi_enabled(vcpu)) {
Provide send_nv() that returns 0 if pi is disabled.

> +				send = kvm_x86_ops->send_nv(vcpu, vector);
> +				if (send) {
No need "send" variable here.

> +					result = 1;
> +					break;
> +				}
> +			}
>  		}
>  
>  		result = !apic_test_and_set_irr(vector, apic);
> @@ -1541,6 +1548,10 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
>  
>  	if (kvm_x86_ops->has_virtual_interrupt_delivery(vcpu))
>  		apic->vid_enabled = true;
> +
> +	if (kvm_x86_ops->has_posted_interrupt(vcpu))
> +		apic->pi_enabled = true;
> +
This is global state, no need per apic variable.

>  	return 0;
>  nomem_free_apic:
>  	kfree(apic);
> @@ -1575,6 +1586,24 @@ int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu)
>  }
>  EXPORT_SYMBOL_GPL(kvm_apic_get_highest_irr);
>  
> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	unsigned int *reg;
> +	unsigned int i;
> +
> +	if (!apic || !apic_enabled(apic))
Use kvm_vcpu_has_lapic() instead of !apic.

> +		return;
> +
> +	for (i = 0; i <= 7; i++) {
> +		reg = apic->regs + APIC_IRR + i * 0x10;
> +		*reg |= pir[i];
Non atomic access to IRR. Other threads may set bit there concurrently.

> +		pir[i] = 0;
> +	}
Should set apic->irr_pending to true when setting irr bit.

> +	return;
> +}
> +EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
> +
>  int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
>  {
>  	u32 lvt0 = kvm_apic_get_reg(vcpu->arch.apic, APIC_LVT0);
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 2503a64..ad35868 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -21,6 +21,7 @@ struct kvm_lapic {
>  	struct kvm_vcpu *vcpu;
>  	bool irr_pending;
>  	bool vid_enabled;
> +	bool pi_enabled;
>  	/* Number of bits set in ISR. */
>  	s16 isr_count;
>  	/* The highest vector set in ISR; if -1 - invalid, must scan ISR. */
> @@ -43,6 +44,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
>  int kvm_cpu_has_extint(struct kvm_vcpu *v);
>  int kvm_cpu_get_extint(struct kvm_vcpu *v);
>  int kvm_apic_get_highest_irr(struct kvm_vcpu *vcpu);
> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir);
>  void kvm_lapic_reset(struct kvm_vcpu *vcpu);
>  u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
>  void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
> @@ -94,6 +96,12 @@ static inline bool kvm_apic_vid_enabled(struct kvm_vcpu *vcpu)
>  	return apic->vid_enabled;
>  }
>  
> +static inline bool kvm_apic_pi_enabled(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm_lapic *apic = vcpu->arch.apic;
> +	return apic->pi_enabled;
> +}
> +
>  int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data);
>  void kvm_lapic_init(void);
>  
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index f6ef090..6448b96 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -31,6 +31,7 @@
>  #include <linux/ftrace_event.h>
>  #include <linux/slab.h>
>  #include <linux/tboot.h>
> +#include <linux/interrupt.h>
>  #include "kvm_cache_regs.h"
>  #include "x86.h"
>  
> @@ -89,6 +90,8 @@ module_param(enable_apicv_reg, bool, S_IRUGO);
>  static bool __read_mostly enable_apicv_vid = 0;
>  module_param(enable_apicv_vid, bool, S_IRUGO);
>  
> +static bool __read_mostly enable_apicv_pi = 0;
> +module_param(enable_apicv_pi, bool, S_IRUGO);
>  /*
>   * If nested=1, nested virtualization is supported, i.e., guests may use
>   * VMX and be a hypervisor for its own guests. If nested=0, guests may not
> @@ -372,6 +375,44 @@ struct nested_vmx {
>  	struct page *apic_access_page;
>  };
>  
> +/* Posted-Interrupt Descriptor */
> +struct pi_desc {
> +	u32 pir[8];     /* Posted interrupt requested */
> +	union {
> +		struct {
> +			u8  on:1,
> +			    rsvd:7;
> +		} control;
> +		u32 rsvd[8];
> +	} u;
> +} __aligned(64);
> +
> +#define POSTED_INTR_ON  0
> +u8 pi_test_on(struct pi_desc *pi_desc)
> +{
> +	return test_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
> +}
> +void pi_set_on(struct pi_desc *pi_desc)
> +{
> +	set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
> +}
> +
> +void pi_clear_on(struct pi_desc *pi_desc)
> +{
> +	clear_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->u.control);
> +}
> +
> +u8 pi_test_and_set_on(struct pi_desc *pi_desc)
> +{
> +	return test_and_set_bit(POSTED_INTR_ON,
> +			(unsigned long *)&pi_desc->u.control);
> +}
> +
> +void pi_set_pir(int vector, struct pi_desc *pi_desc)
> +{
> +	set_bit(vector, (unsigned long *)pi_desc->pir);
> +}
> +
>  struct vcpu_vmx {
>  	struct kvm_vcpu       vcpu;
>  	unsigned long         host_rsp;
> @@ -439,6 +480,11 @@ struct vcpu_vmx {
>  	u64 eoi_exit_bitmap[4];
>  	u64 eoi_exit_bitmap_global[4];
>  
> +	/* Posted interrupt descriptor */
> +	struct pi_desc *pi;
> +	u32 irq;
> +	u32 vector;
> +
>  	/* Support for a guest hypervisor (nested VMX) */
>  	struct nested_vmx nested;
>  };
> @@ -698,6 +744,11 @@ static u64 host_efer;
>  
>  static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
>  
> +int arch_pi_get_vector(unsigned int irq);
> +int arch_pi_alloc_irq(struct vcpu_vmx *vmx);
> +void arch_pi_free_irq(unsigned int irq, struct vcpu_vmx *vmx);
> +int arch_pi_migrate(int irq, int cpu);
> +
>  /*
>   * Keep MSR_STAR at the end, as setup_msrs() will try to optimize it
>   * away by decrementing the array size.
> @@ -783,6 +834,11 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
>  		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>  }
>  
> +static inline bool cpu_has_vmx_posted_intr(void)
> +{
> +	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
> +}
> +
>  static inline bool cpu_has_vmx_flexpriority(void)
>  {
>  	return cpu_has_vmx_tpr_shadow() &&
> @@ -1555,6 +1611,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
>  		struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
>  		unsigned long sysenter_esp;
>  
> +		if (enable_apicv_pi && to_vmx(vcpu)->pi)
> +			pi_set_on(to_vmx(vcpu)->pi);
> +
Why?

> +		kvm_make_request(KVM_REQ_POSTED_INTR, vcpu);
> +
>  		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
>  		local_irq_disable();
>  		list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
> @@ -1582,6 +1643,8 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
>  		vcpu->cpu = -1;
>  		kvm_cpu_vmxoff();
>  	}
> +	if (enable_apicv_pi && to_vmx(vcpu)->pi)
> +		pi_set_on(to_vmx(vcpu)->pi);
Why?

>  }
>  
>  static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
> @@ -2451,12 +2514,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>  	u32 _vmexit_control = 0;
>  	u32 _vmentry_control = 0;
>  
> -	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
> -	opt = PIN_BASED_VIRTUAL_NMIS;
> -	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
> -				&_pin_based_exec_control) < 0)
> -		return -EIO;
> -
>  	min = CPU_BASED_HLT_EXITING |
>  #ifdef CONFIG_X86_64
>  	      CPU_BASED_CR8_LOAD_EXITING |
> @@ -2531,6 +2588,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
>  				&_vmexit_control) < 0)
>  		return -EIO;
>  
> +	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
> +	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
> +	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
> +				&_pin_based_exec_control) < 0)
> +		return -EIO;
> +
> +	if (!(_cpu_based_2nd_exec_control &
> +		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
> +		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
> +		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
> +
>  	min = 0;
>  	opt = VM_ENTRY_LOAD_IA32_PAT;
>  	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
> @@ -2715,6 +2783,9 @@ static __init int hardware_setup(void)
>  	if (!cpu_has_vmx_virtual_intr_delivery())
>  		enable_apicv_vid = 0;
>  
> +	if (!cpu_has_vmx_posted_intr() || !x2apic_enabled())
In nested guest x2apic may be enabled without irq remapping. Check for
irq remapping here.

> +		enable_apicv_pi = 0;
> +
>  	if (nested)
>  		nested_vmx_setup_ctls_msrs();
>  
> @@ -3881,6 +3952,93 @@ static void ept_set_mmio_spte_mask(void)
>  	kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
>  }
>  
> +irqreturn_t pi_handler(int irq, void *data)
> +{
> +	struct vcpu_vmx *vmx = data;
> +
> +	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
> +	kvm_vcpu_kick(&vmx->vcpu);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static int vmx_has_posted_interrupt(struct kvm_vcpu *vcpu)
> +{
> +	return irqchip_in_kernel(vcpu->kvm) && enable_apicv_pi;
> +}
> +
> +static void vmx_pi_migrate(struct kvm_vcpu *vcpu)
> +{
> +	int ret = 0;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!enable_apicv_pi)
> +		return ;
> +
> +	preempt_disable();
> +	local_irq_disable();
> +	if (!vmx->irq) {
> +		ret = arch_pi_alloc_irq(vmx);
> +		if (ret < 0) {
> +			vmx->irq = -1;
> +			goto out;
> +		}
> +		vmx->irq = ret;
> +
> +		ret = request_irq(vmx->irq, pi_handler, IRQF_NO_THREAD,
> +					"Posted Interrupt", vmx);
> +		if (ret) {
> +			vmx->irq = -1;
> +			goto out;
> +		}
> +
> +		ret = arch_pi_get_vector(vmx->irq);
> +	} else
> +		ret = arch_pi_migrate(vmx->irq, smp_processor_id());
> +
> +	if (ret < 0) {
> +		vmx->irq = -1;
> +		goto out;
> +	} else {
> +		vmx->vector = ret;
> +		vmcs_write16(POSTED_INTR_NV, vmx->vector);
> +		pi_clear_on(vmx->pi);
> +	}
> +out:
> +	local_irq_enable();
> +	preempt_enable();
> +	return ;
> +}
> +
> +static int vmx_send_nv(struct kvm_vcpu *vcpu,
> +		int vector)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (unlikely(vmx->irq == -1))
> +		return 0;
> +
> +	if (vcpu->cpu == smp_processor_id()) {
> +		pi_set_on(vmx->pi);
Why? You clear this bit anyway in vmx_update_irq() during guest entry.

> +		return 0;
> +	}
> +
> +	pi_set_pir(vector, vmx->pi);
> +	if (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) {
> +		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), vmx->vector);
> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +static void free_pi(struct vcpu_vmx *vmx)
> +{
> +	if (enable_apicv_pi) {
> +		kfree(vmx->pi);
> +		arch_pi_free_irq(vmx->irq, vmx);
> +	}
> +}
> +
>  /*
>   * Sets up the vmcs for emulated real mode.
>   */
> @@ -3890,6 +4048,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>  	unsigned long a;
>  #endif
>  	int i;
> +	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
>  
>  	/* I/O */
>  	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
> @@ -3901,8 +4060,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>  	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
>  
>  	/* Control */
> -	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
> -		vmcs_config.pin_based_exec_ctrl);
> +	if (!enable_apicv_pi)
> +		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
> +
> +	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_exec_ctrl);
>  
>  	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
>  
> @@ -3920,6 +4081,12 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>  		vmcs_write16(GUEST_INTR_STATUS, 0);
>  	}
>  
> +	if (enable_apicv_pi) {
> +		vmx->pi = kmalloc(sizeof(struct pi_desc),
> +				GFP_KERNEL | __GFP_ZERO);
> +		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi)));
> +	}
> +
>  	if (ple_gap) {
>  		vmcs_write32(PLE_GAP, ple_gap);
>  		vmcs_write32(PLE_WINDOW, ple_window);
> @@ -6161,6 +6328,11 @@ static void vmx_update_irq(struct kvm_vcpu *vcpu)
>  	if (!enable_apicv_vid)
>  		return ;
>  
> +	if (enable_apicv_pi) {
> +		kvm_apic_update_irr(vcpu, (unsigned int *)vmx->pi->pir);
> +		pi_clear_on(vmx->pi);
Why do you do that? Isn't VMX process posted interrupts on vmentry if "on" bit
is set?

> +	}
> +
>  	vector = kvm_apic_get_highest_irr(vcpu);
>  	if (vector == -1)
>  		return;
> @@ -6586,6 +6758,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>  
>  	free_vpid(vmx);
>  	free_nested(vmx);
> +	free_pi(vmx);
>  	free_loaded_vmcs(vmx->loaded_vmcs);
>  	kfree(vmx->guest_msrs);
>  	kvm_vcpu_uninit(vcpu);
> @@ -7483,8 +7656,11 @@ static struct kvm_x86_ops vmx_x86_ops = {
>  	.enable_irq_window = enable_irq_window,
>  	.update_cr8_intercept = update_cr8_intercept,
>  	.has_virtual_interrupt_delivery = vmx_has_virtual_interrupt_delivery,
> +	.has_posted_interrupt = vmx_has_posted_interrupt,
>  	.update_irq = vmx_update_irq,
>  	.set_eoi_exitmap = vmx_set_eoi_exitmap,
> +	.send_nv = vmx_send_nv,
> +	.pi_migrate = vmx_pi_migrate,
>  
>  	.set_tss_addr = vmx_set_tss_addr,
>  	.get_tdp_level = get_ept_level,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 8b8de3b..f035267 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -5250,6 +5250,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>  	bool req_immediate_exit = 0;
>  
>  	if (vcpu->requests) {
> +		if (kvm_check_request(KVM_REQ_POSTED_INTR, vcpu))
> +			kvm_x86_ops->pi_migrate(vcpu);
>  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
>  			kvm_mmu_unload(vcpu);
>  		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index ecc5543..f8d8d34 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -107,6 +107,7 @@ static inline bool is_error_page(struct page *page)
>  #define KVM_REQ_IMMEDIATE_EXIT    15
>  #define KVM_REQ_PMU               16
>  #define KVM_REQ_PMI               17
> +#define KVM_REQ_POSTED_INTR       18
>  
>  #define KVM_USERSPACE_IRQ_SOURCE_ID		0
>  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index be70035..05baf1c 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -1625,6 +1625,8 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
>  			smp_send_reschedule(cpu);
>  	put_cpu();
>  }
> +EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
> +
>  #endif /* !CONFIG_S390 */
>  
>  void kvm_resched(struct kvm_vcpu *vcpu)
> -- 
> 1.7.1

--
			Gleb.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html