RE: [PATCH v2 2/2] KVM: VMX: Add Posted Interrupt supporting

"Zhang, Yang Z" <yang.z.zhang@xxxxxxxxx> · Tue, 5 Feb 2013 03:13:52 +0000



Gleb Natapov wrote on 2013-02-04:
> On Mon, Feb 04, 2013 at 05:05:14PM +0800, Yang Zhang wrote:
>> From: Yang Zhang <yang.z.zhang@xxxxxxxxx>
>> 
>> Posted Interrupt allows APIC interrupts to inject into guest directly
>> without any vmexit.
>> 
>> - When delivering a interrupt to guest, if target vcpu is running,
>>   update Posted-interrupt requests bitmap and send a notification event
>>   to the vcpu. Then the vcpu will handle this interrupt automatically,
>>   without any software involvemnt.
>> - If target vcpu is not running or there already a notification event
>>   pending in the vcpu, do nothing. The interrupt will be handled by
>>   next vm entry
>> Signed-off-by: Yang Zhang <yang.z.zhang@xxxxxxxxx>
>> ---
>>  arch/x86/include/asm/entry_arch.h  |    1 +
>>  arch/x86/include/asm/hw_irq.h      |    1 +
>>  arch/x86/include/asm/irq_vectors.h |    4 +
>>  arch/x86/include/asm/kvm_host.h    |    3 + arch/x86/include/asm/vmx.h
>>          |    4 + arch/x86/kernel/entry_64.S         |    5 +
>>  arch/x86/kernel/irq.c              |   19 ++++
>>  arch/x86/kernel/irqinit.c          |    4 + arch/x86/kvm/lapic.c      
>>          |   15 +++- arch/x86/kvm/lapic.h               |    1 +
>>  arch/x86/kvm/svm.c                 |    6 ++ arch/x86/kvm/vmx.c       
>>           |  164 +++++++++++++++++++++++++++++++-----
>>  arch/x86/kvm/x86.c                 |    4 + include/linux/kvm_host.h  
>>          |    1 + 14 files changed, 208 insertions(+), 24 deletions(-)
>> diff --git a/arch/x86/include/asm/entry_arch.h
>> b/arch/x86/include/asm/entry_arch.h index 40afa00..7b0a29e 100644 ---
>> a/arch/x86/include/asm/entry_arch.h +++
>> b/arch/x86/include/asm/entry_arch.h @@ -18,6 +18,7 @@
>> BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
>>  #endif
>>  
>>  BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
>> +BUILD_INTERRUPT(posted_intr_ipi, POSTED_INTR_VECTOR)
> Missing CONFIG_HAVE_KVM ifdef. Have you verified that this patch
> compiles with KVM support disabled? Also give it a name that will
> associate it with KVM.
Yes, but it seems CONFIG_HAVE_KVM is selected by x86 by default, so it is always enabled when building the kernel.
I will remove the select in Kconfig and try again.

>> 
>>  /*
>>   * every pentium local APIC has two 'local interrupts', with a
>> diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
>> index eb92a6e..ee61af3 100644
>> --- a/arch/x86/include/asm/hw_irq.h
>> +++ b/arch/x86/include/asm/hw_irq.h
>> @@ -28,6 +28,7 @@
>>  /* Interrupt handlers registered during init_IRQ */ extern void
>>  apic_timer_interrupt(void); extern void x86_platform_ipi(void);
>>  +extern void posted_intr_ipi(void); extern void error_interrupt(void);
>>  extern void irq_work_interrupt(void);
>> diff --git a/arch/x86/include/asm/irq_vectors.h
>> b/arch/x86/include/asm/irq_vectors.h index 1508e51..6421a63 100644 ---
>> a/arch/x86/include/asm/irq_vectors.h +++
>> b/arch/x86/include/asm/irq_vectors.h @@ -102,6 +102,10 @@
>>   */
>>  #define X86_PLATFORM_IPI_VECTOR		0xf7
>> +#ifdef CONFIG_HAVE_KVM
>> +#define POSTED_INTR_VECTOR		0xf2
>> +#endif
>> +
>>  /*
>>   * IRQ work vector:
>>   */
>> diff --git a/arch/x86/include/asm/kvm_host.h
>> b/arch/x86/include/asm/kvm_host.h index b8388e9..bab1c0a 100644 ---
>> a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -704,6 +704,9 @@ struct kvm_x86_ops {
>>  	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
>>  	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
>>  	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
>> +	bool (*send_notification_event)(struct kvm_vcpu *vcpu,
>> +					int vector, int *result);
>> +	bool (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
>>  	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
>>  	int (*get_tdp_level)(void);
>>  	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
>> diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
>> index 694586c..f5ec72c 100644
>> --- a/arch/x86/include/asm/vmx.h
>> +++ b/arch/x86/include/asm/vmx.h
>> @@ -153,6 +153,7 @@
>>  #define PIN_BASED_EXT_INTR_MASK                 0x00000001
>>  #define PIN_BASED_NMI_EXITING                   0x00000008
>>  #define PIN_BASED_VIRTUAL_NMIS                  0x00000020
>> +#define PIN_BASED_POSTED_INTR                   0x00000080
>> 
>>  #define VM_EXIT_SAVE_DEBUG_CONTROLS             0x00000002 #define
>>  VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200 @@ -175,6 +176,7 @@
>>  /* VMCS Encodings */ enum vmcs_field { 	VIRTUAL_PROCESSOR_ID          
>>   = 0x00000000, +	POSTED_INTR_NV                  = 0x00000002,
>>  	GUEST_ES_SELECTOR               = 0x00000800, 	GUEST_CS_SELECTOR     
>>           = 0x00000802, 	GUEST_SS_SELECTOR               = 0x00000804,
>>  @@ -209,6 +211,8 @@ enum vmcs_field { 	VIRTUAL_APIC_PAGE_ADDR_HIGH    
>>  = 0x00002013, 	APIC_ACCESS_ADDR		= 0x00002014,
>>  	APIC_ACCESS_ADDR_HIGH		= 0x00002015,
>> +	POSTED_INTR_DESC_ADDR           = 0x00002016,
>> +	POSTED_INTR_DESC_ADDR_HIGH      = 0x00002017,
>>  	EPT_POINTER                     = 0x0000201a,
>>  	EPT_POINTER_HIGH                = 0x0000201b,
>>  	EOI_EXIT_BITMAP0                = 0x0000201c,
>> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
>> index 70641af..c6c47a3 100644
>> --- a/arch/x86/kernel/entry_64.S
>> +++ b/arch/x86/kernel/entry_64.S
>> @@ -1177,6 +1177,11 @@ apicinterrupt LOCAL_TIMER_VECTOR \
>>  apicinterrupt X86_PLATFORM_IPI_VECTOR \
>>  	x86_platform_ipi smp_x86_platform_ipi
>> +#ifdef CONFIG_HAVE_KVM
>> +apicinterrupt POSTED_INTR_VECTOR \
>> +	posted_intr_ipi smp_posted_intr_ipi
>> +#endif
>> +
>>  apicinterrupt THRESHOLD_APIC_VECTOR \
>>  	threshold_interrupt smp_threshold_interrupt
>>  apicinterrupt THERMAL_APIC_VECTOR \
>> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
>> index e4595f1..3551cf2 100644
>> --- a/arch/x86/kernel/irq.c
>> +++ b/arch/x86/kernel/irq.c
>> @@ -228,6 +228,25 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
>>  	set_irq_regs(old_regs);
>>  }
>> +/* + * Handler for POSTED_INTERRUPT_VECTOR. + */ #ifdef
>> CONFIG_HAVE_KVM +void smp_posted_intr_ipi(struct pt_regs *regs) +{
>> +	struct pt_regs *old_regs = set_irq_regs(regs); + +	ack_APIC_irq(); +
>> +	irq_enter(); + +	exit_idle(); + +	irq_exit(); +
>> +	set_irq_regs(old_regs); +} + +
> One blank line is enough.
> 
>>  EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
>>  
>>  #ifdef CONFIG_HOTPLUG_CPU
>> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
>> index 6e03b0d..f90c5ae 100644
>> --- a/arch/x86/kernel/irqinit.c
>> +++ b/arch/x86/kernel/irqinit.c
>> @@ -205,6 +205,10 @@ static void __init apic_intr_init(void)
>> 
>>  	/* IPI for X86 platform specific use */
>>  	alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
>> +#ifdef CONFIG_HAVE_KVM
>> +	/* IPI for posted interrupt use */
>> +	alloc_intr_gate(POSTED_INTR_VECTOR, posted_intr_ipi);
>> +#endif
>> 
>>  	/* IPI vectors for APIC spurious and error interrupts */
>>  	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
>> index 02b51dd..df6b6a3 100644
>> --- a/arch/x86/kvm/lapic.c
>> +++ b/arch/x86/kvm/lapic.c
>> @@ -379,6 +379,7 @@ static inline int apic_find_highest_irr(struct kvm_lapic
> *apic)
>>  	if (!apic->irr_pending)
>>  		return -1;
>> +	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
>>  	result = apic_search_irr(apic);
>>  	ASSERT(result == -1 || result >= 16);
>> @@ -685,6 +686,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int
> delivery_mode,
>>  {
>>  	int result = 0;
>>  	struct kvm_vcpu *vcpu = apic->vcpu;
>> +	bool send = false;
>> 
>>  	switch (delivery_mode) {
>>  	case APIC_DM_LOWEST:
>> @@ -700,7 +702,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int
> delivery_mode,
>>  		} else
>>  			apic_clear_vector(vector, apic->regs + APIC_TMR);
>> -		result = !apic_test_and_set_irr(vector, apic);
>> +		if (kvm_x86_ops->vm_has_apicv(vcpu->kvm))
> Just call send_notification_event() and do the check inside. And call it
> deliver_posted_interrupt() or something. It does more than just sends
> notification event. Actually it may not send it at all.
The code logic is different with and without apicv. So even if we put the check inside the callee, we would still need to check it in the caller. I think the current solution is clearer.

>> +			send = kvm_x86_ops->send_notification_event(vcpu,
>> +						vector, &result);
>> +		else
>> +			result = !apic_test_and_set_irr(vector, apic);
>> +
>>  		trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
>>  					  trig_mode, vector, !result);
>>  		if (!result) {
>> @@ -710,8 +717,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int
> delivery_mode,
>>  			break;
>>  		}
>> -		kvm_make_request(KVM_REQ_EVENT, vcpu);
>> -		kvm_vcpu_kick(vcpu);
>> +		if (!send) {
>> +			kvm_make_request(KVM_REQ_EVENT, vcpu);
>> +			kvm_vcpu_kick(vcpu);
>> +		}
>>  		break;
>>  
>>  	case APIC_DM_REMRD:
>> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
>> index 1676d34..632111f 100644
>> --- a/arch/x86/kvm/lapic.h
>> +++ b/arch/x86/kvm/lapic.h
>> @@ -46,6 +46,7 @@ void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu);
>>  void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
>>  u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
>>  void kvm_apic_set_version(struct kvm_vcpu *vcpu);
>> +void kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned int *pir);
>> 
>>  int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
>>  int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> index a7d60d7..37f961d 100644
>> --- a/arch/x86/kvm/svm.c
>> +++ b/arch/x86/kvm/svm.c
>> @@ -3591,6 +3591,11 @@ static void svm_hwapic_isr_update(struct kvm *kvm,
> int isr)
>>  	return;
>>  }
>> +static bool svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
>> +{
>> +	return false;
>> +}
>> +
>>  static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { 	struct vcpu_svm
>>  *svm = to_svm(vcpu); @@ -4319,6 +4324,7 @@ static struct kvm_x86_ops
>>  svm_x86_ops = { 	.vm_has_apicv = svm_vm_has_apicv, 	.load_eoi_exitmap
>>  = svm_load_eoi_exitmap, 	.hwapic_isr_update = svm_hwapic_isr_update,
>> +	.sync_pir_to_irr = svm_sync_pir_to_irr,
>> 
>>  	.set_tss_addr = svm_set_tss_addr,
>>  	.get_tdp_level = get_npt_level,
>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
>> index e826d29..d2b02f2 100644
>> --- a/arch/x86/kvm/vmx.c
>> +++ b/arch/x86/kvm/vmx.c
>> @@ -84,8 +84,8 @@ module_param(vmm_exclusive, bool, S_IRUGO);
>>  static bool __read_mostly fasteoi = 1;
>>  module_param(fasteoi, bool, S_IRUGO);
>> -static bool __read_mostly enable_apicv_reg_vid = 1;
>> -module_param(enable_apicv_reg_vid, bool, S_IRUGO);
>> +static bool __read_mostly enable_apicv = 1;
>> +module_param(enable_apicv, bool, S_IRUGO);
>> 
>>  /*
>>   * If nested=1, nested virtualization is supported, i.e., guests may use
>> @@ -370,6 +370,41 @@ struct nested_vmx {
>>  	struct page *apic_access_page;
>>  };
>> +#define POSTED_INTR_ON  0
>> +/* Posted-Interrupt Descriptor */
>> +struct pi_desc {
>> +	u32 pir[8];     /* Posted interrupt requested */
>> +	union {
>> +		struct {
>> +			u8  on:1,
>> +			    rsvd:7;
>> +		} control;
>> +		u32 rsvd[8];
>> +	} u;
>> +} __aligned(64);
>> +
>> +static bool pi_test_on(struct pi_desc *pi_desc)
>> +{
>> +	return test_bit(POSTED_INTR_ON,	(unsigned long *)&pi_desc->u.control);
>> +}
>> +
>> +static bool pi_test_and_set_on(struct pi_desc *pi_desc)
>> +{
>> +	return test_and_set_bit(POSTED_INTR_ON,
>> +			(unsigned long *)&pi_desc->u.control);
>> +}
>> +
>> +static bool pi_test_and_clear_on(struct pi_desc *pi_desc)
>> +{
>> +	return test_and_clear_bit(POSTED_INTR_ON,
>> +			(unsigned long *)&pi_desc->u.control);
>> +}
>> +
>> +static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
>> +{
>> +	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
>> +}
>> +
>>  struct vcpu_vmx {
>>  	struct kvm_vcpu       vcpu;
>>  	unsigned long         host_rsp;
>> @@ -434,6 +469,9 @@ struct vcpu_vmx {
>> 
>>  	bool rdtscp_enabled;
>> +	/* Posted interrupt descriptor */
>> +	struct pi_desc *pi;
>> +
> You haven't answered on my previous review why are you trying save 46
> bytes here.
Sorry, I don't get your point. It's just a pointer and only takes 8 bytes.

>>  	/* Support for a guest hypervisor (nested VMX) */
>>  	struct nested_vmx nested;
>>  };
>> @@ -788,6 +826,18 @@ static inline bool
> cpu_has_vmx_virtual_intr_delivery(void)
>>  		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
>>  }
>> +static inline bool cpu_has_vmx_posted_intr(void)
>> +{
>> +	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
>> +}
>> +
>> +static inline bool cpu_has_vmx_apicv(void)
>> +{
>> +	return cpu_has_vmx_apic_register_virt() &&
>> +		cpu_has_vmx_virtual_intr_delivery() &&
>> +		cpu_has_vmx_posted_intr();
>> +}
>> +
>>  static inline bool cpu_has_vmx_flexpriority(void)
>>  {
>>  	return cpu_has_vmx_tpr_shadow() &&
>> @@ -2535,12 +2585,6 @@ static __init int setup_vmcs_config(struct
> vmcs_config *vmcs_conf)
>>  	u32 _vmexit_control = 0;
>>  	u32 _vmentry_control = 0;
>> -	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
>> -	opt = PIN_BASED_VIRTUAL_NMIS;
>> -	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
>> -				&_pin_based_exec_control) < 0)
>> -		return -EIO;
>> -
>>  	min = CPU_BASED_HLT_EXITING |
>>  #ifdef CONFIG_X86_64
>>  	      CPU_BASED_CR8_LOAD_EXITING |
>> @@ -2617,6 +2661,17 @@ static __init int setup_vmcs_config(struct
> vmcs_config *vmcs_conf)
>>  				&_vmexit_control) < 0)
>>  		return -EIO;
>> +	min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
>> +	opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
>> +	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
>> +				&_pin_based_exec_control) < 0)
>> +		return -EIO;
>> +
>> +	if (!(_cpu_based_2nd_exec_control &
>> +		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
>> +		!(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
>> +		_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
>> +
>>  	min = 0; 	opt = VM_ENTRY_LOAD_IA32_PAT; 	if (adjust_vmx_controls(min,
>>  opt, MSR_IA32_VMX_ENTRY_CTLS, @@ -2795,11 +2850,10 @@ static __init
>>  int hardware_setup(void) 	if (!cpu_has_vmx_ple()) 		ple_gap = 0;
>> -	if (!cpu_has_vmx_apic_register_virt() ||
>> -				!cpu_has_vmx_virtual_intr_delivery())
>> -		enable_apicv_reg_vid = 0;
>> +	if (!cpu_has_vmx_apicv())
>> +		enable_apicv = 0;
>> 
>> -	if (enable_apicv_reg_vid)
>> +	if (enable_apicv)
>>  		kvm_x86_ops->update_cr8_intercept = NULL;
>>  	else
>>  		kvm_x86_ops->hwapic_irr_update = NULL;
>> @@ -3868,6 +3922,61 @@ static void
> vmx_disable_intercept_msr_write_x2apic(u32 msr)
>>  			msr, MSR_TYPE_W);
>>  }
>> +static int vmx_vm_has_apicv(struct kvm *kvm)
>> +{
>> +	return enable_apicv && irqchip_in_kernel(kvm);
>> +}
>> +
>> +static bool vmx_send_notification_event(struct kvm_vcpu *vcpu,
>> +		int vector, int *result)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +
>> +	*result = !pi_test_and_set_pir(vector, vmx->pi);
> The problem here is that interrupt may still be pending in IRR so
> eventually it will be coalesced, but we report it as delivered here. I
> do not see solution for this yet.
Yes, that's true, and it may result in the interrupt being lost. But even on real hardware, an interrupt can be lost in some cases: for example, when the CPU doesn't enable interrupts in time, or when there is a high-priority interrupt pending in IRR.
And since there is already an interrupt pending in IRR, the interrupt will still be handled.

>> +	if (!pi_test_and_set_on(vmx->pi) && (vcpu->mode == IN_GUEST_MODE)) {
>> +		kvm_make_request(KVM_REQ_PENDING_PIR, vcpu);
> Why not set KVM_REQ_EVENT here? What this intermediate event is needed
> for?
See the answer below.

>> +		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
>> +				    POSTED_INTR_VECTOR);
>> +		if (!pi_test_on(vmx->pi))
> Isn't it too optimistic of you to expect IPI to be delivered and
> processed by remote CPU by this point?
I have collected some data on my box, and it shows that about 5 percent of posted interrupts have already been handled by the time this check is made. How about adding an unlikely() here?
It also means that in about 5% of cases checking the request is unnecessary. And checking KVM_REQ_EVENT is more costly, so I use a lighter-weight request for this.

>> +			clear_bit(KVM_REQ_PENDING_PIR, &vcpu->requests)	;
>> +		return true;
>> +	}
>> +	return false;
>> +}
>> +
>> +static bool vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
>> +{
>> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
>> +	struct kvm_lapic *apic = vcpu->arch.apic;
>> +	unsigned int i, old, new, ret_val, irr_offset, pir_val;
>> +	bool make_request = false;
>> +
>> +	if (!vmx_vm_has_apicv(vcpu->kvm) || !pi_test_and_clear_on(vmx->pi))
>> +		return false;
>> +
>> +	for (i = 0; i <= 7; i++) {
>> +		pir_val = xchg(&vmx->pi->pir[i], 0);
>> +		if (pir_val) {
>> +			irr_offset = APIC_IRR + i * 0x10;
>> +			do {
>> +				old = kvm_apic_get_reg(apic, irr_offset);
>> +				new = old | pir_val;
>> +				ret_val = cmpxchg((u32 *)(apic->regs +
>> +						irr_offset), old, new);
>> +			} while (unlikely(ret_val != old));
>> +			make_request = true;
>> +		}
>> +	}
>> +
>> +	return make_request;
>> +}
>> +
>> +static void free_pi(struct vcpu_vmx *vmx)
>> +{
>> +	if (vmx_vm_has_apicv(vmx->vcpu.kvm))
>> +		kfree(vmx->pi);
>> +}
>> +
>>  /*
>>   * Set up the vmcs's constant host-state fields, i.e., host-state fields that
>>   * will not change in the lifetime of the guest.
>> @@ -3928,6 +4037,15 @@ static void set_cr4_guest_host_mask(struct
> vcpu_vmx *vmx)
>>  	vmcs_writel(CR4_GUEST_HOST_MASK,
>>  ~vmx->vcpu.arch.cr4_guest_owned_bits); }
>> +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
>> +{
>> +	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
>> +
>> +	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
>> +		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
>> +	return pin_based_exec_ctrl;
>> +}
>> +
>>  static u32 vmx_exec_control(struct vcpu_vmx *vmx) { 	u32 exec_control
>>  = vmcs_config.cpu_based_exec_ctrl; @@ -3945,11 +4063,6 @@ static u32
>>  vmx_exec_control(struct vcpu_vmx *vmx) 	return exec_control; }
>> -static int vmx_vm_has_apicv(struct kvm *kvm)
>> -{
>> -	return enable_apicv_reg_vid && irqchip_in_kernel(kvm);
>> -}
>> -
>>  static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { 	u32
>>  exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; @@ -4005,8 +4118,7
>>  @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>>  	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
>>  
>>  	/* Control */
>> -	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
>> -		vmcs_config.pin_based_exec_ctrl);
>> +	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
>> 
>>  	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
> vmx_exec_control(vmx));
>> 
>> @@ -4015,13 +4127,17 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
>>  				vmx_secondary_exec_control(vmx));
>>  	}
>> -	if (enable_apicv_reg_vid) {
>> +	if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
>>  		vmcs_write64(EOI_EXIT_BITMAP0, 0);
>>  		vmcs_write64(EOI_EXIT_BITMAP1, 0);
>>  		vmcs_write64(EOI_EXIT_BITMAP2, 0);
>>  		vmcs_write64(EOI_EXIT_BITMAP3, 0);
>>  
>>  		vmcs_write16(GUEST_INTR_STATUS, 0);
>> +
>> +		vmx->pi = kzalloc(sizeof(struct pi_desc), GFP_KERNEL);
>> +		vmcs_write64(POSTED_INTR_NV, POSTED_INTR_VECTOR);
>> +		vmcs_write64(POSTED_INTR_DESC_ADDR, __pa((vmx->pi)));
>>  	}
>>  
>>  	if (ple_gap) { @@ -4171,6 +4287,9 @@ static int vmx_vcpu_reset(struct
>>  kvm_vcpu *vcpu) 		vmcs_write64(APIC_ACCESS_ADDR, 			    
>>  page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
>> +	if (vmx_vm_has_apicv(vcpu->kvm))
>> +		memset(vmx->pi, 0, sizeof(struct pi_desc));
>> +
>>  	if (vmx->vpid != 0)
>>  		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
>> @@ -6746,6 +6865,7 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>> 
>>  	free_vpid(vmx); 	free_nested(vmx); +	free_pi(vmx);
>>  	free_loaded_vmcs(vmx->loaded_vmcs); 	kfree(vmx->guest_msrs);
>>  	kvm_vcpu_uninit(vcpu); @@ -7647,6 +7767,8 @@ static struct
>>  kvm_x86_ops vmx_x86_ops = { 	.load_eoi_exitmap = vmx_load_eoi_exitmap,
>>  	.hwapic_irr_update = vmx_hwapic_irr_update, 	.hwapic_isr_update =
>>  vmx_hwapic_isr_update,
>> +	.sync_pir_to_irr = vmx_sync_pir_to_irr,
>> +	.send_notification_event = vmx_send_notification_event,
>> 
>>  	.set_tss_addr = vmx_set_tss_addr, 	.get_tdp_level = get_ept_level, @@
>>  -7750,7 +7872,7 @@ static int __init vmx_init(void)
>>  	memcpy(vmx_msr_bitmap_longmode_x2apic, 			vmx_msr_bitmap_longmode,
>>  PAGE_SIZE);
>> -	if (enable_apicv_reg_vid) {
>> +	if (enable_apicv) {
>>  		for (msr = 0x800; msr <= 0x8ff; msr++)
>>  			vmx_disable_intercept_msr_read_x2apic(msr);
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 9f25d70..6e1e6e7 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -2681,6 +2681,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>>  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 				   
>>  struct kvm_lapic_state *s) { +	kvm_x86_ops->sync_pir_to_irr(vcpu);
>>  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
>>  
>>  	return 0; @@ -5698,6 +5699,9 @@ static int vcpu_enter_guest(struct
>>  kvm_vcpu *vcpu) 			kvm_deliver_pmi(vcpu); 		if
>>  (kvm_check_request(KVM_REQ_EOIBITMAP, vcpu))
>>  			update_eoi_exitmap(vcpu);
>> +		if (kvm_check_request(KVM_REQ_PENDING_PIR, vcpu))
>> +			if (kvm_x86_ops->sync_pir_to_irr(vcpu))
>> +				kvm_make_request(KVM_REQ_EVENT, vcpu);
>>  	}
>>  
>>  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
>> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
>> index 0350e0d..a410819 100644
>> --- a/include/linux/kvm_host.h
>> +++ b/include/linux/kvm_host.h
>> @@ -124,6 +124,7 @@ static inline bool is_error_page(struct page *page)
>>  #define KVM_REQ_MCLOCK_INPROGRESS 20
>>  #define KVM_REQ_EPR_EXIT          21
>>  #define KVM_REQ_EOIBITMAP         22
>> +#define KVM_REQ_PENDING_PIR	  23
>> 
>>  #define KVM_USERSPACE_IRQ_SOURCE_ID		0
>>  #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
>> --
>> 1.7.1
> 
> --
> 			Gleb.


Best regards,
Yang


--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html