Re: [v3 24/26] KVM: Update Posted-Interrupts Descriptor when vCPU is blocked

Marcelo Tosatti <mtosatti@xxxxxxxxxx> · Wed, 11 Mar 2015 22:15:26 -0300

On Fri, Mar 06, 2015 at 06:51:52AM +0000, Wu, Feng wrote:
> 
> 
> > -----Original Message-----
> > From: Marcelo Tosatti [mailto:mtosatti@xxxxxxxxxx]
> > Sent: Wednesday, March 04, 2015 8:06 PM
> > To: Wu, Feng
> > Cc: tglx@xxxxxxxxxxxxx; mingo@xxxxxxxxxx; hpa@xxxxxxxxx; x86@xxxxxxxxxx;
> > gleb@xxxxxxxxxx; pbonzini@xxxxxxxxxx; dwmw2@xxxxxxxxxxxxx;
> > joro@xxxxxxxxxx; alex.williamson@xxxxxxxxxx; jiang.liu@xxxxxxxxxxxxxxx;
> > eric.auger@xxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx;
> > iommu@xxxxxxxxxxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx
> > Subject: Re: [v3 24/26] KVM: Update Posted-Interrupts Descriptor when vCPU
> > is blocked
> > 
> > On Mon, Mar 02, 2015 at 01:36:51PM +0000, Wu, Feng wrote:
> > >
> > >
> > > > -----Original Message-----
> > > > From: Marcelo Tosatti [mailto:mtosatti@xxxxxxxxxx]
> > > > Sent: Friday, February 27, 2015 7:41 AM
> > > > To: Wu, Feng
> > > > Cc: tglx@xxxxxxxxxxxxx; mingo@xxxxxxxxxx; hpa@xxxxxxxxx;
> > x86@xxxxxxxxxx;
> > > > gleb@xxxxxxxxxx; pbonzini@xxxxxxxxxx; dwmw2@xxxxxxxxxxxxx;
> > > > joro@xxxxxxxxxx; alex.williamson@xxxxxxxxxx; jiang.liu@xxxxxxxxxxxxxxx;
> > > > eric.auger@xxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx;
> > > > iommu@xxxxxxxxxxxxxxxxxxxxxxxxxx; kvm@xxxxxxxxxxxxxxx
> > > > Subject: Re: [v3 24/26] KVM: Update Posted-Interrupts Descriptor when
> > vCPU
> > > > is blocked
> > > >
> > > > On Fri, Dec 12, 2014 at 11:14:58PM +0800, Feng Wu wrote:
> > > > > This patch updates the Posted-Interrupts Descriptor when vCPU
> > > > > is blocked.
> > > > >
> > > > > pre-block:
> > > > > - Add the vCPU to the blocked per-CPU list
> > > > > - Clear 'SN'
> > > > > - Set 'NV' to POSTED_INTR_WAKEUP_VECTOR
> > > > >
> > > > > post-block:
> > > > > - Remove the vCPU from the per-CPU list
> > > > >
> > > > > Signed-off-by: Feng Wu <feng.wu@xxxxxxxxx>
> > > > > ---
> > > > >  arch/x86/include/asm/kvm_host.h |  2 +
> > > > >  arch/x86/kvm/vmx.c              | 96
> > > > +++++++++++++++++++++++++++++++++++++++++
> > > > >  arch/x86/kvm/x86.c              | 22 +++++++---
> > > > >  include/linux/kvm_host.h        |  4 ++
> > > > >  virt/kvm/kvm_main.c             |  6 +++
> > > > >  5 files changed, 123 insertions(+), 7 deletions(-)
> > > > >
> > > > > diff --git a/arch/x86/include/asm/kvm_host.h
> > > > b/arch/x86/include/asm/kvm_host.h
> > > > > index 13e3e40..32c110a 100644
> > > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > > @@ -101,6 +101,8 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t
> > > > base_gfn, int level)
> > > > >
> > > > >  #define ASYNC_PF_PER_VCPU 64
> > > > >
> > > > > +extern void (*wakeup_handler_callback)(void);
> > > > > +
> > > > >  enum kvm_reg {
> > > > >  	VCPU_REGS_RAX = 0,
> > > > >  	VCPU_REGS_RCX = 1,
> > > > > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > > > > index bf2e6cd..a1c83a2 100644
> > > > > --- a/arch/x86/kvm/vmx.c
> > > > > +++ b/arch/x86/kvm/vmx.c
> > > > > @@ -832,6 +832,13 @@ static DEFINE_PER_CPU(struct vmcs *,
> > > > current_vmcs);
> > > > >  static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
> > > > >  static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
> > > > >
> > > > > +/*
> > > > > + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
> > > > > + * can find which vCPU should be waken up.
> > > > > + */
> > > > > +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
> > > > > +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
> > > > > +
> > > > >  static unsigned long *vmx_io_bitmap_a;
> > > > >  static unsigned long *vmx_io_bitmap_b;
> > > > >  static unsigned long *vmx_msr_bitmap_legacy;
> > > > > @@ -1921,6 +1928,7 @@ static void vmx_vcpu_load(struct kvm_vcpu
> > *vcpu,
> > > > int cpu)
> > > > >  		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > > > >  		struct pi_desc old, new;
> > > > >  		unsigned int dest;
> > > > > +		unsigned long flags;
> > > > >
> > > > >  		memset(&old, 0, sizeof(old));
> > > > >  		memset(&new, 0, sizeof(new));
> > > > > @@ -1942,6 +1950,20 @@ static void vmx_vcpu_load(struct kvm_vcpu
> > > > *vcpu, int cpu)
> > > > >  			new.nv = POSTED_INTR_VECTOR;
> > > > >  		} while (cmpxchg(&pi_desc->control, old.control,
> > > > >  				new.control) != old.control);
> > > > > +
> > > > > +		/*
> > > > > +		 * Delete the vCPU from the related wakeup queue
> > > > > +		 * if we are resuming from blocked state
> > > > > +		 */
> > > > > +		if (vcpu->blocked) {
> > > > > +			vcpu->blocked = false;
> > > > > +			spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> > > > > +				vcpu->wakeup_cpu), flags);
> > > > > +			list_del(&vcpu->blocked_vcpu_list);
> > > > > +
> > 	spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
> > > > > +				vcpu->wakeup_cpu), flags);
> > > > > +			vcpu->wakeup_cpu = -1;
> > > > > +		}
> > > > >  	}
> > > > >  }
> > > > >
> > > > > @@ -1950,6 +1972,9 @@ static void vmx_vcpu_put(struct kvm_vcpu
> > *vcpu)
> > > > >  	if (irq_remapping_cap(IRQ_POSTING_CAP)) {
> > > > >  		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > > > >  		struct pi_desc old, new;
> > > > > +		unsigned long flags;
> > > > > +		int cpu;
> > > > > +		struct cpumask cpu_others_mask;
> > > > >
> > > > >  		memset(&old, 0, sizeof(old));
> > > > >  		memset(&new, 0, sizeof(new));
> > > > > @@ -1961,6 +1986,54 @@ static void vmx_vcpu_put(struct kvm_vcpu
> > > > *vcpu)
> > > > >  				pi_set_sn(&new);
> > > > >  			} while (cmpxchg(&pi_desc->control, old.control,
> > > > >  					new.control) != old.control);
> > > > > +		} else if (vcpu->blocked) {
> > > > > +			/*
> > > > > +			 * The vcpu is blocked on the wait queue.
> > > > > +			 * Store the blocked vCPU on the list of the
> > > > > +			 * vcpu->wakeup_cpu, which is the destination
> > > > > +			 * of the wake-up notification event.
> > > > > +			 */
> > > > > +			vcpu->wakeup_cpu = vcpu->cpu;
> > > > > +			spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> > > > > +					  vcpu->wakeup_cpu), flags);
> > > > > +			list_add_tail(&vcpu->blocked_vcpu_list,
> > > > > +				      &per_cpu(blocked_vcpu_on_cpu,
> > > > > +				      vcpu->wakeup_cpu));
> > > > > +			spin_unlock_irqrestore(
> > > > > +					&per_cpu(blocked_vcpu_on_cpu_lock,
> > > > > +					vcpu->wakeup_cpu), flags);
> > > > > +
> > > > > +			do {
> > > > > +				old.control = new.control = pi_desc->control;
> > > > > +
> > > > > +				/*
> > > > > +				 * We should not block the vCPU if
> > > > > +				 * an interrupt is posted for it.
> > > > > +				 */
> > > > > +				if (pi_test_on(pi_desc) == 1) {
> > > > > +					/*
> > > > > +					 * We need schedule the wakeup worker
> > > > > +					 * on a different cpu other than
> > > > > +					 * vcpu->cpu, because in some case,
> > > > > +					 * schedule_work() will call
> > > > > +					 * try_to_wake_up() which needs acquire
> > > > > +					 * the rq lock. This can cause deadlock.
> > > > > +					 */
> > > > > +					cpumask_copy(&cpu_others_mask,
> > > > > +						     cpu_online_mask);
> > > > > +					cpu_clear(vcpu->cpu, cpu_others_mask);
> > > > > +					cpu = any_online_cpu(cpu_others_mask);
> > > > > +
> > > > > +					schedule_work_on(cpu,
> > > > > +							 &vcpu->wakeup_worker);
> > > > > +				}
> > > > > +
> > > > > +				pi_clear_sn(&new);
> > > > > +
> > > > > +				/* set 'NV' to 'wakeup vector' */
> > > > > +				new.nv = POSTED_INTR_WAKEUP_VECTOR;
> > > > > +			} while (cmpxchg(&pi_desc->control, old.control,
> > > > > +				new.control) != old.control);
> > > > >  		}
> > > >
> > > > This can be done exclusively on HLT emulation, correct? (that is, on
> > > > entry to HLT and exit from HLT).
> > >
> > > Do you mean the following?
> > > In kvm_emulate_halt(), we do:
> > > 1. Add vCPU in the blocking list
> > > 2. Clear 'SN'
> > > 3. set 'NV' to POSTED_INTR_WAKEUP_VECTOR
> > >
> > > In __vcpu_run(), after kvm_vcpu_block(), we remove the vCPU from the
> > > Bloc king list.
> > 
> > Yes (please check its OK to do this...).
> 
> I think about this for some time, and I feel this may be another solution
> to implement it. Do you mind sharing your ideas about why do you think
> this alternative is better than the current one? Thanks a lot!

Two reasons:

1) Because it does not add overhead to vcpu_puts thats are not due to
HLT. Doing so removes the "vcpu->blocked" variable (its implicit in the 
code anyway).
2) Easier to spot races.

Do you have any reason why having the code at vcpu_put/vcpu_load is     
better than the proposal to have the code at kvm_vcpu_block?

> > > > If the vcpu is scheduled out for any other reason (transition to
> > > > userspace or transition to other thread), it will eventually resume
> > > > execution. And in that case, continuation of execution does not depend
> > > > on the event (VT-d interrupt) notification.
> > >
> > > Yes, I think this is true for my current implementation, right?
> > >
> > > >
> > > > There is a race window with the code above, I believe.
> > >
> > > I did careful code review back and forth for the above code, It will
> > > be highly appreciated if you can point out the race window!
> > 
> > So the remapping HW sees either POSTED_INTR_VECTOR or
> > POSTED_INTR_WAKEUP_VECTOR.
> > 
> > You should:
> > 
> > 1. Set POSTED_INTR_WAKEUP_VECTOR.
> > 2. Check for PIR / ON bit, which might have been set by
> > POSTED_INTR_VECTOR notification.
> > 3. emulate HLT.
> 
> My original idea for pre-block operation is:
> 1. Add vCPU to the per-cpu blocking list. Here is the code for this in my patch:
> +                       vcpu->wakeup_cpu = vcpu->cpu;
> +                       spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> +                                         vcpu->wakeup_cpu), flags);
> +                       list_add_tail(&vcpu->blocked_vcpu_list,
> +                                     &per_cpu(blocked_vcpu_on_cpu,
> +                                     vcpu->wakeup_cpu));
> +                       spin_unlock_irqrestore(
> +                                       &per_cpu(blocked_vcpu_on_cpu_lock,
> +                                       vcpu->wakeup_cpu), flags);
> 2. Update Posted-interrupt descriptor, here is the code in my patch:
> +                       do {
> +                               old.control = new.control = pi_desc->control;
> +
> +                               /*
> +                                * We should not block the vCPU if
> +                                * an interrupt is posted for it.
> +                                */
> +                               if (pi_test_on(pi_desc) == 1) {
> +                                       /*
> +                                        * We need schedule the wakeup worker
> +                                        * on a different cpu other than
> +                                        * vcpu->cpu, because in some case,
> +                                        * schedule_work() will call
> +                                        * try_to_wake_up() which needs acquire
> +                                        * the rq lock. This can cause deadlock.
> +                                        */
> +                                       cpumask_copy(&cpu_others_mask,
> +                                                    cpu_online_mask);
> +                                       cpu_clear(vcpu->cpu, cpu_others_mask);
> +                                       cpu = any_online_cpu(cpu_others_mask);
> +
> +                                       schedule_work_on(cpu,
> +                                                        &vcpu->wakeup_worker);
> +                               }
> +
> +                               WARN((pi_desc->sn == 1),
> +                                    "Warning: SN field of posted-interrupts "
> +                                    "is set before blocking\n");
> +
> +                               /* set 'NV' to 'wakeup vector' */
> +                               new.nv = POSTED_INTR_WAKEUP_VECTOR;
> +                       } while (cmpxchg(&pi_desc->control, old.control,
> +                               new.control) != old.control);
> 
> If PIR/ON bit is set by POSTED_INTR_VECTOR notification during the above operation, we will stop
> blocking the vCPU like about. But seems I missed something in the above code which should be in
> my mind from the beginning, I should add a 'break' in the end the above ' if (pi_test_on(pi_desc) == 1) {}',
> so in this case, the 'NV' filed remains unchanged.

Right have to think carefully about all cases.

> > > > >  	}
> > > > >
> > > > > @@ -2842,6 +2915,8 @@ static int hardware_enable(void)
> > > > >  		return -EBUSY;
> > > > >
> > > > >  	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
> > > > > +	INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
> > > > > +	spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
> > > > >
> > > > >  	/*
> > > > >  	 * Now we can enable the vmclear operation in kdump
> > > > > @@ -9315,6 +9390,25 @@ static struct kvm_x86_ops vmx_x86_ops = {
> > > > >  	.pi_set_sn = vmx_pi_set_sn,
> > > > >  };
> > > > >
> > > > > +/*
> > > > > + * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
> > > > > + */
> > > > > +void wakeup_handler(void)
> > > > > +{
> > > > > +	struct kvm_vcpu *vcpu;
> > > > > +	int cpu = smp_processor_id();
> > > > > +
> > > > > +	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
> > > > > +	list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
> > > > > +			blocked_vcpu_list) {
> > > > > +		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > > > > +
> > > > > +		if (pi_test_on(pi_desc) == 1)
> > > > > +			kvm_vcpu_kick(vcpu);
> > > > > +	}
> > > > > +	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
> > > > > +}
> > > >
> > > > Looping through all blocked vcpus does not scale:
> > > > Can you allocate more vectors and then multiplex those
> > > > vectors amongst the HLT'ed vcpus?
> > >
> > > I am a little confused about this, can you elaborate it a bit more?
> > > Thanks a lot!
> > 
> > Picture the following overcommitment scenario:
> > 
> > * High ratio of vCPUs/pCPUs, in the ratio 128/1 (this is exaggerated
> > to demonstrate the issue).
> > * Every VT-d interrupt is going to scan 128 entries in the list.
> > 
> > Moreover, the test:
> > 
> > 		if (pi_test_on(pi_desc) == 1)
> > 			kvm_vcpu_kick(vcpu);
> > 
> > Can trigger for vCPUs which have not been waken up due
> > to VT-d interrupts, but for other interrupts.
> > 
> > You can allocate, say 16 vectors on the pCPU for VT-d interrupts:
> > 
> > POSTED_INTERRUPT_WAKEUP_VECTOR_1,
> > POSTED_INTERRUPT_WAKEUP_VECTOR_2,
> > ...
> > 
> 
> Global vector is a limited resources in the system, and this involves
> common x86 interrupt code changes. I am not sure we can allocate
> so many dedicated global vector for KVM usage.

Why not? Have KVM use all free vectors (so if vectors are necessary for
other purposes, people should shrink the KVM vector pool).

BTW the Intel docs talk about that ("one vector per vCPU").

> > > > It seems there is a bunch free:
> > > >
> > > > commit 52aec3308db85f4e9f5c8b9f5dc4fbd0138c6fa4
> > > > Author: Alex Shi <alex.shi@xxxxxxxxx>
> > > > Date:   Thu Jun 28 09:02:23 2012 +0800
> > > >
> > > >     x86/tlb: replace INVALIDATE_TLB_VECTOR by
> > CALL_FUNCTION_VECTOR
> > > >
> > > > Can you add only vcpus which have posted IRTEs that point to this pCPU
> > > > to the HLT'ed vcpu lists? (so for example, vcpus without assigned
> > > > devices are not part of the list).
> > >
> > > Is it easy to find whether a vCPU (or the associated domain) has assigned
> > devices?
> > > If so, we can only add those vCPUs with assigned devices.
> > 
> > When configuring IRTE, at kvm_arch_vfio_update_pi_irte?
> 
> Yes.
> 
> > 
> > > > > +
> > > > >  static int __init vmx_init(void)
> > > > >  {
> > > > >  	int r, i, msr;
> > > > > @@ -9429,6 +9523,8 @@ static int __init vmx_init(void)
> > > > >
> > > > >  	update_ple_window_actual_max();
> > > > >
> > > > > +	wakeup_handler_callback = wakeup_handler;
> > > > > +
> > > > >  	return 0;
> > > > >
> > > > >  out7:
> > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > > index 0033df3..1551a46 100644
> > > > > --- a/arch/x86/kvm/x86.c
> > > > > +++ b/arch/x86/kvm/x86.c
> > > > > @@ -6152,6 +6152,21 @@ static int vcpu_enter_guest(struct kvm_vcpu
> > > > *vcpu)
> > > > >  			kvm_vcpu_reload_apic_access_page(vcpu);
> > > > >  	}
> > > > >
> > > > > +	/*
> > > > > +	 * Since posted-interrupts can be set by VT-d HW now, in this
> > > > > +	 * case, KVM_REQ_EVENT is not set. We move the following
> > > > > +	 * operations out of the if statement.
> > > > > +	 */
> > > > > +	if (kvm_lapic_enabled(vcpu)) {
> > > > > +		/*
> > > > > +		 * Update architecture specific hints for APIC
> > > > > +		 * virtual interrupt delivery.
> > > > > +		 */
> > > > > +		if (kvm_x86_ops->hwapic_irr_update)
> > > > > +			kvm_x86_ops->hwapic_irr_update(vcpu,
> > > > > +				kvm_lapic_find_highest_irr(vcpu));
> > > > > +	}
> > > > > +
> > > >
> > > > This is a hot fast path. You can set KVM_REQ_EVENT from wakeup_handler.
> > >
> > > I am afraid Setting KVM_REQ_EVENT from wakeup_handler doesn't help
> > much,
> > > if vCPU is running in ROOT mode, and VT-d hardware issues an notification
> > event,
> > > POSTED_INTR_VECTOR interrupt handler will be called.
> > 
> > If vCPU is in root mode, remapping HW will find IRTE configured with
> > vector == POSTED_INTR_WAKEUP_VECTOR, use that vector, which will
> > VM-exit, and execute the interrupt handler wakeup_handler. Right?
> 
> There are two cases:
> Case 1: vCPU is blocked, so it is in root mode, this is what you described above.
> Case 2, vCPU is running in root mode, such as, handling vm-exits, in this case,
> the notification vector is 'POSTED_INTR_VECTOR', and if external interrupts
> from assigned devices happen, the handled of 'POSTED_INTR_VECTOR' will
> be called ( it is 'smp_kvm_posted_intr_ipi' in fact), this routine doesn't need
> do real things, since the pending interrupts in PIR will be synced to vIRR before
> VM-Entry (this code have already been there when enabling CPU-side
> posted-interrupt along with APICv). Like what I said before, it is a little hard to
> get vCPU related information in it, even if we get, it is not accurate and may harm
> the performance.(need search)
> 
> So only setting KVM_REQ_EVENT in wakeup_handler cannot cover the notification
> event for 'POSTED_INTR_VECTOR'.
> 
> > 
> > The point of this comment is that you can keep the
> > 
> > "if (kvm_x86_ops->hwapic_irr_update)
> > 	kvm_x86_ops->hwapic_irr_update(vcpu,
> > 			kvm_lapic_find_highest_irr(vcpu));
> > "
> > 
> > Code inside KVM_REQ_EVENT handling section of vcpu_run, as long as
> > wakeup_handler sets KVM_REQ_EVENT.
> 
> Please see above.

OK can you set KVM_REQ_EVENT in case the ON bit is set,
after disabling interrupts ?

kvm_lapic_find_highest_irr(vcpu) eats some cache 
(4 cachelines) versus 1 cacheline for reading ON bit.

> > > > Please remove blocked and wakeup_cpu, they should not be necessary.
> > >
> > > Why do you think wakeup_cpu is not needed, when vCPU is blocked,
> > > wakeup_cpu saves the cpu which the vCPU is blocked on, after vCPU
> > > is woken up, it can run on a different cpu, so we need wakeup_cpu to
> > > find the right list to wake up the vCPU.
> > 
> > If the vCPU was moved it should have updated IRTE destination field
> > to the pCPU which it has moved to?
> 
> Every time a vCPU is scheduled to a new pCPU, the IRTE destination filed
> would be updated accordingly. 
> 
> When vCPU is blocked. To wake up the blocked vCPU, we need to find which
> list the vCPU is blocked on, and this is what wakeup_cpu used for?

Right, perhaps prev_vcpu is a better name.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html