Re: [EXTERNAL] [PATCH] KVM: x86/xen: Fix runstate updates to be atomic when preempting vCPU

On 10/28/21 23:22, David Woodhouse wrote:
> On Mon, 2021-10-25 at 13:19 +0100, David Woodhouse wrote:
>> On Mon, 2021-10-25 at 11:39 +0100, David Woodhouse wrote:
>>>> One possible solution (which I even have unfinished patches for) is to
>>>> put all the gfn_to_pfn_caches on a list, and refresh them when the MMU
>>>> notifier receives an invalidation.
>>>
>>> For this use case I'm not even sure why I'd *want* to cache the PFN and
>>> explicitly kmap/memremap it, when surely by *definition* there's a
>>> perfectly serviceable HVA which already points to it?
>>
>> That's indeed true for *this* use case but my *next* use case is
>> actually implementing the event channel delivery.
>>
>> What we have in-kernel already is everything we absolutely *need* in
>> order to host Xen guests, but I really do want to fix the fact that
>> even IPIs and timers are bouncing up through userspace.
> 
> Here's a completely untested attempt, in which all the complexity is
> based around the fact that I can't just pin the pages as João and
> Ankur's original did.
> 
> It adds a new KVM_IRQ_ROUTING_XEN_EVTCHN with an ABI that allows for us
> to add FIFO event channels, but for now only supports 2 level.
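(Noting for other readers: going by the uses of e->xen_evtchn.{port,vcpu,priority}
in kvm_xen_setup_evtchn() further down, the new routing entry looks roughly like
the following -- a sketch inferred from the patch, not the final uapi:)

	struct kvm_xen_evtchn {
		__u32 port;	/* guest event channel port number */
		__u32 vcpu;	/* target vCPU id for 2-level delivery */
		__u32 priority;	/* only KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL for now */
	};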
> 
> In kvm_xen_set_evtchn() I currently use kvm_map_gfn() *without* a cache
> at all, but I'll work something out for that. I think I can use a
> gfn_to_hva_cache (like the one removed in commit 319afe685) and in the
> rare case that it's invalid, I can take kvm->lock to revalidate it.
> 
> It sets the bit in the global shared info but doesn't touch the target
> vCPU's vcpu_info; instead it sets a bit in an *in-kernel* shadow of the
> target's evtchn_pending_sel word, and kicks the vCPU.
> 
> That shadow is actually synced to the guest's vcpu_info struct in
> kvm_xen_has_interrupt(). There's a little bit of fun asm there to set
> the bits in the userspace struct and then clear the same set of bits in
> the kernel shadow *if* the first op didn't fault. Or such is the
> intent; I didn't hook up a test yet.
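(If I'm reading the intent right, in C terms the asm below amounts to roughly
this -- a sketch only, where both helpers are made-up stand-ins for the LOCK OR
/ LOCK AND pair plus the exception-table fixup:)

	unsigned long sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);

	/* Latch the shadow bits into the guest's vcpu_info; may fault. */
	if (!user_atomic_or(&vi->evtchn_pending_sel, sel))
		/* Only on success drop those bits from the kernel shadow;
		 * on a fault they stay latched and we retry next time. */
		kernel_atomic_andnot(&v->arch.xen.evtchn_pending_sel, sel);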
> 
> As things stand, I should be able to use this for delivery of PIRQs
> from my VMM, where things like passed-through PCI MSI gets turned into
> Xen event channels. As well as KVM unit tests, of course.
> 
Cool stuff!! I remember we only made IPIs and timers work, but not PIRQ
event channels.

> The plan is then to hook up IPIs and timers — again based on the Oracle
> code from before, but using eventfds for the actual evtchn delivery. 
> 
I recall the eventfd_signal() was there should the VMM choose to supply
an eventfd. But working without one was mainly for IPIs/timers, for
performance reasons (avoiding the call to eventfd_signal()). We saw some
slight overhead there -- but I can't find the data right now :(
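FWIW, from the VMM side I'd expect the wiring to look something like this
(a sketch against the proposed ABI -- KVM_SET_GSI_ROUTING and KVM_IRQFD
are the existing ioctls, while the xen_evtchn routing fields are the new
ones from this patch and may still change):

	/* Route a GSI to a Xen event channel port... */
	struct kvm_irq_routing *routing;

	routing = calloc(1, sizeof(*routing) +
			    sizeof(struct kvm_irq_routing_entry));
	routing->nr = 1;
	routing->entries[0] = (struct kvm_irq_routing_entry) {
		.gsi = gsi,
		.type = KVM_IRQ_ROUTING_XEN_EVTCHN,
		.u.xen_evtchn.port = port,
		.u.xen_evtchn.vcpu = vcpu_id,
		.u.xen_evtchn.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
	};
	ioctl(vm_fd, KVM_SET_GSI_ROUTING, routing);

	/* ...then attach an eventfd, so a write to it raises the port. */
	struct kvm_irqfd irqfd = { .fd = evtchn_eventfd, .gsi = gsi };
	ioctl(vm_fd, KVM_IRQFD, &irqfd);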

> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> index c4bca001a7c9..bff5c458af96 100644
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -207,6 +207,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
>  
>  int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
>  {
> +	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
> +	bool atomic = in_atomic() || !task_is_running(current);
>  	int err;
>  	u8 rc = 0;
>  
> @@ -216,6 +218,9 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
>  	 */
>  	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
>  	struct kvm_memslots *slots = kvm_memslots(v->kvm);
> +	bool ghc_valid = slots->generation == ghc->generation &&
> +		!kvm_is_error_hva(ghc->hva) && ghc->memslot;
> +
>  	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
>  
>  	/* No need for compat handling here */
> @@ -231,8 +236,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
>  	 * cache in kvm_read_guest_offset_cached(), but just uses
>  	 * __get_user() instead. And falls back to the slow path.
>  	 */
> -	if (likely(slots->generation == ghc->generation &&
> -		   !kvm_is_error_hva(ghc->hva) && ghc->memslot)) {
> +	if (!evtchn_pending_sel && ghc_valid) {
>  		/* Fast path */
>  		pagefault_disable();
>  		err = __get_user(rc, (u8 __user *)ghc->hva + offset);
> @@ -251,12 +255,72 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
>  	 * and we'll end up getting called again from a context where we *can*
>  	 * fault in the page and wait for it.
>  	 */
> -	if (in_atomic() || !task_is_running(current))
> +	if (atomic)
>  		return 1;
>  
> -	kvm_read_guest_offset_cached(v->kvm, ghc, &rc, offset,
> -				     sizeof(rc));
> +	if (!ghc_valid) {
> +		err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
> +		if (err || !ghc->memslot) {
> +			/*
> +			 * If this failed, userspace has screwed up the
> +			 * vcpu_info mapping. No interrupts for you.
> +			 */
> +			return 0;
> +		}
> +	}
>  
> +	/*
> +	 * Now we have a valid (protected by srcu) userspace HVA in
> +	 * ghc->hva which points to the struct vcpu_info. If there
> +	 * are any bits in the in-kernel evtchn_pending_sel then
> +	 * we need to write those to the guest vcpu_info and set
> +	 * its evtchn_upcall_pending flag. If there aren't any bits
> +	 * to add, we only want to *check* evtchn_upcall_pending.
> +	 */
> +	if (evtchn_pending_sel) {
> +		if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
> +			struct vcpu_info __user *vi = (void __user *)ghc->hva;
> +
> +			/* Attempt to set the evtchn_pending_sel bits in the
> +			 * guest, and if that succeeds then clear the same
> +			 * bits in the in-kernel version. */
> +			asm volatile("1:\t" LOCK_PREFIX "orq %1, %0\n"
> +				     "\tnotq %0\n"
> +				     "\t" LOCK_PREFIX "andq %2, %0\n"
> +				     "2:\n"
> +				     "\t.section .fixup,\"ax\"\n"
> +				     "3:\tjmp\t2b\n"
> +				     "\t.previous\n"
> +				     _ASM_EXTABLE_UA(1b, 3b)
> +				     : "=r" (evtchn_pending_sel)
> +				     : "m" (vi->evtchn_pending_sel),
> +				       "m" (v->arch.xen.evtchn_pending_sel),
> +				       "0" (evtchn_pending_sel));
> +		} else {
> +			struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
> +			u32 evtchn_pending_sel32 = evtchn_pending_sel;
> +
> +			/* Attempt to set the evtchn_pending_sel bits in the
> +			 * guest, and if that succeeds then clear the same
> +			 * bits in the in-kernel version. */
> +			asm volatile("1:\t" LOCK_PREFIX "orl %1, %0\n"
> +				     "\tnotl %0\n"
> +				     "\t" LOCK_PREFIX "andl %2, %0\n"
> +				     "2:\n"
> +				     "\t.section .fixup,\"ax\"\n"
> +				     "3:\tjmp\t2b\n"
> +				     "\t.previous\n"
> +				     _ASM_EXTABLE_UA(1b, 3b)
> +				     : "=r" (evtchn_pending_sel32)
> +				     : "m" (vi->evtchn_pending_sel),
> +				       "m" (v->arch.xen.evtchn_pending_sel),
> +				       "0" (evtchn_pending_sel32));
> +		}

Perhaps I am not reading it right (or I forgot), but don't you need to use
the vcpu_info embedded in the shared_info page when the guest hasn't
explicitly registered a *separate* vcpu_info?

Or maybe this relies on an API contract (?) that the VMM must register the
vcpu_info in addition to the shared_info, regardless of whether the Xen
guest explicitly asked for a separate vcpu_info. If so, it might be worth
a comment...


> +		rc = 1;
> +		__put_user(rc, (u8 __user *)ghc->hva + offset);
> +	} else {
> +		__get_user(rc, (u8 __user *)ghc->hva + offset);
> +	}
>  	return rc;
>  }
>  
> @@ -772,3 +836,105 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
>  
>  	return 0;
>  }
> +
> +static inline int max_evtchn_port(struct kvm *kvm)
> +{
> +	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
> +		return 4096;
> +	else
> +		return 1024;
> +}
> +

Maybe worth using the Xen ABI interface macros that tie this to the Xen
guest ABI -- in particular this one from
include/xen/interface/event_channel.h:

#define EVTCHN_2L_NR_CHANNELS (sizeof(xen_ulong_t) * sizeof(xen_ulong_t) * 64)

Sadly it doesn't cover the 32-bit case :( given the xen_ulong_t.
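Something like this, perhaps (a sketch; COMPAT_EVTCHN_2L_NR_CHANNELS is a
name I'm making up for the missing 32-bit variant):

	/* The 2-level limits fall out of the ABI shape: bits in one
	 * evtchn_pending_sel word, times bits per selected pending word.
	 * 64-bit: 64 * 64 = 4096; 32-bit: 32 * 32 = 1024. */
	#define COMPAT_EVTCHN_2L_NR_CHANNELS (sizeof(u32) * sizeof(u32) * 64)

	static inline int max_evtchn_port(struct kvm *kvm)
	{
		if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
			return EVTCHN_2L_NR_CHANNELS;		/* 8*8*64 == 4096 */
		else
			return COMPAT_EVTCHN_2L_NR_CHANNELS;	/* 4*4*64 == 1024 */
	}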

> +int kvm_xen_setup_evtchn(struct kvm *kvm,
> +			 struct kvm_kernel_irq_routing_entry *e,
> +			 const struct kvm_irq_routing_entry *ue)
> +
> +{
> +	struct kvm_vcpu *vcpu;
> +
> +	if (kvm->arch.xen.shinfo_gfn == GPA_INVALID)
> +		return -EINVAL;
> +
> +	if (ue->u.xen_evtchn.vcpu >= KVM_MAX_VCPUS)
> +		return -EINVAL;
> +
> +	vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
> +	if (!vcpu)
> +		return -EINVAL;
> +
> +	if (!vcpu->arch.xen.vcpu_info_set)
> +		return -EINVAL;
> +
> +	if (!kvm->arch.xen.upcall_vector)
> +		return -EINVAL;
> +
> +	/* Once we support the per-vCPU LAPIC based vector we will permit
> +	 * that here instead of the per-KVM upcall vector */
> +
> +	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
> +		return -EINVAL;
> +
> +	/* We only support 2 level event channels for now */
> +	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
> +		return -EINVAL;
> +



