[PATCH v2 05/17] KVM: x86/xen: Use gfn_to_pfn_cache for vcpu_info

From: David Woodhouse <dwmw@xxxxxxxxxxxx>

Currently, the fast path of kvm_xen_set_evtchn_fast() doesn't set the
index bits in the target vCPU's evtchn_pending_sel, because it only has
a userspace virtual address with which to do so, and it may be running
in a context where it cannot fault that address in. Instead, it sets
the bits in a kernel-side shadow copy, and kvm_xen_has_interrupt() then
completes the delivery into the real vcpu_info structure when the vCPU
next runs.

Using a gfn_to_pfn_cache allows kvm_xen_set_evtchn_fast() to do the full
delivery in the common case.

Clean up the fallback case too, by moving the deferred delivery out
into a separate kvm_xen_inject_pending_events() function which, unlike
__kvm_xen_has_interrupt(), is never called from atomic context.

Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
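Both kvm_xen_inject_pending_events() and the reworked
__kvm_xen_has_interrupt() use the same gfn_to_pfn_cache check/refresh
dance. A minimal sketch of that pattern, factored into a hypothetical
helper which is not part of this patch (it only reuses the
kvm_gfn_to_pfn_cache_check()/_refresh() API that the diff below already
calls):

	/*
	 * Hypothetical helper, not in the patch: illustrates the locking
	 * pattern. The mapping may only be dereferenced with gpc->lock
	 * held for read; a refresh may sleep, so the lock must be
	 * dropped around it.
	 */
	static bool xen_vcpu_info_lock(struct kvm_vcpu *v, unsigned long *flags)
	{
		struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;

		read_lock_irqsave(&gpc->lock, *flags);
		while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
						   sizeof(struct vcpu_info))) {
			read_unlock_irqrestore(&gpc->lock, *flags);

			/* May sleep, so only legal from non-atomic context */
			if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
							 sizeof(struct vcpu_info)))
				return false;	/* Userspace broke the mapping */

			/* Retake the lock and re-check; it may be stale again */
			read_lock_irqsave(&gpc->lock, *flags);
		}

		/* Success: gpc->khva is a valid kernel pointer to vcpu_info */
		return true;
	}

On success the caller accesses the mapping through gpc->khva and then
drops gpc->lock with read_unlock_irqrestore().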
 arch/x86/include/asm/kvm_host.h |   3 +-
 arch/x86/kvm/x86.c              |   9 +-
 arch/x86/kvm/xen.c              | 242 +++++++++++++++++---------------
 arch/x86/kvm/xen.h              |  20 ++-
 4 files changed, 154 insertions(+), 120 deletions(-)
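The LOCK-prefixed orq/notq/andq sequence in
kvm_xen_inject_pending_events() (and its orl/notl/andl twin for compat
guests) transfers the selector bits in two atomic steps: publish them
into the guest-visible vcpu_info, then clear exactly the published bits
from the kernel-side shadow. Clearing with ~sel instead of storing zero
means a bit set by another CPU between the two steps survives for a
later pass. Ignoring the atomicity, a plain C rendering of the 64-bit
case (the function name is illustrative, not in the patch):

	/*
	 * Illustrative C equivalent of the 64-bit asm; the real code
	 * uses LOCK-prefixed instructions so that the OR into the guest
	 * and the AND on the kernel shadow are each atomic.
	 */
	static void transfer_pending_sel(struct vcpu_info *vi, struct kvm_vcpu *v)
	{
		unsigned long sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);

		vi->evtchn_pending_sel |= sel;		/* orq: publish to guest */
		v->arch.xen.evtchn_pending_sel &= ~sel;	/* andq: clear only those */
		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
	}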

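From the userspace side nothing changes: the VMM still registers the
vcpu_info address through the existing vCPU attribute, and with this
patch that registration initialises a gfn_to_pfn_cache rather than a
gfn_to_hva_cache. A minimal sketch of the call, assuming vcpu_fd is an
open KVM vCPU file descriptor:

	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int set_vcpu_info(int vcpu_fd, __u64 gpa)
	{
		struct kvm_xen_vcpu_attr va = {
			.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
			/* An all-ones GPA (GPA_INVALID) tears the mapping down */
			.u.gpa = gpa,
		};

		return ioctl(vcpu_fd, KVM_XEN_VCPU_SET_ATTR, &va);
	}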
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 127cef57f50b..02da08030eee 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -600,9 +600,8 @@ struct kvm_vcpu_hv {
 struct kvm_vcpu_xen {
 	u64 hypercall_rip;
 	u32 current_runstate;
-	bool vcpu_info_set;
 	bool vcpu_time_info_set;
-	struct gfn_to_hva_cache vcpu_info_cache;
+	struct gfn_to_pfn_cache vcpu_info_cache;
 	struct gfn_to_hva_cache vcpu_time_info_cache;
 	struct gfn_to_pfn_cache runstate_cache;
 	u64 last_steal;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 43c1a431537a..16549714b04f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3136,9 +3136,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	if (vcpu->pv_time.active)
 		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
-	if (vcpu->xen.vcpu_info_set)
-		kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
-				       offsetof(struct compat_vcpu_info, time));
+	if (vcpu->xen.vcpu_info_cache.active)
+		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
+					offsetof(struct compat_vcpu_info, time));
 	if (vcpu->xen.vcpu_time_info_set)
 		kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
 	if (!v->vcpu_idx)
@@ -10338,6 +10338,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 			break;
 
 		kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
+		if (kvm_xen_has_pending_events(vcpu))
+			kvm_xen_inject_pending_events(vcpu);
+
 		if (kvm_cpu_has_pending_timer(vcpu))
 			kvm_inject_pending_timer_irqs(vcpu);
 
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index de1b3be7239e..56e8a6980b65 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -8,6 +8,7 @@
 
 #include "x86.h"
 #include "xen.h"
+#include "lapic.h"
 #include "hyperv.h"
 
 #include <linux/kvm_host.h>
@@ -246,23 +247,79 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
 	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
 }
 
-int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+/*
+ * On event channel delivery, the vcpu_info may not have been accessible.
+ * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
+ * need to be marked into the vcpu_info (and evtchn_upcall_pending set).
+ * Do so now that we can sleep in the context of the vCPU to bring the
+ * page in, and refresh the pfn cache for it.
+ */
+void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 {
 	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
-	bool atomic = in_atomic() || !task_is_running(current);
-	int err;
+	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+	unsigned long flags;
+
+	if (!evtchn_pending_sel)
+		return;
+
+	/*
+	 * Yes, this is an open-coded loop. But that's just what put_user()
+	 * does anyway. Page it in and retry the instruction. We're just a
+	 * little more honest about it.
+	 */
+	read_lock_irqsave(&gpc->lock, flags);
+	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+					   sizeof(struct vcpu_info))) {
+		read_unlock_irqrestore(&gpc->lock, flags);
+
+		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+						 sizeof(struct vcpu_info)))
+			return;
+
+		read_lock_irqsave(&gpc->lock, flags);
+	}
+
+	/* Now gpc->khva is a valid kernel address for the vcpu_info */
+	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+		struct vcpu_info *vi = gpc->khva;
+
+		asm volatile(LOCK_PREFIX "orq %0, %1\n"
+			     "notq %0\n"
+			     LOCK_PREFIX "andq %0, %2\n"
+			     : "=r" (evtchn_pending_sel),
+			       "+m" (vi->evtchn_pending_sel),
+			       "+m" (v->arch.xen.evtchn_pending_sel)
+			     : "0" (evtchn_pending_sel));
+		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+	} else {
+		u32 evtchn_pending_sel32 = evtchn_pending_sel;
+		struct compat_vcpu_info *vi = gpc->khva;
+
+		asm volatile(LOCK_PREFIX "orl %0, %1\n"
+			     "notl %0\n"
+			     LOCK_PREFIX "andl %0, %2\n"
+			     : "=r" (evtchn_pending_sel32),
+			       "+m" (vi->evtchn_pending_sel),
+			       "+m" (v->arch.xen.evtchn_pending_sel)
+			     : "0" (evtchn_pending_sel32));
+		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+	}
+	read_unlock_irqrestore(&gpc->lock, flags);
+
+	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+}
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+	unsigned long flags;
 	u8 rc = 0;
 
 	/*
 	 * If the global upcall vector (HVMIRQ_callback_vector) is set and
 	 * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
 	 */
-	struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
-	struct kvm_memslots *slots = kvm_memslots(v->kvm);
-	bool ghc_valid = slots->generation == ghc->generation &&
-		!kvm_is_error_hva(ghc->hva) && ghc->memslot;
-
-	unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
 
 	/* No need for compat handling here */
 	BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
@@ -272,101 +329,35 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	BUILD_BUG_ON(sizeof(rc) !=
 		     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
 
-	/*
-	 * For efficiency, this mirrors the checks for using the valid
-	 * cache in kvm_read_guest_offset_cached(), but just uses
-	 * __get_user() instead. And falls back to the slow path.
-	 */
-	if (!evtchn_pending_sel && ghc_valid) {
-		/* Fast path */
-		pagefault_disable();
-		err = __get_user(rc, (u8 __user *)ghc->hva + offset);
-		pagefault_enable();
-		if (!err)
-			return rc;
-	}
-
-	/* Slow path */
+	read_lock_irqsave(&gpc->lock, flags);
+	while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+					   sizeof(struct vcpu_info))) {
+		read_unlock_irqrestore(&gpc->lock, flags);
 
-	/*
-	 * This function gets called from kvm_vcpu_block() after setting the
-	 * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
-	 * from a HLT. So we really mustn't sleep. If the page ended up absent
-	 * at that point, just return 1 in order to trigger an immediate wake,
-	 * and we'll end up getting called again from a context where we *can*
-	 * fault in the page and wait for it.
-	 */
-	if (atomic)
-		return 1;
+		/*
+		 * This function gets called from kvm_vcpu_block() after setting the
+		 * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+		 * from a HLT. So we really mustn't sleep. If the page ended up absent
+		 * at that point, just return 1 in order to trigger an immediate wake,
+		 * and we'll end up getting called again from a context where we *can*
+		 * fault in the page and wait for it.
+		 */
+		if (in_atomic() || !task_is_running(current))
+			return 1;
 
-	if (!ghc_valid) {
-		err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
-		if (err || !ghc->memslot) {
+		if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+						 sizeof(struct vcpu_info))) {
 			/*
 			 * If this failed, userspace has screwed up the
 			 * vcpu_info mapping. No interrupts for you.
 			 */
 			return 0;
 		}
+		read_lock_irqsave(&gpc->lock, flags);
 	}
 
-	/*
-	 * Now we have a valid (protected by srcu) userspace HVA in
-	 * ghc->hva which points to the struct vcpu_info. If there
-	 * are any bits in the in-kernel evtchn_pending_sel then
-	 * we need to write those to the guest vcpu_info and set
-	 * its evtchn_upcall_pending flag. If there aren't any bits
-	 * to add, we only want to *check* evtchn_upcall_pending.
-	 */
-	if (evtchn_pending_sel) {
-		bool long_mode = v->kvm->arch.xen.long_mode;
-
-		if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
-			return 0;
-
-		if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
-			struct vcpu_info __user *vi = (void __user *)ghc->hva;
-
-			/* Attempt to set the evtchn_pending_sel bits in the
-			 * guest, and if that succeeds then clear the same
-			 * bits in the in-kernel version. */
-			asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
-				     "\tnotq %0\n"
-				     "\t" LOCK_PREFIX "andq %0, %2\n"
-				     "2:\n"
-				     _ASM_EXTABLE_UA(1b, 2b)
-				     : "=r" (evtchn_pending_sel),
-				       "+m" (vi->evtchn_pending_sel),
-				       "+m" (v->arch.xen.evtchn_pending_sel)
-				     : "0" (evtchn_pending_sel));
-		} else {
-			struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
-			u32 evtchn_pending_sel32 = evtchn_pending_sel;
-
-			/* Attempt to set the evtchn_pending_sel bits in the
-			 * guest, and if that succeeds then clear the same
-			 * bits in the in-kernel version. */
-			asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
-				     "\tnotl %0\n"
-				     "\t" LOCK_PREFIX "andl %0, %2\n"
-				     "2:\n"
-				     _ASM_EXTABLE_UA(1b, 2b)
-				     : "=r" (evtchn_pending_sel32),
-				       "+m" (vi->evtchn_pending_sel),
-				       "+m" (v->arch.xen.evtchn_pending_sel)
-				     : "0" (evtchn_pending_sel32));
-		}
-		rc = 1;
-		unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
-
-	err:
-		user_access_end();
-
-		mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
-	} else {
-		__get_user(rc, (u8 __user *)ghc->hva + offset);
-	}
-
+	rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
+	read_unlock_irqrestore(&gpc->lock, flags);
 	return rc;
 }
 
@@ -456,25 +447,18 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 			     offsetof(struct compat_vcpu_info, time));
 
 		if (data->u.gpa == GPA_INVALID) {
-			vcpu->arch.xen.vcpu_info_set = false;
+			kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
 			r = 0;
 			break;
 		}
 
-		/* It must fit within a single page */
-		if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
-			r = -EINVAL;
-			break;
-		}
-
-		r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+		r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
 					      &vcpu->arch.xen.vcpu_info_cache,
-					      data->u.gpa,
+					      NULL, KVM_HOST_USES_PFN, data->u.gpa,
 					      sizeof(struct vcpu_info));
-		if (!r) {
-			vcpu->arch.xen.vcpu_info_set = true;
+		if (!r)
 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
-		}
+
 		break;
 
 	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
@@ -630,7 +614,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 
 	switch (data->type) {
 	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
-		if (vcpu->arch.xen.vcpu_info_set)
+		if (vcpu->arch.xen.vcpu_info_cache.active)
 			data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
 		else
 			data->u.gpa = GPA_INVALID;
@@ -903,16 +887,17 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
 	if (!vcpu)
 		return -1;
 
-	if (!vcpu->arch.xen.vcpu_info_set)
+	if (!vcpu->arch.xen.vcpu_info_cache.active)
 		return -1;
 
 	if (e->xen_evtchn.port >= max_evtchn_port(kvm))
 		return -1;
 
 	rc = -EWOULDBLOCK;
-	read_lock_irqsave(&gpc->lock, flags);
 
 	idx = srcu_read_lock(&kvm->srcu);
+
+	read_lock_irqsave(&gpc->lock, flags);
 	if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
 		goto out_rcu;
 
@@ -940,17 +925,44 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
 	} else if (test_bit(e->xen_evtchn.port, mask_bits)) {
 		rc = -1; /* Masked */
 	} else {
-		rc = 1; /* Delivered. But was the vCPU waking already? */
-		if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
-			kick_vcpu = true;
+		rc = 1; /* Delivered to the bitmap in shared_info. */
+		/* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
+		read_unlock_irqrestore(&gpc->lock, flags);
+		gpc = &vcpu->arch.xen.vcpu_info_cache;
+
+		read_lock_irqsave(&gpc->lock, flags);
+		if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
+			/*
+			 * Could not access the vcpu_info. Set the bit in-kernel
+			 * and prod the vCPU to deliver it for itself.
+			 */
+			if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+				kick_vcpu = true;
+			goto out_rcu;
+		}
+
+		if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+			struct vcpu_info *vcpu_info = gpc->khva;
+			if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
+				WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+				kick_vcpu = true;
+			}
+		} else {
+			struct compat_vcpu_info *vcpu_info = gpc->khva;
+			if (!test_and_set_bit(port_word_bit,
+					      (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
+				WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+				kick_vcpu = true;
+			}
+		}
 	}
 
  out_rcu:
-	srcu_read_unlock(&kvm->srcu, idx);
 	read_unlock_irqrestore(&gpc->lock, flags);
+	srcu_read_unlock(&kvm->srcu, idx);
 
 	if (kick_vcpu) {
-		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
 		kvm_vcpu_kick(vcpu);
 	}
 
@@ -1052,4 +1064,6 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
 {
 	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
 				     &vcpu->arch.xen.runstate_cache);
+	kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+				     &vcpu->arch.xen.vcpu_info_cache);
 }
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index 54b2bf4c3001..7dd0590f93e1 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -15,6 +15,7 @@
 extern struct static_key_false_deferred kvm_xen_enabled;
 
 int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
 int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
 int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
 int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
@@ -46,11 +47,19 @@ static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
 static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	if (static_branch_unlikely(&kvm_xen_enabled.key) &&
-	    vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector)
+	    vcpu->arch.xen.vcpu_info_cache.active &&
+	    vcpu->kvm->arch.xen.upcall_vector)
 		return __kvm_xen_has_interrupt(vcpu);
 
 	return 0;
 }
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+	return static_branch_unlikely(&kvm_xen_enabled.key) &&
+		vcpu->arch.xen.evtchn_pending_sel;
+}
+
 #else
 static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
 {
@@ -83,6 +92,15 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	return 0;
 }
+
+static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+	return false;
+}
 #endif
 
 int kvm_xen_hypercall(struct kvm_vcpu *vcpu);
-- 
2.33.1



