On Tue, 2024-02-27 at 09:43 -0500, Steven Rostedt wrote:
> 
> > --- a/arch/x86/kvm/xen.c
> > +++ b/arch/x86/kvm/xen.c
> > @@ -1736,9 +1736,23 @@ static int set_shinfo_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
> >  	unsigned long flags;
> >  	int rc = -EWOULDBLOCK;
> > 
> > -	read_lock_irqsave(&gpc->lock, flags);
> > +	local_irq_save(flags);
> > +	if (!read_trylock(&gpc->lock)) {
> 
> Note, directly disabling interrupts in PREEMPT_RT is just as bad as doing
> so in non RT and then taking a mutex.
> 
> Worse yet, in PREEMPT_RT read_unlock_irqrestore() isn't going to re-enable
> interrupts.
> 
> Not really sure what the best solution is here.

I think the best solution is to realise that now Paul made the
in-interrupt code paths use read_trylock(), we don't even need to use
read_lock_irqsave() anywhere anyway, and then we don't have to *care*
that PREEMPT_RT does it differently.

Going to do some testing with this version... (an illustrative sketch of
the resulting locking pattern follows after the patch).

From: David Woodhouse <dwmw@xxxxxxxxxxxx>
Subject: [PATCH] KVM: x86/xen: avoid blocking in hardirq context in kvm_xen_set_evtchn_fast()

As described in [1], compiling with CONFIG_PROVE_RAW_LOCK_NESTING shows
that kvm_xen_set_evtchn_fast() is blocking on pfncache locks in IRQ
context. The blocking only actually occurs with PREEMPT_RT, because
there the locks are turned into mutexes. There is no 'raw' version of
rwlock_t that can be used to avoid that, so use read_trylock() and
treat failure to lock the same as an invalid cache.

However, with PREEMPT_RT read_lock_irqsave() doesn't actually disable
IRQs either, so open-coding a trylock_irqsave() using local_irq_save()
would be wrong in the PREEMPT_RT case.

In fact, if kvm_xen_set_evtchn_fast() is now going to use a trylock
anyway when invoked in hardirq context, there's actually no *need* for
interrupts to be disabled by any other code path that takes the lock.
So just switch all other users to a plain read_lock(), and then the
different semantics on PREEMPT_RT are not an issue.

Add a warning in __kvm_xen_has_interrupt() just to be sure that it is
never invoked from interrupt context.
[1] https://lore.kernel.org/lkml/99771ef3a4966a01fefd3adbb2ba9c3a75f97cf2.camel@xxxxxxxxxxxxx/T/#mbd06e5a04534ce9c0ee94bd8f1e8d942b2d45bd6

Fixes: 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen timer delivery")
Fixes: bbe17c625d68 ("KVM: x86/xen: Fix potential deadlock in kvm_xen_update_runstate_guest()")
Co-developed-by: Paul Durrant <pdurrant@xxxxxxxxxx>
Signed-off-by: Paul Durrant <pdurrant@xxxxxxxxxx>
Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
 arch/x86/kvm/xen.c | 80 ++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 39 deletions(-)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index f8113eb8d040..ab3bbe75602d 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -45,7 +45,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
 	int ret = 0;
 	int idx = srcu_read_lock(&kvm->srcu);
 
-	read_lock_irq(&gpc->lock);
+	read_lock(&gpc->lock);
 	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
 		read_unlock_irq(&gpc->lock);
 
@@ -53,7 +53,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
 		if (ret)
 			goto out;
 
-		read_lock_irq(&gpc->lock);
+		read_lock(&gpc->lock);
 	}
 
 	/*
@@ -96,7 +96,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
 	smp_wmb();
 
 	wc->version = wc_version + 1;
-	read_unlock_irq(&gpc->lock);
+	read_unlock(&gpc->lock);
 
 	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
 
@@ -277,7 +277,6 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
 	size_t user_len, user_len1, user_len2;
 	struct vcpu_runstate_info rs;
-	unsigned long flags;
 	size_t times_ofs;
 	uint8_t *update_bit = NULL;
 	uint64_t entry_time;
@@ -373,16 +372,13 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 	 * gfn_to_pfn caches that cover the region.
 	 */
 	if (atomic) {
-		local_irq_save(flags);
-		if (!read_trylock(&gpc1->lock)) {
-			local_irq_restore(flags);
+		if (!read_trylock(&gpc1->lock))
 			return;
-		}
 	} else {
-		read_lock_irqsave(&gpc1->lock, flags);
+		read_lock(&gpc1->lock);
 	}
 	while (!kvm_gpc_check(gpc1, user_len1)) {
-		read_unlock_irqrestore(&gpc1->lock, flags);
+		read_unlock(&gpc1->lock);
 
 		/* When invoked from kvm_sched_out() we cannot sleep */
 		if (atomic)
@@ -391,7 +387,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 		if (kvm_gpc_refresh(gpc1, user_len1))
 			return;
 
-		read_lock_irqsave(&gpc1->lock, flags);
+		read_lock(&gpc1->lock);
 	}
 
 	if (likely(!user_len2)) {
@@ -419,7 +415,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 		lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
 		if (atomic) {
 			if (!read_trylock(&gpc2->lock)) {
-				read_unlock_irqrestore(&gpc1->lock, flags);
+				read_unlock(&gpc1->lock);
 				return;
 			}
 		} else {
@@ -428,7 +424,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 
 		if (!kvm_gpc_check(gpc2, user_len2)) {
 			read_unlock(&gpc2->lock);
-			read_unlock_irqrestore(&gpc1->lock, flags);
+			read_unlock(&gpc1->lock);
 
 			/* When invoked from kvm_sched_out() we cannot sleep */
 			if (atomic)
@@ -533,7 +529,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 	}
 
 	kvm_gpc_mark_dirty_in_slot(gpc1);
-	read_unlock_irqrestore(&gpc1->lock, flags);
+	read_unlock(&gpc1->lock);
 }
 
 void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -592,7 +588,6 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 {
 	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
 	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
-	unsigned long flags;
 
 	if (!evtchn_pending_sel)
 		return;
@@ -602,14 +597,14 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 	 * does anyway. Page it in and retry the instruction. We're just a
 	 * little more honest about it.
 	 */
-	read_lock_irqsave(&gpc->lock, flags);
+	read_lock(&gpc->lock);
 	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+		read_unlock(&gpc->lock);
 
 		if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
 			return;
 
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock(&gpc->lock);
 	}
 
 	/* Now gpc->khva is a valid kernel address for the vcpu_info */
@@ -639,7 +634,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 	}
 
 	kvm_gpc_mark_dirty_in_slot(gpc);
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock(&gpc->lock);
 
 	/* For the per-vCPU lapic vector, deliver it as MSI. */
 	if (v->arch.xen.upcall_vector)
@@ -649,7 +644,6 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
 	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
-	unsigned long flags;
 	u8 rc = 0;
 
 	/*
@@ -665,9 +659,10 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	BUILD_BUG_ON(sizeof(rc) !=
 		     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
 
-	read_lock_irqsave(&gpc->lock, flags);
+	WARN_ON_ONCE(in_interrupt());
+	read_lock(&gpc->lock);
 	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+		read_unlock(&gpc->lock);
 
 		/*
 		 * This function gets called from kvm_vcpu_block() after setting the
@@ -687,11 +682,11 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 			 */
 			return 0;
 		}
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock(&gpc->lock);
 	}
 
 	rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock(&gpc->lock);
 
 	return rc;
 }
@@ -1382,12 +1377,11 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
 	struct kvm *kvm = vcpu->kvm;
 	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
 	unsigned long *pending_bits;
-	unsigned long flags;
 	bool ret = true;
 	int idx, i;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	read_lock_irqsave(&gpc->lock, flags);
+	read_lock(&gpc->lock);
 
 	if (!kvm_gpc_check(gpc, PAGE_SIZE))
 		goto out_rcu;
@@ -1408,7 +1402,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
 	}
 
  out_rcu:
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock(&gpc->lock);
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return ret;
@@ -1730,12 +1724,17 @@ static int set_shinfo_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 	struct kvm *kvm = vcpu->kvm;
 	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
 	unsigned long *pending_bits, *mask_bits;
-	unsigned long flags;
 	int rc = -EWOULDBLOCK;
 
-	read_lock_irqsave(&gpc->lock, flags);
+	if (in_interrupt()) {
+		if (!read_trylock(&gpc->lock))
+			goto out;
+	} else {
+		read_lock(&gpc->lock);
+	}
+
 	if (!kvm_gpc_check(gpc, PAGE_SIZE))
-		goto out;
+		goto out_unlock;
 
 	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
 		struct shared_info *shinfo = gpc->khva;
@@ -1758,8 +1757,9 @@ static int set_shinfo_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 		rc = 1; /* It is newly raised */
 	}
 
+ out_unlock:
+	read_unlock(&gpc->lock);
  out:
-	read_unlock_irqrestore(&gpc->lock, flags);
 	return rc;
 }
 
@@ -1767,23 +1767,23 @@ static bool set_vcpu_info_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct gfn_to_pfn_cache *gpc = &vcpu->arch.xen.vcpu_info_cache;
-	unsigned long flags;
 	bool kick_vcpu = false;
+	bool locked;
 
-	read_lock_irqsave(&gpc->lock, flags);
+	locked = read_trylock(&gpc->lock);
 
 	/*
 	 * Try to deliver the event directly to the vcpu_info. If successful and
 	 * the guest is using upcall_vector delivery, send the MSI.
-	 * If the pfncache is invalid, set the shadow. In this case, or if the
-	 * guest is using another form of event delivery, the vCPU must be
-	 * kicked to complete the delivery.
+	 * If the pfncache lock is contended or the cache is invalid, set the
+	 * shadow. In this case, or if the guest is using another form of event
+	 * delivery, the vCPU must be kicked to complete the delivery.
 	 */
 	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
 		struct vcpu_info *vcpu_info = gpc->khva;
 		int port_word_bit = port / 64;
 
-		if (!kvm_gpc_check(gpc, sizeof(*vcpu_info))) {
+		if ((!locked || !kvm_gpc_check(gpc, sizeof(*vcpu_info)))) {
 			if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
 				kick_vcpu = true;
 			goto out;
@@ -1797,7 +1797,7 @@ static bool set_vcpu_info_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 		struct compat_vcpu_info *vcpu_info = gpc->khva;
 		int port_word_bit = port / 32;
 
-		if (!kvm_gpc_check(gpc, sizeof(*vcpu_info))) {
+		if ((!locked || !kvm_gpc_check(gpc, sizeof(*vcpu_info)))) {
 			if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
 				kick_vcpu = true;
 			goto out;
@@ -1816,7 +1816,9 @@ static bool set_vcpu_info_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 	}
 
  out:
-	read_unlock_irqrestore(&gpc->lock, flags);
+	if (locked)
+		read_unlock(&gpc->lock);
+
 	return kick_vcpu;
 }
 
-- 
2.34.1
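P.S. For illustration only, not part of the patch: a minimal sketch of
the locking pattern the patch adopts. Paths that may run in hardirq
context use read_trylock() and treat contention like an invalid cache,
so no path needs to disable interrupts around the rwlock and the
different rwlock semantics on PREEMPT_RT stop mattering. The names
'struct event_cache', cache_is_valid() and deliver_event() below are
hypothetical stand-ins for the real gfn_to_pfn_cache, kvm_gpc_check()
and set_shinfo_evtchn_pending().

#include <linux/errno.h>	/* EWOULDBLOCK */
#include <linux/preempt.h>	/* in_interrupt() */
#include <linux/spinlock.h>	/* rwlock_t, read_lock(), read_trylock() */

/* Hypothetical stand-in for struct gfn_to_pfn_cache. */
struct event_cache {
	rwlock_t lock;
	void *khva;		/* NULL while the mapping is invalid */
};

/* Hypothetical stand-in for kvm_gpc_check(). */
static bool cache_is_valid(struct event_cache *gpc)
{
	return gpc->khva != NULL;
}

/* Hypothetical stand-in for set_shinfo_evtchn_pending(). */
static int deliver_event(struct event_cache *gpc)
{
	int rc = -EWOULDBLOCK;

	if (in_interrupt()) {
		/*
		 * Hardirq context: never block on the rwlock (which is a
		 * sleeping lock on PREEMPT_RT); report -EWOULDBLOCK so the
		 * caller can fall back to a non-atomic path.
		 */
		if (!read_trylock(&gpc->lock))
			goto out;
	} else {
		/*
		 * Task context: a plain read_lock() suffices. No irqsave is
		 * needed precisely because the in-IRQ path above only ever
		 * uses a trylock.
		 */
		read_lock(&gpc->lock);
	}

	if (!cache_is_valid(gpc))
		goto out_unlock;

	/* ... deliver the event via gpc->khva here ... */
	rc = 0;

 out_unlock:
	read_unlock(&gpc->lock);
 out:
	return rc;
}

The design point is the one the commit message makes: once the only
hardirq-context path backs off on contention instead of blocking, there
is no longer any need for the task-context paths to disable interrupts
around the lock, which is what lets every other user drop the
irqsave/irqrestore variants.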