On Tue, 2024-02-27 at 09:43 -0500, Steven Rostedt wrote:
> 
> > --- a/arch/x86/kvm/xen.c
> > +++ b/arch/x86/kvm/xen.c
> > @@ -1736,9 +1736,23 @@ static int set_shinfo_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
> >  	unsigned long flags;
> >  	int rc = -EWOULDBLOCK;
> > 
> > -	read_lock_irqsave(&gpc->lock, flags);
> > +	local_irq_save(flags);
> > +	if (!read_trylock(&gpc->lock)) {
> 
> Note, directly disabling interrupts in PREEMPT_RT is just as bad as doing
> so in non RT and then taking a mutex.
> 
> Worse yet, in PREEMPT_RT read_unlock_irqrestore() isn't going to re-enable
> interrupts.
> 
> Not really sure what the best solution is here.

I think the best solution is to realise that now Paul made the
in-interrupt code paths use read_trylock(), we don't even need to use
read_lock_irqsave() anywhere anyway, and then we don't have to *care*
that PREEMPT_RT does it differently.

Going to do some testing with this version... (an illustrative sketch of
the resulting locking pattern follows after the patch).

From: David Woodhouse <dwmw@xxxxxxxxxxxx>
Subject: [PATCH] KVM: x86/xen: avoid blocking in hardirq context in kvm_xen_set_evtchn_fast()

As described in [1], compiling with CONFIG_PROVE_RAW_LOCK_NESTING shows
that kvm_xen_set_evtchn_fast() is blocking on pfncache locks in IRQ
context. The blocking only actually occurs with PREEMPT_RT, because
there the locks are turned into mutexes. There is no 'raw' version of
rwlock_t that can be used to avoid that, so use read_trylock() and
treat failure to lock the same as an invalid cache.

However, with PREEMPT_RT read_lock_irqsave() doesn't actually disable
IRQs either, so open-coding a trylock_irqsave() using local_irq_save()
would be wrong in the PREEMPT_RT case.

In fact, if kvm_xen_set_evtchn_fast() is now going to use a trylock
anyway when invoked in hardirq context, there's actually no *need* for
interrupts to be disabled by any other code path that takes the lock.
So just switch all other users to a plain read_lock(), and then the
different semantics on PREEMPT_RT are not an issue.

Add a warning in __kvm_xen_has_interrupt() just to be sure that it is
never invoked from interrupt context.
[1] https://lore.kernel.org/lkml/99771ef3a4966a01fefd3adbb2ba9c3a75f97cf2.camel@xxxxxxxxxxxxx/T/#mbd06e5a04534ce9c0ee94bd8f1e8d942b2d45bd6

Fixes: 77c9b9dea4fb ("KVM: x86/xen: Use fast path for Xen timer delivery")
Fixes: bbe17c625d68 ("KVM: x86/xen: Fix potential deadlock in kvm_xen_update_runstate_guest()")
Co-developed-by: Paul Durrant <pdurrant@xxxxxxxxxx>
Signed-off-by: Paul Durrant <pdurrant@xxxxxxxxxx>
Signed-off-by: David Woodhouse <dwmw@xxxxxxxxxxxx>
---
 arch/x86/kvm/xen.c | 80 ++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 39 deletions(-)

diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index f8113eb8d040..ab3bbe75602d 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -45,7 +45,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
 	int ret = 0;
 	int idx = srcu_read_lock(&kvm->srcu);
 
-	read_lock_irq(&gpc->lock);
+	read_lock(&gpc->lock);
 	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
 		read_unlock_irq(&gpc->lock);
 
@@ -53,7 +53,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
 		if (ret)
 			goto out;
 
-		read_lock_irq(&gpc->lock);
+		read_lock(&gpc->lock);
 	}
 
 	/*
@@ -96,7 +96,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm)
 	smp_wmb();
 
 	wc->version = wc_version + 1;
-	read_unlock_irq(&gpc->lock);
+	read_unlock(&gpc->lock);
 
 	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
 
@@ -277,7 +277,6 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
 	size_t user_len, user_len1, user_len2;
 	struct vcpu_runstate_info rs;
-	unsigned long flags;
 	size_t times_ofs;
 	uint8_t *update_bit = NULL;
 	uint64_t entry_time;
@@ -373,16 +372,13 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 	 * gfn_to_pfn caches that cover the region.
 	 */
 	if (atomic) {
-		local_irq_save(flags);
-		if (!read_trylock(&gpc1->lock)) {
-			local_irq_restore(flags);
+		if (!read_trylock(&gpc1->lock))
 			return;
-		}
 	} else {
-		read_lock_irqsave(&gpc1->lock, flags);
+		read_lock(&gpc1->lock);
 	}
 	while (!kvm_gpc_check(gpc1, user_len1)) {
-		read_unlock_irqrestore(&gpc1->lock, flags);
+		read_unlock(&gpc1->lock);
 
 		/* When invoked from kvm_sched_out() we cannot sleep */
 		if (atomic)
@@ -391,7 +387,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 		if (kvm_gpc_refresh(gpc1, user_len1))
 			return;
 
-		read_lock_irqsave(&gpc1->lock, flags);
+		read_lock(&gpc1->lock);
 	}
 
 	if (likely(!user_len2)) {
@@ -419,7 +415,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 		lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
 		if (atomic) {
 			if (!read_trylock(&gpc2->lock)) {
-				read_unlock_irqrestore(&gpc1->lock, flags);
+				read_unlock(&gpc1->lock);
 				return;
 			}
 		} else {
@@ -428,7 +424,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 
 		if (!kvm_gpc_check(gpc2, user_len2)) {
 			read_unlock(&gpc2->lock);
-			read_unlock_irqrestore(&gpc1->lock, flags);
+			read_unlock(&gpc1->lock);
 
 			/* When invoked from kvm_sched_out() we cannot sleep */
 			if (atomic)
@@ -533,7 +529,7 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
 	}
 
 	kvm_gpc_mark_dirty_in_slot(gpc1);
-	read_unlock_irqrestore(&gpc1->lock, flags);
+	read_unlock(&gpc1->lock);
 }
 
 void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
@@ -592,7 +588,6 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 {
 	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
 	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
-	unsigned long flags;
 
 	if (!evtchn_pending_sel)
 		return;
@@ -602,14 +597,14 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 	 * does anyway. Page it in and retry the instruction. We're just a
 	 * little more honest about it.
 	 */
-	read_lock_irqsave(&gpc->lock, flags);
+	read_lock(&gpc->lock);
 	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+		read_unlock(&gpc->lock);
 
 		if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
 			return;
 
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock(&gpc->lock);
 	}
 
 	/* Now gpc->khva is a valid kernel address for the vcpu_info */
@@ -639,7 +634,7 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 	}
 
 	kvm_gpc_mark_dirty_in_slot(gpc);
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock(&gpc->lock);
 
 	/* For the per-vCPU lapic vector, deliver it as MSI. */
 	if (v->arch.xen.upcall_vector)
@@ -649,7 +644,6 @@ void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
 int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 {
 	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
-	unsigned long flags;
 	u8 rc = 0;
 
 	/*
@@ -665,9 +659,10 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 	BUILD_BUG_ON(sizeof(rc) !=
 		     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
 
-	read_lock_irqsave(&gpc->lock, flags);
+	WARN_ON_ONCE(in_interrupt());
+	read_lock(&gpc->lock);
 	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
-		read_unlock_irqrestore(&gpc->lock, flags);
+		read_unlock(&gpc->lock);
 
 		/*
 		 * This function gets called from kvm_vcpu_block() after setting the
@@ -687,11 +682,11 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
 			 */
 			return 0;
 		}
-		read_lock_irqsave(&gpc->lock, flags);
+		read_lock(&gpc->lock);
 	}
 
 	rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock(&gpc->lock);
 
 	return rc;
 }
@@ -1382,12 +1377,11 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
 	struct kvm *kvm = vcpu->kvm;
 	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
 	unsigned long *pending_bits;
-	unsigned long flags;
 	bool ret = true;
 	int idx, i;
 
 	idx = srcu_read_lock(&kvm->srcu);
-	read_lock_irqsave(&gpc->lock, flags);
+	read_lock(&gpc->lock);
 
 	if (!kvm_gpc_check(gpc, PAGE_SIZE))
 		goto out_rcu;
@@ -1408,7 +1402,7 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
 	}
 
  out_rcu:
-	read_unlock_irqrestore(&gpc->lock, flags);
+	read_unlock(&gpc->lock);
 	srcu_read_unlock(&kvm->srcu, idx);
 
 	return ret;
@@ -1730,12 +1724,17 @@ static int set_shinfo_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 	struct kvm *kvm = vcpu->kvm;
 	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
 	unsigned long *pending_bits, *mask_bits;
-	unsigned long flags;
 	int rc = -EWOULDBLOCK;
 
-	read_lock_irqsave(&gpc->lock, flags);
+	if (in_interrupt()) {
+		if (!read_trylock(&gpc->lock))
+			goto out;
+	} else {
+		read_lock(&gpc->lock);
+	}
+
 	if (!kvm_gpc_check(gpc, PAGE_SIZE))
-		goto out;
+		goto out_unlock;
 
 	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
 		struct shared_info *shinfo = gpc->khva;
@@ -1758,8 +1757,9 @@ static int set_shinfo_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 		rc = 1; /* It is newly raised */
 	}
 
+ out_unlock:
+	read_unlock(&gpc->lock);
  out:
-	read_unlock_irqrestore(&gpc->lock, flags);
 	return rc;
 }
 
@@ -1767,23 +1767,23 @@ static bool set_vcpu_info_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct gfn_to_pfn_cache *gpc = &vcpu->arch.xen.vcpu_info_cache;
-	unsigned long flags;
 	bool kick_vcpu = false;
+	bool locked;
 
-	read_lock_irqsave(&gpc->lock, flags);
+	locked = read_trylock(&gpc->lock);
 
 	/*
 	 * Try to deliver the event directly to the vcpu_info. If successful and
 	 * the guest is using upcall_vector delivery, send the MSI.
-	 * If the pfncache is invalid, set the shadow. In this case, or if the
-	 * guest is using another form of event delivery, the vCPU must be
-	 * kicked to complete the delivery.
+	 * If the pfncache lock is contended or the cache is invalid, set the
+	 * shadow. In this case, or if the guest is using another form of event
+	 * delivery, the vCPU must be kicked to complete the delivery.
 	 */
 	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
 		struct vcpu_info *vcpu_info = gpc->khva;
 		int port_word_bit = port / 64;
 
-		if (!kvm_gpc_check(gpc, sizeof(*vcpu_info))) {
+		if ((!locked || !kvm_gpc_check(gpc, sizeof(*vcpu_info)))) {
 			if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
 				kick_vcpu = true;
 			goto out;
@@ -1797,7 +1797,7 @@ static bool set_vcpu_info_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 		struct compat_vcpu_info *vcpu_info = gpc->khva;
 		int port_word_bit = port / 32;
 
-		if (!kvm_gpc_check(gpc, sizeof(*vcpu_info))) {
+		if ((!locked || !kvm_gpc_check(gpc, sizeof(*vcpu_info)))) {
 			if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
 				kick_vcpu = true;
 			goto out;
@@ -1816,7 +1816,9 @@ static bool set_vcpu_info_evtchn_pending(struct kvm_vcpu *vcpu, u32 port)
 	}
 
  out:
-	read_unlock_irqrestore(&gpc->lock, flags);
+	if (locked)
+		read_unlock(&gpc->lock);
+
 	return kick_vcpu;
 }
 
-- 
2.34.1
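P.S. For illustration only, not part of the patch: a minimal sketch of
the locking pattern the patch adopts. Paths that may run in hardirq
context use read_trylock() and treat contention like an invalid cache,
so no path needs to disable interrupts around the rwlock and the
different rwlock semantics on PREEMPT_RT stop mattering. The names
'struct event_cache', cache_is_valid() and deliver_event() below are
hypothetical stand-ins for the real gfn_to_pfn_cache, kvm_gpc_check()
and set_shinfo_evtchn_pending().

#include <linux/errno.h>	/* EWOULDBLOCK */
#include <linux/preempt.h>	/* in_interrupt() */
#include <linux/spinlock.h>	/* rwlock_t, read_lock(), read_trylock() */

/* Hypothetical stand-in for struct gfn_to_pfn_cache. */
struct event_cache {
	rwlock_t lock;
	void *khva;		/* NULL while the mapping is invalid */
};

/* Hypothetical stand-in for kvm_gpc_check(). */
static bool cache_is_valid(struct event_cache *gpc)
{
	return gpc->khva != NULL;
}

/* Hypothetical stand-in for set_shinfo_evtchn_pending(). */
static int deliver_event(struct event_cache *gpc)
{
	int rc = -EWOULDBLOCK;

	if (in_interrupt()) {
		/*
		 * Hardirq context: never block on the rwlock (which is a
		 * sleeping lock on PREEMPT_RT); report -EWOULDBLOCK so the
		 * caller can fall back to a non-atomic path.
		 */
		if (!read_trylock(&gpc->lock))
			goto out;
	} else {
		/*
		 * Task context: a plain read_lock() suffices. No irqsave is
		 * needed precisely because the in-IRQ path above only ever
		 * uses a trylock.
		 */
		read_lock(&gpc->lock);
	}

	if (!cache_is_valid(gpc))
		goto out_unlock;

	/* ... deliver the event via gpc->khva here ... */
	rc = 0;

 out_unlock:
	read_unlock(&gpc->lock);
 out:
	return rc;
}

The design point is the one the commit message makes: once the only
hardirq-context path backs off on contention instead of blocking, there
is no longer any need for the task-context paths to disable interrupts
around the lock, which is what lets every other user drop the
irqsave/irqrestore variants.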