On Wed, May 31, 2017 at 02:03:10PM +0200, Paolo Bonzini wrote: > Linu Cherian reported a WARN in cleanup_srcu_struct when shutting > down a guest that has iperf running on a VFIO assigned device. > > This happens because irqfd_wakeup calls srcu_read_lock(&kvm->irq_srcu) > in interrupt context, while a worker thread does the same inside > kvm_set_irq. If the interrupt happens while the worker thread is > executing __srcu_read_lock, lock_count can fall behind. > > The docs say you are not supposed to call srcu_read_lock() and > srcu_read_unlock() from irq context, but KVM interrupt injection happens > from (host) interrupt context and it would be nice if SRCU supported the > use case. KVM is using SRCU here not really for the "sleepable" part, > but rather due to its faster detection of grace periods, therefore it > is not possible to switch back to RCU, effectively reverting commit > 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING", 2014-01-16). > > However, the docs are painting a worse situation than it actually is. > You can have an SRCU instance that only has users in irq context, and you > can mix process and irq context as long as process context users > disable interrupts. In addition, __srcu_read_unlock() actually uses > this_cpu_dec on both srcutree and srcuclassic. For those two > implementations, only srcu_read_lock() is unsafe. > > When srcuclassic's __srcu_read_unlock() was changed to use this_cpu_dec(), > in commit 5a41344a3d83 ("srcu: Simplify __srcu_read_unlock() via > this_cpu_dec()", 2012-11-29), __srcu_read_lock() did two increments. > Therefore it kept __this_cpu_inc, with preempt_disable/enable in the > caller. srcutree however only does one increment, so on most > architectures it is more efficient for __srcu_read_lock to use > this_cpu_inc, too. 
> > There would be a slowdown if 1) fast this_cpu_inc is not available and > cannot be implemented (this usually means that atomic_inc has implicit > memory barriers), and 2) local_irq_save/restore is slower than disabling > preemption. The main architecture with these constraints is s390, which > however is already paying the price in __srcu_read_unlock and has not > complained. A valid optimization on s390 would be to skip the smp_mb; > AIUI, this_cpu_inc implies a memory barrier (!) due to its implementation. > > Likewise, srcutiny has one increment only in __srcu_read_lock, and a > decrement in __srcu_read_unlock. However, it's not on a percpu variable so > it cannot use this_cpu_inc/dec. This patch changes srcutiny to use > respectively atomic_inc and atomic_dec_return_relaxed. Because srcutiny > is only used for uniprocessor machines, the extra cost of atomic operations > is averted at least on x86, where atomic_inc and atomic_dec_return_relaxed > compile to unlocked inc and xadd instructions respectively. > > Fixes: 719d93cd5f5c ("kvm/irqchip: Speed up KVM_SET_GSI_ROUTING") > Reported-by: Linu Cherian <linuc.decode@xxxxxxxxx> > Suggested-by: Linu Cherian <linuc.decode@xxxxxxxxx> > Cc: kvm@xxxxxxxxxxxxxxx > Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx> Take your choice, or use both. ;-) Reviewed-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx> Tested-by: Paul E. 
McKenney <paulmck@xxxxxxxxxxxxxxxxxx> > Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx> > --- > include/linux/srcutiny.h | 2 +- > kernel/rcu/rcutorture.c | 4 ++-- > kernel/rcu/srcutiny.c | 21 ++++++++++----------- > kernel/rcu/srcutree.c | 5 ++--- > 4 files changed, 15 insertions(+), 17 deletions(-) > > diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h > index 42311ee0334f..7343e3b03087 100644 > --- a/include/linux/srcutiny.h > +++ b/include/linux/srcutiny.h > @@ -27,7 +27,7 @@ > #include <linux/swait.h> > > struct srcu_struct { > - int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ > + atomic_t srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ > struct swait_queue_head srcu_wq; > /* Last srcu_read_unlock() wakes GP. */ > unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ > diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c > index ae6e574d4cf5..fa4e3895ab08 100644 > --- a/kernel/rcu/rcutorture.c > +++ b/kernel/rcu/rcutorture.c > @@ -611,8 +611,8 @@ static void srcu_torture_stats(void) > idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; > pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", > torture_type, TORTURE_FLAG, idx, > - READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), > - READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); > + atomic_read(&srcu_ctlp->srcu_lock_nesting[!idx]), > + atomic_read(&srcu_ctlp->srcu_lock_nesting[idx])); > #endif > } > > diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c > index 36e1f82faed1..e9ba84ac4e0f 100644 > --- a/kernel/rcu/srcutiny.c > +++ b/kernel/rcu/srcutiny.c > @@ -35,8 +35,8 @@ > > static int init_srcu_struct_fields(struct srcu_struct *sp) > { > - sp->srcu_lock_nesting[0] = 0; > - sp->srcu_lock_nesting[1] = 0; > + atomic_set(&sp->srcu_lock_nesting[0], 0); > + atomic_set(&sp->srcu_lock_nesting[1], 0); > init_swait_queue_head(&sp->srcu_wq); > sp->srcu_gp_seq = 0; > rcu_segcblist_init(&sp->srcu_cblist); > @@ -86,7 +86,8 @@ int init_srcu_struct(struct 
srcu_struct *sp) > */ > void cleanup_srcu_struct(struct srcu_struct *sp) > { > - WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); > + WARN_ON(atomic_read(&sp->srcu_lock_nesting[0]) || > + atomic_read(&sp->srcu_lock_nesting[1])); > flush_work(&sp->srcu_work); > WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); > WARN_ON(sp->srcu_gp_running); > @@ -97,7 +98,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) > > /* > * Counts the new reader in the appropriate per-CPU element of the > - * srcu_struct. Must be called from process context. > + * srcu_struct. > * Returns an index that must be passed to the matching srcu_read_unlock(). > */ > int __srcu_read_lock(struct srcu_struct *sp) > @@ -105,21 +106,19 @@ int __srcu_read_lock(struct srcu_struct *sp) > int idx; > > idx = READ_ONCE(sp->srcu_idx); > - WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); > + atomic_inc(&sp->srcu_lock_nesting[idx]); > return idx; > } > EXPORT_SYMBOL_GPL(__srcu_read_lock); > > /* > * Removes the count for the old reader from the appropriate element of > - * the srcu_struct. Must be called from process context. > + * the srcu_struct. > */ > void __srcu_read_unlock(struct srcu_struct *sp, int idx) > { > - int newval = sp->srcu_lock_nesting[idx] - 1; > - > - WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); > - if (!newval && READ_ONCE(sp->srcu_gp_waiting)) > + if (atomic_dec_return_relaxed(&sp->srcu_lock_nesting[idx]) == 0 && > + READ_ONCE(sp->srcu_gp_waiting)) > swake_up(&sp->srcu_wq); > } > EXPORT_SYMBOL_GPL(__srcu_read_unlock); > @@ -148,7 +147,7 @@ void srcu_drive_gp(struct work_struct *wp) > idx = sp->srcu_idx; > WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); > WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ > - swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); > + swait_event(sp->srcu_wq, !atomic_read(&sp->srcu_lock_nesting[idx])); > WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. 
*/ > rcu_seq_end(&sp->srcu_gp_seq); > > diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c > index 3ae8474557df..157654fa436a 100644 > --- a/kernel/rcu/srcutree.c > +++ b/kernel/rcu/srcutree.c > @@ -357,7 +357,7 @@ void cleanup_srcu_struct(struct srcu_struct *sp) > > /* > * Counts the new reader in the appropriate per-CPU element of the > - * srcu_struct. Must be called from process context. > + * srcu_struct. > * Returns an index that must be passed to the matching srcu_read_unlock(). > */ > int __srcu_read_lock(struct srcu_struct *sp) > @@ -365,7 +365,7 @@ int __srcu_read_lock(struct srcu_struct *sp) > int idx; > > idx = READ_ONCE(sp->srcu_idx) & 0x1; > - __this_cpu_inc(sp->sda->srcu_lock_count[idx]); > + this_cpu_inc(sp->sda->srcu_lock_count[idx]); > smp_mb(); /* B */ /* Avoid leaking the critical section. */ > return idx; > } > @@ -375,7 +375,6 @@ int __srcu_read_lock(struct srcu_struct *sp) > * Removes the count for the old reader from the appropriate per-CPU > * element of the srcu_struct. Note that this may well be a different > * CPU than that which was incremented by the corresponding srcu_read_lock(). > - * Must be called from process context. > */ > void __srcu_read_unlock(struct srcu_struct *sp, int idx) > { > -- > 1.8.3.1 > >