Re: [PATCH tip/core/rcu 3/4] kernel/smp: Provide CSD lock timeout diagnostics

"Paul E. McKenney" <paulmck@xxxxxxxxxx> · Fri, 4 Sep 2020 11:09:02 -0700

On Fri, Sep 04, 2020 at 11:19:53AM +0800, Boqun Feng wrote:
> On Mon, Aug 31, 2020 at 11:14:16AM -0700, paulmck@xxxxxxxxxx wrote:
> > From: "Paul E. McKenney" <paulmck@xxxxxxxxxx>
> > 
> > This commit causes csd_lock_wait() to emit diagnostics when a CPU
> > fails to respond quickly enough to one of the smp_call_function()
> > family of function calls.  These diagnostics are enabled by a new
> > CSD_LOCK_WAIT_DEBUG Kconfig option that depends on DEBUG_KERNEL.
> > 
> > This commit was inspired by an earlier patch by Josef Bacik.
> > 
> > [ paulmck: Fix for syzbot+0f719294463916a3fc0e@xxxxxxxxxxxxxxxxxxxxxxxxx ]
> > [ paulmck: Fix KASAN use-after-free issue reported by Qian Cai. ]
> > [ paulmck: Fix botched nr_cpu_ids comparison per Dan Carpenter. ]
> > [ paulmck: Apply Peter Zijlstra feedback. ]
> > Link: https://lore.kernel.org/lkml/00000000000042f21905a991ecea@xxxxxxxxxx
> > Link: https://lore.kernel.org/lkml/0000000000002ef21705a9933cf3@xxxxxxxxxx
> > Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> > Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> > Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> > Cc: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
> > Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxx>
> > ---
> >  kernel/smp.c      | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  lib/Kconfig.debug |  11 +++++
> >  2 files changed, 143 insertions(+)
> > 
> > diff --git a/kernel/smp.c b/kernel/smp.c
> > index a47382d..c5d3188 100644
> > --- a/kernel/smp.c
> > +++ b/kernel/smp.c
> > @@ -20,6 +20,9 @@
> >  #include <linux/sched.h>
> >  #include <linux/sched/idle.h>
> >  #include <linux/hypervisor.h>
> > +#include <linux/sched/clock.h>
> > +#include <linux/nmi.h>
> > +#include <linux/sched/debug.h>
> >  
> >  #include "smpboot.h"
> >  #include "sched/smp.h"
> > @@ -96,6 +99,103 @@ void __init call_function_init(void)
> >  	smpcfd_prepare_cpu(smp_processor_id());
> >  }
> >  
> > +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG
> > +
> > +static DEFINE_PER_CPU(call_single_data_t *, cur_csd);
> > +static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
> > +static DEFINE_PER_CPU(void *, cur_csd_info);
> > +
> > +#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC)
> > +atomic_t csd_bug_count = ATOMIC_INIT(0);
> > +
> > +/* Record current CSD work for current CPU, NULL to erase. */
> > +static void csd_lock_record(call_single_data_t *csd)
> > +{
> > +	if (!csd) {
> > +		smp_mb(); /* NULL cur_csd after unlock. */
> > +		__this_cpu_write(cur_csd, NULL);
> > +		return;
> > +	}
> > +	__this_cpu_write(cur_csd_func, csd->func);
> > +	__this_cpu_write(cur_csd_info, csd->info);
> > +	smp_wmb(); /* func and info before csd. */
> > +	__this_cpu_write(cur_csd, csd);
> > +	smp_mb(); /* Update cur_csd before function call. */
> > +		  /* Or before unlock, as the case may be. */
> > +}
> > +
> > +static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd)
> > +{
> > +	unsigned int csd_type;
> > +
> > +	csd_type = CSD_TYPE(csd);
> > +	if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC)
> > +		return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */
> > +	return -1;
> > +}
> > +
> > +/*
> > + * Complain if too much time spent waiting.  Note that only
> > + * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU,
> > + * so waiting on other types gets much less information.
> > + */
> > +static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id)
> > +{
> > +	int cpu = -1;
> > +	int cpux;
> > +	bool firsttime;
> > +	u64 ts2, ts_delta;
> > +	call_single_data_t *cpu_cur_csd;
> > +	unsigned int flags = READ_ONCE(csd->flags);
> > +
> > +	if (!(flags & CSD_FLAG_LOCK)) {
> > +		if (!unlikely(*bug_id))
> > +			return true;
> > +		cpu = csd_lock_wait_getcpu(csd);
> > +		pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
> > +			 *bug_id, raw_smp_processor_id(), cpu);
> > +		return true;
> > +	}
> > +
> > +	ts2 = sched_clock();
> > +	ts_delta = ts2 - *ts1;
> > +	if (likely(ts_delta <= CSD_LOCK_TIMEOUT))
> > +		return false;
> > +
> > +	firsttime = !*bug_id;
> > +	if (firsttime)
> > +		*bug_id = atomic_inc_return(&csd_bug_count);
> > +	cpu = csd_lock_wait_getcpu(csd);
> > +	if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu))
> > +		cpux = 0;
> > +	else
> > +		cpux = cpu;
> > +	cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */
> 
> Should we re-read csd->flags and check CSD_FLAG_LOCK again to improve
> the accuracy or to avoid unnecessary stack dumping? Because cpux may
> just finish the csd in question and clear the CSD_FLAG_LOCK bit, so it's
> late but no so late ;-) Maybe we can put the following check:
> 
> > +	pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n",
> > +		 firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0,
> > +		 cpu, csd->func, csd->info);
> 
> 	/*
> 	 * Re-read and check the flag.
> 	 *
> 	 * If destination cpu just handled the csd in question, we want
> 	 * to report the timeout but may not want to do stack dumping on
> 	 * either src or dst cpu, because in that case the stacks will
> 	 * be pretty much pointless. Since csd_lock_record(NULL) will
> 	 * issue a smp_mb() before setting cur_csd, we are guaranteed
> 	 * to observe the unlocked state of ->flags if the previous
> 	 * smp_load_acquire() observe the csd_lock_record(NULL) of
> 	 * handling the csd in question as follow:
> 	 * 
> 	 * CPU 0			CPU 1
> 	 *				<queue csd>
> 	 * <handle other csds>		csd_lock_wait_toolong()
> 	 * 				flags = READ_ONCE(csd->flags); // locked
> 	 * <handle the csd of CPU 1>
> 	 * csd_lock_record(csd);
> 	 * func(info);
> 	 * csd_unlock(csd); // unlocked
> 	 * csd_lock_record(NULL);
> 	 *				<ts_delat > CSD_LOCK_TIMEOUT> // going to report the timeout
> 	 *				cpu_cur_csd = smp_load_acquire(...);
> 	 * <handle other csds>
> 	 * csd_lock_record(other_csd);
> 	 *				flags = READ_ONCE(csd->flags); // observe the unlocked state
> 	 *				                               // no matter cpu_cur_csd is NULL or other_csd
> 	 * 
> 	 */
> 	flags = READ_ONCE(csd->flags);
> 	if (!(flags & CSD_FLAG_LOCK)) {
> 		pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n",
> 			 *bug_id, raw_smp_processor_id(), cpux);
> 		return true;
> 	}
> 
> Thoughts?

it might eventually be necessary to do this, but we do need to see
a real problem before layering on more code.  After all, the episode
could just as easily end right after the re-read.  And thus far, aside
from scftorture, the episodes in the wild were of the type that never
does end.  Plus the stack dump will take quite a bit more time than the
interval between the initial read and your proposed re-read.  So again,
we need to see a real problem before doing something like this.

							Thanx, Paul