Re: current_thread_info() vs task_thread_info(current)

"Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx> · Mon, 18 Jul 2011 20:46:58 -0700

On Mon, Jul 18, 2011 at 08:04:04PM -0700, Paul E. McKenney wrote:
> On Tue, Jul 19, 2011 at 10:57:37AM +1000, Benjamin Herrenschmidt wrote:
> > On Mon, 2011-07-18 at 17:39 -0700, Paul E. McKenney wrote:
> > > > Hrm, no I don't see that happening no. The preempt count when
> > > exiting an
> > > > irq or softirq stack should be the exact same as when entering it,
> > > which
> > > > is why we don't bother copying it over. Do you see any case where
> > > that
> > > > wouldn't hold ?
> > > 
> > > Nope, other than seeing preempt_count() transition from zero to three
> > > across a spin_unlock_irqrestore() for no good reason that I could see.
> > 
> > Do you have a nice repro-case ? :-)
> > 
> > That sounds really nasty ... smells really like something bad's
> > happening from an interrupt, but we don't copy back the preempt-count
> > from the interrupt stacks at all, so that's really really odd.
> 
> Good question...  The system I reproduced on four times over the
> weekend is out of commission.  Trying the same test on another system
> with a minimal patch -- will let you know how it goes.

OK, the ABAT system trillian reproduces this.  Here is the repeat-by:

1.	Apply the patch shown below to 3.0-rc7.

2.	Build the system with the following config:

	CONFIG_RCU_TRACE=n
	CONFIG_NO_HZ=n
	CONFIG_SMP=y
	CONFIG_RCU_FANOUT=4
	CONFIG_NR_CPUS=8
	CONFIG_RCU_FANOUT_EXACT=n
	CONFIG_HOTPLUG_CPU=n
	CONFIG_SUSPEND=n
	CONFIG_HIBERNATION=n
	CONFIG_PREEMPT_NONE=n
	CONFIG_PREEMPT_VOLUNTARY=n
	CONFIG_PREEMPT=y
	#CHECK#CONFIG_TREE_PREEMPT_RCU=y
	CONFIG_RCU_TORTURE_TEST=m
	CONFIG_MODULE_UNLOAD=y
	CONFIG_SYSFS_DEPRECATED_V2=y
	CONFIG_IKCONFIG=y
	CONFIG_IKCONFIG_PROC=y
	CONFIG_PRINTK_TIME=y

3.	Boot the system and do the following:

	modprobe rcutorture torture_type=rcu_expedited stat_interval=30 test_no_idle_hz

4.	Within a few seconds (12.7 milliseconds on trillian), you
	should get something like the following in dmesg:

	rcu_expedited-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
	BUG: kernel/rcutree_plugin.h:802: preemption disabled
	BUG: scheduling while atomic: rcu_torture_fak/9422/0x00000003//825
	Modules linked in: rcutorture [last unloaded: scsi_wait_scan]
	Call Trace:
	[c0000000bdc93940] [c000000000015104] .show_stack+0x74/0x1c0 (unreliable)
	[c0000000bdc939f0] [c00000000007b1d8] .__schedule_bug+0x88/0x90
	[c0000000bdc93a70] [c000000000687f2c] .schedule+0x94c/0xa80
	[c0000000bdc93d30] [c000000000105e04] .synchronize_rcu_expedited+0x3a4/0x550
	[c0000000bdc93e20] [d000000001511a08] .rcu_torture_fakewriter+0xc8/0x190 [rcutorture]
	[c0000000bdc93ed0] [c0000000000b49dc] .kthread+0xbc/0xd0
	[c0000000bdc93f90] [c000000000021fe4] .kernel_thread+0x54/0x70

Lines 802 of kernel/rcutree_plugin.h is the one marked below with
/*-----*/:

	current->rcu_debug_loc = __LINE__;
	rcu_check_preempt(__LINE__);
	synchronize_sched_expedited();
	rcu_check_preempt(__LINE__);

	raw_spin_lock_irqsave(&rsp->onofflock, flags);

	/* Initialize ->expmask for all non-leaf rcu_node structures. */
	current->rcu_debug_loc = __LINE__;
	rcu_check_preempt(__LINE__);  /*-----*/
	rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
		rnp->expmask = rnp->qsmaskinit;
		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
	}

So preempt_count() was zero just before the raw_spin_lock_irqsave()
and 3 just after.  The scheduling-while-atomic bug is from the
wait_event() a few lines further down.

							Thanx, Paul

------------------------------------------------------------------------

rcu: decrease rcu_report_exp_rnp coupling with scheduler

PREEMPT_RCU read-side critical sections blocking an expedited grace
period invoke rcu_report_exp_rnp().  When the last such critical section
has completed, rcu_report_exp_rnp() invokes the scheduler to wake up the
task that invoked synchronize_rcu_expedited() -- needlessly holding the
root rcu_node structure's lock while doing so, thus needlessly providing
a way for RCU and the scheduler to deadlock.

This commit therefore releases the root rcu_node structure's lock before
calling wake_up().  And also adds a bunch of diagnostics.

Reported-by: Ed Tomlinson <edt@xxxxxx>
Signed-off-by: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 496770a..3501f3f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1258,6 +1258,7 @@ struct task_struct {
 #endif /* #ifdef CONFIG_PREEMPT_RCU */
 #ifdef CONFIG_TREE_PREEMPT_RCU
 	struct rcu_node *rcu_blocked_node;
+	int rcu_debug_loc;
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 #ifdef CONFIG_RCU_BOOST
 	struct rt_mutex *rcu_boost_mutex;
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 14dc7dd..e182224 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -696,8 +696,10 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	for (;;) {
 		if (!sync_rcu_preempt_exp_done(rnp))
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 			break;
 		if (rnp->parent == NULL) {
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 			wake_up(&sync_rcu_preempt_exp_wq);
 			break;
 		}
@@ -707,7 +709,6 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
 		raw_spin_lock(&rnp->lock); /* irqs already disabled */
 		rnp->expmask &= ~mask;
 	}
-	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 }
 
 /*
@@ -735,6 +736,19 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
 		rcu_report_exp_rnp(rsp, rnp);
 }
 
+static inline void rcu_check_preempt(int lineno)
+{
+	static int warned;
+
+	if (warned)
+		return;
+	if (preempt_count() != 0) {
+		warned = 1;
+		printk(KERN_ERR "BUG: %s:%d: preemption disabled\n",
+		       __FILE__, lineno);
+	}
+}
+
 /*
  * Wait for an rcu-preempt grace period, but expedite it.  The basic idea
  * is to invoke synchronize_sched_expedited() to push all the tasks to
@@ -748,10 +762,13 @@ void synchronize_rcu_expedited(void)
 	long snap;
 	int trycount = 0;
 
+	rcu_check_preempt(__LINE__);
 	smp_mb(); /* Caller's modifications seen first by other CPUs. */
 	snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
 	smp_mb(); /* Above access cannot bleed into critical section. */
 
+	current->rcu_debug_loc = __LINE__;
+
 	/*
 	 * Acquire lock, falling back to synchronize_rcu() if too many
 	 * lock-acquisition failures.  Of course, if someone does the
@@ -761,6 +778,8 @@ void synchronize_rcu_expedited(void)
 		if (trycount++ < 10)
 			udelay(trycount * num_online_cpus());
 		else {
+			rcu_check_preempt(__LINE__);
+			current->rcu_debug_loc = __LINE__;
 			synchronize_rcu();
 			return;
 		}
@@ -771,27 +790,39 @@ void synchronize_rcu_expedited(void)
 		goto unlock_mb_ret; /* Others did our work for us. */
 
 	/* force all RCU readers onto ->blkd_tasks lists. */
+	current->rcu_debug_loc = __LINE__;
+	rcu_check_preempt(__LINE__);
 	synchronize_sched_expedited();
+	rcu_check_preempt(__LINE__);
 
 	raw_spin_lock_irqsave(&rsp->onofflock, flags);
 
 	/* Initialize ->expmask for all non-leaf rcu_node structures. */
+	current->rcu_debug_loc = __LINE__;
+	rcu_check_preempt(__LINE__);
 	rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
 		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 		rnp->expmask = rnp->qsmaskinit;
 		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 	}
+	rcu_check_preempt(__LINE__);
 
 	/* Snapshot current state of ->blkd_tasks lists. */
+	current->rcu_debug_loc = __LINE__;
+	rcu_check_preempt(__LINE__);
 	rcu_for_each_leaf_node(rsp, rnp)
 		sync_rcu_preempt_exp_init(rsp, rnp);
+	rcu_check_preempt(__LINE__);
 	if (NUM_RCU_NODES > 1)
 		sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
+	rcu_check_preempt(__LINE__);
 
 	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
 
 	/* Wait for snapshotted ->blkd_tasks lists to drain. */
+	rcu_check_preempt(__LINE__);
 	rnp = rcu_get_root(rsp);
+	current->rcu_debug_loc = __LINE__;
 	wait_event(sync_rcu_preempt_exp_wq,
 		   sync_rcu_preempt_exp_done(rnp));
 
@@ -802,6 +833,7 @@ unlock_mb_ret:
 	mutex_unlock(&sync_rcu_preempt_exp_mutex);
 mb_ret:
 	smp_mb(); /* ensure subsequent action seen after grace period. */
+	current->rcu_debug_loc = 0;
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 9769c75..a37c9bf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4135,6 +4135,8 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
+DECLARE_PER_CPU(int, rcu_debug_loc);
+
 /*
  * Print scheduling while atomic bug:
  */
@@ -4142,8 +4144,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
 {
 	struct pt_regs *regs = get_irq_regs();
 
-	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
-		prev->comm, prev->pid, preempt_count());
+	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x//%d\n",
+		prev->comm, prev->pid, preempt_count(),
+		current->rcu_debug_loc);
 
 	debug_show_held_locks(prev);
 	print_modules();
--
To unsubscribe from this list: send the line "unsubscribe linux-arch" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html