[PATCH v5 08/18] rcu: Add per-CB tracing for queuing, flush and invocation.

This per-callback tracing is useful for debugging whether call_rcu_lazy()
is working correctly. In the future, BPF tools could also be modified to
read the new information.

The additional tracing is gated by a new CONFIG_RCU_TRACE_CB option and is
kept disabled by default.

Signed-off-by: Joel Fernandes (Google) <joel@xxxxxxxxxxxxxxxxx>
---
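Note (illustrative only, not part of the patch): the new trace fields encode
times as 16-bit deltas from a shared jiffies reference
(cb_debug_jiffies_first). The standalone C sketch below walks through that
arithmetic with made-up jiffies values; encode_delta() and the constants are
hypothetical and assume HZ=1000.

  #include <stdint.h>
  #include <stdio.h>

  /* Mimic the 16-bit jiffies-delta encoding used by the debug fields. */
  static uint16_t encode_delta(unsigned long now, unsigned long first)
  {
  	return (uint16_t)(now - first);
  }

  int main(void)
  {
  	unsigned long first = 1000000UL;	/* cb_debug_jiffies_first */
  	unsigned long queued = first + 250;	/* jiffies at call_rcu() time */
  	unsigned long invoked = first + 10250;	/* jiffies at invocation time */

  	/* Stored in rhp->di.cb_queue_jiff at queuing time. */
  	uint16_t cb_queue_jiff = encode_delta(queued, first);

  	/* Matches the tracepoint: (jiffies - jiffies_first) - cb_queue_jiff. */
  	long queue_exec_latency = (long)(invoked - first) - cb_queue_jiff;

  	printf("queue_exec_latency = %ld jiffies\n", queue_exec_latency);
  	return 0;
  }

With these numbers the callback sat for about 10 seconds (10000 jiffies at
HZ=1000) between queuing and invocation, which is the sort of delay a
lazy/bypass callback is expected to show in the rcu_invoke_callback trace
output.
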
 include/linux/types.h      | 44 ++++++++++++++++++++++++
 include/trace/events/rcu.h | 69 +++++++++++++++++++++++++++++++++++---
 kernel/rcu/Kconfig         | 11 ++++++
 kernel/rcu/rcu_segcblist.c | 21 ++++++++++++
 kernel/rcu/rcu_segcblist.h |  8 +++++
 kernel/rcu/tree.c          | 46 +++++++++++++++++++++++--
 kernel/rcu/tree_nocb.h     | 26 +++++++++++++-
 7 files changed, 217 insertions(+), 8 deletions(-)

diff --git a/include/linux/types.h b/include/linux/types.h
index ea8cf60a8a79..47501fdfd3c1 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -198,10 +198,51 @@ struct ustat {
 	char			f_fpack[6];
 };
 
+#ifdef CONFIG_RCU_TRACE_CB
+/*
+ * Debug information that a caller can store within a callback_head.
+ * It is expected to provide at least 12 bytes before BUILD_BUG starts
+ * complaining.
+ */
+enum cb_debug_flags {
+	CB_DEBUG_KFREE,
+	CB_DEBUG_LAZY,
+	CB_DEBUG_BYPASS,
+
+	// A new non-lazy CB showed up and we decided to not use
+	// the bypass list for it. So we flushed the old ones.
+	CB_DEBUG_NON_LAZY_FLUSHED,
+
+	// We decided to use the bypass list but had to flush
+	// the old bypass CBs because they got too old or too big.
+	CB_DEBUG_BYPASS_FLUSHED,
+	CB_DEBUG_BYPASS_LAZY_FLUSHED,
+
+	// The GP thread flushed the bypass CBs if they got old or big.
+	CB_DEBUG_GPTHREAD_FLUSHED,
+
+	// De-offload from NOCB mode.
+	CB_DEBUG_DEOFFLOAD_FLUSHED,
+
+	// rcu_barrier() flushes lazy/bypass CBs for CB exec ordering.
+	CB_DEBUG_BARRIER_FLUSHED
+};
+
+struct cb_debug_info {
+	// 16-bit jiffies deltas give a range of about 65 seconds at HZ=1000
+	// before wrapping. That's enough for debugging.
+	u16 cb_queue_jiff;
+	u16 first_bp_jiff;
+	u16 cb_flush_jiff;
+	enum cb_debug_flags flags:16;
+};
+#endif
+
 /**
  * struct callback_head - callback structure for use with RCU and task_work
  * @next: next update requests in a list
  * @func: actual update function to call after the grace period.
+ * @di: debug information that can be stored (CONFIG_RCU_TRACE_CB only).
  *
  * The struct is aligned to size of pointer. On most architectures it happens
  * naturally due ABI requirements, but some architectures (like CRIS) have
@@ -220,6 +261,9 @@ struct ustat {
 struct callback_head {
 	struct callback_head *next;
 	void (*func)(struct callback_head *head);
+#ifdef CONFIG_RCU_TRACE_CB
+	struct cb_debug_info di;
+#endif
 } __attribute__((aligned(sizeof(void *))));
 #define rcu_head callback_head
 
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 90b2fb0292cb..dd6baec6745c 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -630,24 +630,85 @@ TRACE_EVENT_RCU(rcu_batch_start,
  */
 TRACE_EVENT_RCU(rcu_invoke_callback,
 
-	TP_PROTO(const char *rcuname, struct rcu_head *rhp),
+	TP_PROTO(const char *rcuname, struct rcu_head *rhp
+#ifdef CONFIG_RCU_TRACE_CB
+		, unsigned long jiffies_first
+#endif
+	),
 
-	TP_ARGS(rcuname, rhp),
+	TP_ARGS(rcuname, rhp
+#ifdef CONFIG_RCU_TRACE_CB
+		, jiffies_first
+#endif
+		),
 
 	TP_STRUCT__entry(
 		__field(const char *, rcuname)
 		__field(void *, rhp)
 		__field(void *, func)
+#ifdef CONFIG_RCU_TRACE_CB
+		__field(u16, cb_debug_flags)
+		__field(int, cb_queue_exec_latency)
+		__array(char, bypass_flush_reason, 32)
+		__field(int, cb_bypass_slack)
+		__field(int, cb_queue_flush_latency)
+#endif
 	),
 
 	TP_fast_assign(
 		__entry->rcuname = rcuname;
 		__entry->rhp = rhp;
 		__entry->func = rhp->func;
+#ifdef CONFIG_RCU_TRACE_CB
+		__entry->cb_debug_flags = rhp->di.flags;
+		__entry->cb_queue_exec_latency =
+			(jiffies - jiffies_first) - rhp->di.cb_queue_jiff;
+
+		/* The following 3 fields are valid only for bypass/lazy CBs. */
+		__entry->cb_bypass_slack =
+			(rhp->di.flags & BIT(CB_DEBUG_BYPASS)) ?
+				rhp->di.cb_queue_jiff - rhp->di.first_bp_jiff : 0;
+		__entry->cb_queue_flush_latency =
+			(rhp->di.flags & BIT(CB_DEBUG_BYPASS)) ?
+				rhp->di.cb_flush_jiff - rhp->di.cb_queue_jiff : 0;
+
+
+		if (__entry->cb_debug_flags & BIT(CB_DEBUG_NON_LAZY_FLUSHED))
+			strcpy(__entry->bypass_flush_reason, "non-lazy");
+		else if (__entry->cb_debug_flags & BIT(CB_DEBUG_BYPASS_FLUSHED))
+			strcpy(__entry->bypass_flush_reason, "bypass");
+		else if (__entry->cb_debug_flags & BIT(CB_DEBUG_BYPASS_LAZY_FLUSHED))
+			strcpy(__entry->bypass_flush_reason, "bypass-lazy");
+		else if (__entry->cb_debug_flags & BIT(CB_DEBUG_GPTHREAD_FLUSHED))
+			strcpy(__entry->bypass_flush_reason, "gpthread");
+		else if (__entry->cb_debug_flags & BIT(CB_DEBUG_BARRIER_FLUSHED))
+			strcpy(__entry->bypass_flush_reason, "rcu_barrier");
+		else if (__entry->cb_debug_flags & BIT(CB_DEBUG_DEOFFLOAD_FLUSHED))
+			strcpy(__entry->bypass_flush_reason, "deoffload");
+#endif
 	),
 
-	TP_printk("%s rhp=%p func=%ps",
-		  __entry->rcuname, __entry->rhp, __entry->func)
+	TP_printk("%s rhp=%p func=%ps"
+#ifdef CONFIG_RCU_TRACE_CB
+			" lazy=%d "
+			"bypass=%d "
+			"flush_reason=%s "
+			"queue_exec_latency=%d "
+			"bypass_slack=%d "
+			"queue_flush_latency=%d"
+#endif
+			,
+		  __entry->rcuname, __entry->rhp, __entry->func
+#ifdef CONFIG_RCU_TRACE_CB
+		,
+		!!(__entry->cb_debug_flags & BIT(CB_DEBUG_LAZY)),
+		!!(__entry->cb_debug_flags & BIT(CB_DEBUG_BYPASS)),
+		__entry->bypass_flush_reason,
+		__entry->cb_queue_exec_latency,
+		__entry->cb_bypass_slack,
+		__entry->cb_queue_flush_latency
+#endif
+	 )
 );
 
 /*
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 3128d01427cb..cd6ff63517f4 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -319,4 +319,15 @@ config RCU_LAZY
 	  To save power, batch RCU callbacks and flush after delay, memory
 	  pressure or callback list growing too big.
 
+config RCU_TRACE_CB
+	bool "Enable additional callback tracing"
+	depends on RCU_LAZY
+	default n
+	help
+	  Enable additional callback tracing for RCU. Currently, only
+	  lazy/bypass callbacks are traced; in the future, this could be
+	  extended to non-lazy callbacks as well. The tracepoint records
+	  detailed information about the callback flushing reason and
+	  execution time, which can also be retrieved by BPF tools.
+
 endmenu # "RCU Subsystem"
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 55b50e592986..8f0b024f7bc5 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -32,6 +32,27 @@ void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp)
 	WRITE_ONCE(rclp->len, rclp->len + 1);
 }
 
+/*
+ * Set the debug flags @flags on all CBs in the unsegmented cblist @rcl and
+ * record @flush_jiff, the flush time expressed as a 16-bit delta from the
+ * shared jiffies reference, in each callback's debug info.
+ */
+#ifdef CONFIG_RCU_TRACE_CB
+void rcu_cblist_set_flush(struct rcu_cblist *rcl,
+			 enum cb_debug_flags flags,
+			 unsigned long flush_jiff)
+{
+	if (!rcl || !rcl->head)
+		return;
+
+	for (struct rcu_head *head = rcl->head;
+			head && head != *(rcl->tail); head = head->next) {
+		head->di.flags |= flags;
+		head->di.cb_flush_jiff = flush_jiff;
+	}
+}
+#endif
+
 /*
  * Flush the second rcu_cblist structure onto the first one, obliterating
  * any contents of the first.  If rhp is non-NULL, enqueue it as the sole
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 431cee212467..ac1f34024b84 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -53,6 +53,14 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp)
 #endif
 }
 
+#ifdef CONFIG_RCU_TRACE_CB
+void rcu_cblist_set_flush(struct rcu_cblist *rcl,
+			 enum cb_debug_flags flags,
+			 unsigned long flush_jiff);
+#else
+#define rcu_cblist_set_flush(...) do { } while (0)
+#endif
+
 static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp,
 					   int flags)
 {
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index aaced29a0a71..8111d9f37621 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2179,6 +2179,12 @@ int rcutree_dead_cpu(unsigned int cpu)
 	return 0;
 }
 
+static unsigned long cb_debug_jiffies_first;
+
+#if defined(CONFIG_RCU_TRACE_CB) && defined(CONFIG_RCU_LAZY)
+extern unsigned long jiffies_till_flush;
+#endif
+
 /*
  * Invoke any RCU callbacks that have made it to the end of their grace
  * period.  Throttle as specified by rdp->blimit.
@@ -2241,7 +2247,12 @@ static void rcu_do_batch(struct rcu_data *rdp)
 		debug_rcu_head_unqueue(rhp);
 
 		rcu_lock_acquire(&rcu_callback_map);
-		trace_rcu_invoke_callback(rcu_state.name, rhp);
+
+		trace_rcu_invoke_callback(rcu_state.name, rhp
+#ifdef CONFIG_RCU_TRACE_CB
+			, READ_ONCE(cb_debug_jiffies_first)
+#endif
+		);
 
 		f = rhp->func;
 		WRITE_ONCE(rhp->func, (rcu_callback_t)0L);
@@ -2736,6 +2747,17 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
 	struct rcu_data *rdp;
 	bool was_alldone;
 
+	/*
+	 * For CB debugging, record a starting jiffies reference while making
+	 * sure the stored deltas do not wrap around. The reference is used
+	 * to calculate 16-bit jiffies deltas. To be safe, reset it once the
+	 * delta exceeds 15 bits' worth.
+	 */
+	if (IS_ENABLED(CONFIG_RCU_TRACE_CB) &&
+		(!READ_ONCE(cb_debug_jiffies_first) ||
+		 (jiffies - READ_ONCE(cb_debug_jiffies_first) > (1 << 15))))
+		WRITE_ONCE(cb_debug_jiffies_first, jiffies);
+
 	/* Misaligned rcu_head! */
 	WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
 
@@ -2771,14 +2793,29 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy)
 
 	check_cb_ovld(rdp);
 
-	if (__is_kvfree_rcu_offset((unsigned long)func))
+#ifdef CONFIG_RCU_TRACE_CB
+	head->di.flags = 0;
+	WARN_ON_ONCE(jiffies - READ_ONCE(cb_debug_jiffies_first) > 65535);
+	head->di.cb_queue_jiff = (u16)(jiffies - READ_ONCE(cb_debug_jiffies_first));
+#endif
+
+	if (__is_kvfree_rcu_offset((unsigned long)func)) {
 		trace_rcu_kvfree_callback(rcu_state.name, head,
 					 (unsigned long)func,
 					 rcu_segcblist_n_cbs(&rdp->cblist));
-	else
+#ifdef CONFIG_RCU_TRACE_CB
+		head->di.flags |= BIT(CB_DEBUG_KFREE);
+#endif
+	} else {
 		trace_rcu_callback(rcu_state.name, head,
 				   rcu_segcblist_n_cbs(&rdp->cblist));
 
+#ifdef CONFIG_RCU_TRACE_CB
+		if (lazy)
+			head->di.flags |= BIT(CB_DEBUG_LAZY);
+#endif
+	}
+
 	if (rcu_nocb_try_bypass(rdp, head, &was_alldone, flags, lazy))
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
@@ -3943,6 +3980,9 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
 	rdp->barrier_head.func = rcu_barrier_callback;
 	debug_rcu_head_queue(&rdp->barrier_head);
 	rcu_nocb_lock(rdp);
+
+	rcu_cblist_set_flush(&rdp->nocb_bypass, BIT(CB_DEBUG_BARRIER_FLUSHED),
+			(jiffies - READ_ONCE(cb_debug_jiffies_first)));
 	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false,
 		     /* wake gp thread */ true));
 	if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 560ba87911c5..ee5924ba2f3b 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -355,6 +355,11 @@ static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	if (rhp)
 		rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
 
+	/* If a new CB arrives with this flush, its laziness must match @lazy. */
+#ifdef CONFIG_RCU_TRACE_CB
+	WARN_ON_ONCE(rhp && lazy != !!(rhp->di.flags & BIT(CB_DEBUG_LAZY)));
+#endif
+
 	/*
 	 * If the new CB requested was a lazy one, queue it onto the main
 	 * ->cblist so we can take advantage of a sooner grade period.
@@ -407,6 +412,10 @@ static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
 	if (!rcu_rdp_is_offloaded(rdp) ||
 	    !rcu_nocb_bypass_trylock(rdp))
 		return;
+
+	rcu_cblist_set_flush(&rdp->nocb_bypass, BIT(CB_DEBUG_GPTHREAD_FLUSHED),
+			     (j - READ_ONCE(cb_debug_jiffies_first)));
+
 	WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j, false));
 }
 
@@ -489,8 +498,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 			trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
 					    TPS("FirstQ"));
 
-		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false, false));
+		rcu_cblist_set_flush(&rdp->nocb_bypass, BIT(CB_DEBUG_NON_LAZY_FLUSHED),
+				     (j - READ_ONCE(cb_debug_jiffies_first)));
 
+		WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j, false, false));
 		WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
 		return false; // Caller must enqueue the callback.
 	}
@@ -503,6 +514,10 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 	    ncbs >= qhimark) {
 		rcu_nocb_lock(rdp);
 
+		rcu_cblist_set_flush(&rdp->nocb_bypass,
+				lazy ? BIT(CB_DEBUG_BYPASS_LAZY_FLUSHED) : BIT(CB_DEBUG_BYPASS_FLUSHED),
+				(j - READ_ONCE(cb_debug_jiffies_first)));
+
 		if (!rcu_nocb_flush_bypass(rdp, rhp, j, lazy, false)) {
 			*was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
 			if (*was_alldone)
@@ -543,6 +558,11 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
 		trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
 	}
 
+#ifdef CONFIG_RCU_TRACE_CB
+	rhp->di.flags |= BIT(CB_DEBUG_BYPASS);
+	rhp->di.first_bp_jiff = READ_ONCE(rdp->nocb_bypass_first) - READ_ONCE(cb_debug_jiffies_first);
+#endif
+
 	rcu_nocb_bypass_unlock(rdp);
 	smp_mb(); /* Order enqueue before wake. */
 
@@ -1156,6 +1176,10 @@ static long rcu_nocb_rdp_deoffload(void *arg)
 	 * return false, which means that future calls to rcu_nocb_try_bypass()
 	 * will refuse to put anything into the bypass.
 	 */
+
+	rcu_cblist_set_flush(&rdp->nocb_bypass, BIT(CB_DEBUG_DEOFFLOAD_FLUSHED),
+			(jiffies - READ_ONCE(cb_debug_jiffies_first)));
+
 	WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies, false, false));
 	/*
 	 * Start with invoking rcu_core() early. This way if the current thread
-- 
2.37.2.789.g6183377224-goog



