[PATCH] sched/core: Fix wrong warning check in rq_clock_start_loop_update()

Hao Jia <jiahao.os@xxxxxxxxxxxxx> · Wed, 13 Sep 2023 16:24:24 +0800

Igor Raits and Bagas Sanjaya report a RQCF_ACT_SKIP leak warning.
Link: https://lore.kernel.org/all/a5dd536d-041a-2ce9-f4b7-64d8d85c86dc@xxxxxxxxx

Commit ebb83d84e49b54 ("sched/core: Avoid multiple
calling update_rq_clock() in __cfsb_csd_unthrottle()")
add RQCF_ACT_SKIP leak warning in rq_clock_start_loop_update().
But this warning is inaccurate and may be triggered
incorrectly in the following situations:

    CPU0                                      CPU1

__schedule()
  *rq->clock_update_flags <<= 1;*   unregister_fair_sched_group()
  pick_next_task_fair+0x4a/0x410      destroy_cfs_bandwidth()
    newidle_balance+0x115/0x3e0       for_each_possible_cpu(i) *i=0*
      rq_unpin_lock(this_rq, rf)      __cfsb_csd_unthrottle()
      raw_spin_rq_unlock(this_rq)
                                      rq_lock(*CPU0_rq*, &rf)
                                      rq_clock_start_loop_update()
                                      rq->clock_update_flags & RQCF_ACT_SKIP <--

      raw_spin_rq_lock(this_rq)

So we remove this wrong check. Add assert_clock_updated() to
check that rq clock has been updated before calling
rq_clock_start_loop_update(). And we cannot unconditionally set
rq->clock_update_flags to RQCF_ACT_SKIP in rq_clock_start_loop_update().
So we use the variable rq_clock_flags in rq_clock_start_loop_update()
to record the previous state of rq->clock_update_flags.
Correspondingly, restore rq->clock_update_flags through
rq_clock_flags in rq_clock_stop_loop_update() to prevent
losing its previous information.

Fixes: ebb83d84e49b ("sched/core: Avoid multiple calling update_rq_clock() in __cfsb_csd_unthrottle()")
Cc: stable@xxxxxxxxxxxxxxx
Reported-by: Igor Raits <igor.raits@xxxxxxxxx>
Reported-by: Bagas Sanjaya <bagasdotme@xxxxxxxxx>
Signed-off-by: Hao Jia <jiahao.os@xxxxxxxxxxxxx>
---
 kernel/sched/fair.c  | 10 ++++++----
 kernel/sched/sched.h | 12 +++++++-----
 2 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8dbff6e7ad4f..a64a002573d9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5679,6 +5679,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_SMP
 static void __cfsb_csd_unthrottle(void *arg)
 {
+	unsigned int rq_clock_flags;
 	struct cfs_rq *cursor, *tmp;
 	struct rq *rq = arg;
 	struct rq_flags rf;
@@ -5691,7 +5692,7 @@ static void __cfsb_csd_unthrottle(void *arg)
 	 * Do it once and skip the potential next ones.
 	 */
 	update_rq_clock(rq);
-	rq_clock_start_loop_update(rq);
+	rq_clock_start_loop_update(rq, &rq_clock_flags);
 
 	/*
 	 * Since we hold rq lock we're safe from concurrent manipulation of
@@ -5712,7 +5713,7 @@ static void __cfsb_csd_unthrottle(void *arg)
 
 	rcu_read_unlock();
 
-	rq_clock_stop_loop_update(rq);
+	rq_clock_stop_loop_update(rq, &rq_clock_flags);
 	rq_unlock(rq, &rf);
 }
 
@@ -6230,6 +6231,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq)
 /* cpu offline callback */
 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
+	unsigned int rq_clock_flags;
 	struct task_group *tg;
 
 	lockdep_assert_rq_held(rq);
@@ -6239,7 +6241,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 	 * set_rq_offline(), so we should skip updating
 	 * the rq clock again in unthrottle_cfs_rq().
 	 */
-	rq_clock_start_loop_update(rq);
+	rq_clock_start_loop_update(rq, &rq_clock_flags);
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(tg, &task_groups, list) {
@@ -6264,7 +6266,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 	}
 	rcu_read_unlock();
 
-	rq_clock_stop_loop_update(rq);
+	rq_clock_stop_loop_update(rq, &rq_clock_flags);
 }
 
 bool cfs_task_bw_constrained(struct task_struct *p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 04846272409c..ff2864f202f5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1558,20 +1558,22 @@ static inline void rq_clock_cancel_skipupdate(struct rq *rq)
  * when using list_for_each_entry_*)
  * rq_clock_start_loop_update() can be called after updating the clock
  * once and before iterating over the list to prevent multiple update.
+ * And use @rq_clock_flags to record the previous state of rq->clock_update_flags.
  * After the iterative traversal, we need to call rq_clock_stop_loop_update()
- * to clear RQCF_ACT_SKIP of rq->clock_update_flags.
+ * to restore rq->clock_update_flags through @rq_clock_flags.
  */
-static inline void rq_clock_start_loop_update(struct rq *rq)
+static inline void rq_clock_start_loop_update(struct rq *rq, unsigned int *rq_clock_flags)
 {
 	lockdep_assert_rq_held(rq);
-	SCHED_WARN_ON(rq->clock_update_flags & RQCF_ACT_SKIP);
+	assert_clock_updated(rq);
+	*rq_clock_flags = rq->clock_update_flags;
 	rq->clock_update_flags |= RQCF_ACT_SKIP;
 }
 
-static inline void rq_clock_stop_loop_update(struct rq *rq)
+static inline void rq_clock_stop_loop_update(struct rq *rq, unsigned int *rq_clock_flags)
 {
 	lockdep_assert_rq_held(rq);
-	rq->clock_update_flags &= ~RQCF_ACT_SKIP;
+	rq->clock_update_flags = *rq_clock_flags;
 }
 
 struct rq_flags {
-- 
2.39.2