Patch "sched/psi: Fix avgs_work re-arm in psi_avgs_work()" has been added to the 6.1-stable tree

Sasha Levin <sashal@xxxxxxxxxx> · Sun, 23 Jul 2023 21:33:41 -0400

This is a note to let you know that I've just added the patch titled

    sched/psi: Fix avgs_work re-arm in psi_avgs_work()

to the 6.1-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     sched-psi-fix-avgs_work-re-arm-in-psi_avgs_work.patch
and it can be found in the queue-6.1 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit cd6a5ae395de7987446d45c2944bc8de4a8917f7
Author: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx>
Date:   Fri Oct 14 19:05:51 2022 +0800

    sched/psi: Fix avgs_work re-arm in psi_avgs_work()
    
    [ Upstream commit 2fcd7bbae90a6d844da8660a9d27079281dfbba2 ]
    
    Pavan reported that the PSI avgs_work idle shutoff is not working
    at all, because the PSI_NONIDLE condition is observed in
    psi_avgs_work()->collect_percpu_times()->get_recent_times() even if
    only the kworker running avgs_work is on the CPU.
    
    Although commit 1b69ac6b40eb ("psi: fix aggregation idle shut-off")
    avoided the ping-pong wake problem when the worker sleeps, psi_avgs_work()
    will still always re-arm the avgs_work, so shutoff is not working.
    
    This patch uses PSI_STATE_RESCHEDULE to flag whether to re-arm
    avgs_work in get_recent_times(). For the current CPU, we re-arm
    avgs_work only when (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0);
    for other CPUs we can just check the PSI_NONIDLE delta. The new flag
    is only used in psi_avgs_work(), so we check in get_recent_times()
    that current_work() is avgs_work.
    
    One potential problem is that the brief period of non-idle time
    incurred between the aggregation run and the kworker's dequeue will
    be stranded in the per-cpu buckets until avgs_work runs next time.
    The buckets can hold 4s worth of time, and future activity will wake
    the avgs_work with a 2s delay, giving us 2s worth of data we can leave
    behind when shutting off the avgs_work. If the kworker runs other work
    items after avgs_work is shut off and doesn't have any scheduler
    activity for 2s, this may be a problem.
    
    Reported-by: Pavan Kondeti <quic_pkondeti@xxxxxxxxxxx>
    Signed-off-by: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx>
    Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
    Acked-by: Johannes Weiner <hannes@xxxxxxxxxxx>
    Acked-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
    Tested-by: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx>
    Link: https://lore.kernel.org/r/20221014110551.22695-1-zhouchengming@xxxxxxxxxxxxx
    Stable-dep-of: aff037078eca ("sched/psi: use kernfs polling functions for PSI trigger polling")
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 14a1ebb74e11f..1e0a0d7ace3af 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -72,6 +72,9 @@ enum psi_states {
 /* Use one bit in the state mask to track TSK_ONCPU */
 #define PSI_ONCPU	(1 << NR_PSI_STATES)
 
+/* Flag whether to re-arm avgs_work, see details in get_recent_times() */
+#define PSI_STATE_RESCHEDULE	(1 << (NR_PSI_STATES + 1))
+
 enum psi_aggregators {
 	PSI_AVGS = 0,
 	PSI_POLL,
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e83c321461cf4..02e011cabe917 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -243,6 +243,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
 			     u32 *pchanged_states)
 {
 	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+	int current_cpu = raw_smp_processor_id();
+	unsigned int tasks[NR_PSI_TASK_COUNTS];
 	u64 now, state_start;
 	enum psi_states s;
 	unsigned int seq;
@@ -257,6 +259,8 @@ static void get_recent_times(struct psi_group *group, int cpu,
 		memcpy(times, groupc->times, sizeof(groupc->times));
 		state_mask = groupc->state_mask;
 		state_start = groupc->state_start;
+		if (cpu == current_cpu)
+			memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
 	} while (read_seqcount_retry(&groupc->seq, seq));
 
 	/* Calculate state time deltas against the previous snapshot */
@@ -281,6 +285,28 @@ static void get_recent_times(struct psi_group *group, int cpu,
 		if (delta)
 			*pchanged_states |= (1 << s);
 	}
+
+	/*
+	 * When collect_percpu_times() from the avgs_work, we don't want to
+	 * re-arm avgs_work when all CPUs are IDLE. But the current CPU running
+	 * this avgs_work is never IDLE, cause avgs_work can't be shut off.
+	 * So for the current CPU, we need to re-arm avgs_work only when
+	 * (NR_RUNNING > 1 || NR_IOWAIT > 0 || NR_MEMSTALL > 0), for other CPUs
+	 * we can just check PSI_NONIDLE delta.
+	 */
+	if (current_work() == &group->avgs_work.work) {
+		bool reschedule;
+
+		if (cpu == current_cpu)
+			reschedule = tasks[NR_RUNNING] +
+				     tasks[NR_IOWAIT] +
+				     tasks[NR_MEMSTALL] > 1;
+		else
+			reschedule = *pchanged_states & (1 << PSI_NONIDLE);
+
+		if (reschedule)
+			*pchanged_states |= PSI_STATE_RESCHEDULE;
+	}
 }
 
 static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -416,7 +442,6 @@ static void psi_avgs_work(struct work_struct *work)
 	struct delayed_work *dwork;
 	struct psi_group *group;
 	u32 changed_states;
-	bool nonidle;
 	u64 now;
 
 	dwork = to_delayed_work(work);
@@ -427,7 +452,6 @@ static void psi_avgs_work(struct work_struct *work)
 	now = sched_clock();
 
 	collect_percpu_times(group, PSI_AVGS, &changed_states);
-	nonidle = changed_states & (1 << PSI_NONIDLE);
 	/*
 	 * If there is task activity, periodically fold the per-cpu
 	 * times and feed samples into the running averages. If things
@@ -438,7 +462,7 @@ static void psi_avgs_work(struct work_struct *work)
 	if (now >= group->avg_next_update)
 		group->avg_next_update = update_averages(group, now);
 
-	if (nonidle) {
+	if (changed_states & PSI_STATE_RESCHEDULE) {
 		schedule_delayed_work(dwork, nsecs_to_jiffies(
 				group->avg_next_update - now) + 1);
 	}