Patch "sched: psi: fix bogus pressure spikes from aggregation race" has been added to the 6.10-stable tree

Sasha Levin <sashal@xxxxxxxxxx> · Tue, 8 Oct 2024 02:04:25 -0400

This is a note to let you know that I've just added the patch titled

    sched: psi: fix bogus pressure spikes from aggregation race

to the 6.10-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     sched-psi-fix-bogus-pressure-spikes-from-aggregation.patch
and it can be found in the queue-6.10 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit 5ec405f1ef7d68de99d26714cbd36db097f02de3
Author: Johannes Weiner <hannes@xxxxxxxxxxx>
Date:   Thu Oct 3 07:29:05 2024 -0400

    sched: psi: fix bogus pressure spikes from aggregation race
    
    [ Upstream commit 3840cbe24cf060ea05a585ca497814609f5d47d1 ]
    
    Brandon reports sporadic, non-sensical spikes in cumulative pressure
    time (total=) when reading cpu.pressure at a high rate. This is due to
    a race condition between reader aggregation and tasks changing states.
    
    While it affects all states and all resources captured by PSI, in
    practice it most likely triggers with CPU pressure, since scheduling
    events are so frequent compared to other resource events.
    
    The race context is the live snooping of ongoing stalls during a
    pressure read. The read aggregates per-cpu records for stalls that
    have concluded, but will also incorporate ad-hoc the duration of any
    active state that hasn't been recorded yet. This is important to get
    timely measurements of ongoing stalls. Those ad-hoc samples are
    calculated on-the-fly up to the current time on that CPU; since the
    stall hasn't concluded, it's expected that this is the minimum amount
    of stall time that will enter the per-cpu records once it does.
    
    The problem is that the path that concludes the state uses a CPU clock
    read that is not synchronized against aggregators; the clock is read
    outside of the seqlock protection. This allows aggregators to race and
    snoop a stall with a longer duration than will actually be recorded.
    
    With the recorded stall time being less than the last snapshot
    remembered by the aggregator, a subsequent sample will underflow and
    observe a bogus delta value, resulting in an erratic jump in pressure.
    
    Fix this by moving the clock read of the state change into the seqlock
    protection. This ensures no aggregation can snoop live stalls past the
    time that's recorded when the state concludes.
    
    Reported-by: Brandon Duffany <brandon@xxxxxxxxxxxxx>
    Link: https://bugzilla.kernel.org/show_bug.cgi?id=219194
    Link: https://lore.kernel.org/lkml/20240827121851.GB438928@xxxxxxxxxxx/
    Fixes: df77430639c9 ("psi: Reduce calls to sched_clock() in psi")
    Cc: stable@xxxxxxxxxxxxxxx
    Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
    Reviewed-by: Chengming Zhou <chengming.zhou@xxxxxxxxx>
    Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 507d7b8d79afa..8d4a3d9de4797 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -765,13 +765,14 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
 }
 
 static void psi_group_change(struct psi_group *group, int cpu,
-			     unsigned int clear, unsigned int set, u64 now,
+			     unsigned int clear, unsigned int set,
 			     bool wake_clock)
 {
 	struct psi_group_cpu *groupc;
 	unsigned int t, m;
 	enum psi_states s;
 	u32 state_mask;
+	u64 now;
 
 	lockdep_assert_rq_held(cpu_rq(cpu));
 	groupc = per_cpu_ptr(group->pcpu, cpu);
@@ -786,6 +787,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	 * SOME and FULL time these may have resulted in.
 	 */
 	write_seqcount_begin(&groupc->seq);
+	now = cpu_clock(cpu);
 
 	/*
 	 * Start with TSK_ONCPU, which doesn't have a corresponding
@@ -899,18 +901,15 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 {
 	int cpu = task_cpu(task);
 	struct psi_group *group;
-	u64 now;
 
 	if (!task->pid)
 		return;
 
 	psi_flags_change(task, clear, set);
 
-	now = cpu_clock(cpu);
-
 	group = task_psi_group(task);
 	do {
-		psi_group_change(group, cpu, clear, set, now, true);
+		psi_group_change(group, cpu, clear, set, true);
 	} while ((group = group->parent));
 }
 
@@ -919,7 +918,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 {
 	struct psi_group *group, *common = NULL;
 	int cpu = task_cpu(prev);
-	u64 now = cpu_clock(cpu);
 
 	if (next->pid) {
 		psi_flags_change(next, 0, TSK_ONCPU);
@@ -936,7 +934,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 				break;
 			}
 
-			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
+			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
 		} while ((group = group->parent));
 	}
 
@@ -974,7 +972,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		do {
 			if (group == common)
 				break;
-			psi_group_change(group, cpu, clear, set, now, wake_clock);
+			psi_group_change(group, cpu, clear, set, wake_clock);
 		} while ((group = group->parent));
 
 		/*
@@ -986,7 +984,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 		if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
 			clear &= ~TSK_ONCPU;
 			for (; group; group = group->parent)
-				psi_group_change(group, cpu, clear, set, now, wake_clock);
+				psi_group_change(group, cpu, clear, set, wake_clock);
 		}
 	}
 }
@@ -997,8 +995,8 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
 	int cpu = task_cpu(curr);
 	struct psi_group *group;
 	struct psi_group_cpu *groupc;
-	u64 now, irq;
 	s64 delta;
+	u64 irq;
 
 	if (static_branch_likely(&psi_disabled))
 		return;
@@ -1011,7 +1009,6 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
 	if (prev && task_psi_group(prev) == group)
 		return;
 
-	now = cpu_clock(cpu);
 	irq = irq_time_read(cpu);
 	delta = (s64)(irq - rq->psi_irq_time);
 	if (delta < 0)
@@ -1019,12 +1016,15 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
 	rq->psi_irq_time = irq;
 
 	do {
+		u64 now;
+
 		if (!group->enabled)
 			continue;
 
 		groupc = per_cpu_ptr(group->pcpu, cpu);
 
 		write_seqcount_begin(&groupc->seq);
+		now = cpu_clock(cpu);
 
 		record_times(groupc, now);
 		groupc->times[PSI_IRQ_FULL] += delta;
@@ -1223,11 +1223,9 @@ void psi_cgroup_restart(struct psi_group *group)
 	for_each_possible_cpu(cpu) {
 		struct rq *rq = cpu_rq(cpu);
 		struct rq_flags rf;
-		u64 now;
 
 		rq_lock_irq(rq, &rf);
-		now = cpu_clock(cpu);
-		psi_group_change(group, cpu, 0, 0, now, true);
+		psi_group_change(group, cpu, 0, 0, true);
 		rq_unlock_irq(rq, &rf);
 	}
 }