From: Kairui Song <kasong@xxxxxxxxxxx> psi_group->parent has the same hierarchy as the cgroup it's in. So just iterate through cgroup instead. By adjusting the iteration logic, save some space in psi_group struct, and the performance is actually better. I see a measurable performance gain using mmtests/perfpipe: (AVG of 100 test, ops/sec, the higher the better) KVM guest on a i7-9700: psi=0 root cgroup 5 levels of cgroup Before: 59221 55352 47821 After: 60100 56036 50884 KVM guest on a Ryzen 9 5900HX: psi=0 root cgroup 5 levels of cgroup Before: 144566 138919 128888 After: 145812 139580 133514 Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx> Signed-off-by: Kairui Song <ryncsn@xxxxxxxxx> --- include/linux/psi_types.h | 1 - kernel/sched/psi.c | 47 ++++++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 1e0a0d7ace3a..4066b846ce4a 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -154,7 +154,6 @@ struct psi_trigger { }; struct psi_group { - struct psi_group *parent; bool enabled; /* Protects data used by the aggregator */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 8ac8b81bfee6..c74f8ce46f81 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -858,15 +858,34 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static inline struct psi_group *task_psi_group(struct task_struct *task) +static inline struct psi_group *psi_iter_first(struct task_struct *task, void **iter) { #ifdef CONFIG_CGROUPS - if (static_branch_likely(&psi_cgroups_enabled)) - return cgroup_psi(task_dfl_cgroup(task)); + if (static_branch_likely(&psi_cgroups_enabled)) { + struct cgroup *cgroup = task_dfl_cgroup(task); + + *iter = cgroup_parent(cgroup); + return cgroup_psi(cgroup); + } #endif return &psi_system; } +static inline struct psi_group *psi_iter_next(void **iter) +{ +#ifdef 
CONFIG_CGROUPS + if (static_branch_likely(&psi_cgroups_enabled)) { + struct cgroup *cgroup = *iter; + + if (cgroup) { + *iter = cgroup_parent(cgroup); + return cgroup_psi(cgroup); + } + } +#endif + return NULL; +} + static void psi_flags_change(struct task_struct *task, int clear, int set) { if (((task->psi_flags & set) || @@ -886,6 +905,7 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; + void *iter; u64 now; if (!task->pid) @@ -895,16 +915,17 @@ void psi_task_change(struct task_struct *task, int clear, int set) now = cpu_clock(cpu); - group = task_psi_group(task); + group = psi_iter_first(task, &iter); do { psi_group_change(group, cpu, clear, set, now, true); - } while ((group = group->parent)); + } while ((group = psi_iter_next(&iter))); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep) { struct psi_group *group, *common = NULL; + void *iter; int cpu = task_cpu(prev); u64 now = cpu_clock(cpu); @@ -915,7 +936,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, * ancestors with @prev, those will already have @prev's * TSK_ONCPU bit set, and we can stop the iteration there. 
*/ - group = task_psi_group(next); + group = psi_iter_first(next, &iter); do { if (per_cpu_ptr(group->pcpu, cpu)->state_mask & PSI_ONCPU) { @@ -924,7 +945,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); - } while ((group = group->parent)); + } while ((group = psi_iter_next(&iter))); } if (prev->pid) { @@ -957,12 +978,12 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, psi_flags_change(prev, clear, set); - group = task_psi_group(prev); + group = psi_iter_first(prev, &iter); do { if (group == common) break; psi_group_change(group, cpu, clear, set, now, wake_clock); - } while ((group = group->parent)); + } while ((group = psi_iter_next(&iter))); /* * TSK_ONCPU is handled up to the common ancestor. If there are * additional state changes, we'd have to clear them on all * cgroups, not just the common one. */ if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; for (; group; group = psi_iter_next(&iter)) psi_group_change(group, cpu, clear, set, now, wake_clock); } } @@ -983,6 +1004,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) { int cpu = task_cpu(task); struct psi_group *group; + void *iter; struct psi_group_cpu *groupc; u64 now; @@ -991,7 +1013,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) now = cpu_clock(cpu); - group = task_psi_group(task); + group = psi_iter_first(task, &iter); do { if (!group->enabled) continue; @@ -1007,7 +1029,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) if (group->poll_states & (1 << PSI_IRQ_FULL)) psi_schedule_poll_work(group, 1, false); - } while ((group = group->parent)); + } while ((group = psi_iter_next(&iter))); } #endif @@ -1089,7 +1111,6 @@ int psi_cgroup_alloc(struct cgroup *cgroup) return -ENOMEM; } group_init(cgroup->psi); - cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } -- 
2.39.1