Separate locks can be used to eliminate contention between subsystems that make use of rstat. The base stats also get their own lock. Where applicable, check whether the subsystem pointer is set to determine whether the given cgroup_subsys_state is the base css, and choose which lock to take accordingly. Signed-off-by: JP Kobryn <inwardvessel@xxxxxxxxx> --- include/linux/cgroup-defs.h | 2 + kernel/cgroup/rstat.c | 92 +++++++++++++++++++++++++------------ 2 files changed, 65 insertions(+), 29 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1932f8ae7995..4d87519ff023 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -790,6 +790,8 @@ struct cgroup_subsys { * specifies the mask of subsystems that this one depends on. */ unsigned int depends_on; + + spinlock_t rstat_lock; }; extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 4381eb9ac426..958bdccf0359 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -9,8 +9,9 @@ #include <trace/events/cgroup.h> -static DEFINE_SPINLOCK(cgroup_rstat_lock); -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); +static DEFINE_SPINLOCK(cgroup_rstat_base_lock); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_base_cpu_lock); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock[CGROUP_SUBSYS_COUNT]); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); @@ -86,7 +87,7 @@ void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, __bpf_kfunc void cgroup_rstat_updated(struct cgroup_subsys_state *css, int cpu) { struct cgroup *cgrp = css->cgroup; - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); + raw_spinlock_t *cpu_lock; unsigned long flags; /* @@ -100,6 +101,11 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup_subsys_state *css, int cpu) if (data_race(css_rstat_cpu(css, cpu)->updated_next)) return; + if (css->ss) + cpu_lock = 
per_cpu_ptr(&cgroup_rstat_cpu_lock[css->ss->id], cpu); + else + cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); /* put @cgrp and all ancestors on the corresponding updated lists */ @@ -207,11 +213,16 @@ static struct cgroup_subsys_state *cgroup_rstat_push_children( static struct cgroup_subsys_state *cgroup_rstat_updated_list( struct cgroup_subsys_state *root, int cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup_rstat_cpu *rstatc = css_rstat_cpu(root, cpu); struct cgroup_subsys_state *head = NULL, *parent, *child; + raw_spinlock_t *cpu_lock; unsigned long flags; + if (root->ss) + cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock[root->ss->id], cpu); + else + cpu_lock = per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root->cgroup, false); /* Return NULL if this subtree is not on-list */ @@ -285,37 +296,44 @@ __bpf_hook_end(); * number processed last. 
*/ static inline void __cgroup_rstat_lock(struct cgroup_subsys_state *css, - int cpu_in_loop) - __acquires(&cgroup_rstat_lock) + spinlock_t *lock, int cpu_in_loop) + __acquires(lock) { struct cgroup *cgrp = css->cgroup; bool contended; - contended = !spin_trylock_irq(&cgroup_rstat_lock); + contended = !spin_trylock_irq(lock); if (contended) { trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); - spin_lock_irq(&cgroup_rstat_lock); + spin_lock_irq(lock); } trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); } static inline void __cgroup_rstat_unlock(struct cgroup_subsys_state *css, - int cpu_in_loop) - __releases(&cgroup_rstat_lock) + spinlock_t *lock, int cpu_in_loop) + __releases(lock) { struct cgroup *cgrp = css->cgroup; trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); - spin_unlock_irq(&cgroup_rstat_lock); + spin_unlock_irq(lock); } /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css) - __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) + __releases(&css->ss->rstat_lock) __acquires(&css->ss->rstat_lock) { + spinlock_t *lock; int cpu; - lockdep_assert_held(&cgroup_rstat_lock); + if (!css->ss) { + pr_warn("cannot use generic flush on base subsystem\n"); + return; + } + + lock = &css->ss->rstat_lock; + lockdep_assert_held(lock); for_each_possible_cpu(cpu) { struct cgroup_subsys_state *pos = cgroup_rstat_updated_list(css, cpu); @@ -334,11 +352,11 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css) } /* play nice and yield if necessary */ - if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - __cgroup_rstat_unlock(css, cpu); + if (need_resched() || spin_needbreak(lock)) { + __cgroup_rstat_unlock(css, lock, cpu); if (!cond_resched()) cpu_relax(); - __cgroup_rstat_lock(css, cpu); + __cgroup_rstat_lock(css, lock, cpu); } } } @@ -358,11 +376,22 @@ static void cgroup_rstat_flush_locked(struct cgroup_subsys_state *css) */ __bpf_kfunc void 
cgroup_rstat_flush(struct cgroup_subsys_state *css) { + spinlock_t *lock; + + if (!css->ss) { + int cpu; + + for_each_possible_cpu(cpu) + cgroup_base_stat_flush(css->cgroup, cpu); + return; + } + might_sleep(); - __cgroup_rstat_lock(css, -1); + lock = &css->ss->rstat_lock; + __cgroup_rstat_lock(css, lock, -1); cgroup_rstat_flush_locked(css); - __cgroup_rstat_unlock(css, -1); + __cgroup_rstat_unlock(css, lock, -1); } /** @@ -374,11 +403,11 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup_subsys_state *css) * * This function may block. */ -void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css) - __acquires(&cgroup_rstat_lock) +static void cgroup_rstat_base_flush_hold(struct cgroup_subsys_state *css) + __acquires(&cgroup_rstat_base_lock) { might_sleep(); - __cgroup_rstat_lock(css, -1); + __cgroup_rstat_lock(css, &cgroup_rstat_base_lock, -1); cgroup_rstat_flush_locked(css); } @@ -386,10 +415,10 @@ void cgroup_rstat_flush_hold(struct cgroup_subsys_state *css) * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() * @cgrp: cgroup used by tracepoint */ -void cgroup_rstat_flush_release(struct cgroup_subsys_state *css) - __releases(&cgroup_rstat_lock) +static void cgroup_rstat_base_flush_release(struct cgroup_subsys_state *css) + __releases(&cgroup_rstat_base_lock) { - __cgroup_rstat_unlock(css, -1); + __cgroup_rstat_unlock(css, &cgroup_rstat_base_lock, -1); } int cgroup_rstat_init(struct cgroup_subsys_state *css) @@ -435,10 +464,15 @@ void cgroup_rstat_exit(struct cgroup_subsys_state *css) void __init cgroup_rstat_boot(void) { - int cpu; + struct cgroup_subsys *ss; + int cpu, ssid; + + for_each_possible_cpu(cpu) { + raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_base_cpu_lock, cpu)); - for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); + for_each_subsys(ss, ssid) + raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock[ssid], cpu)); + } } /* @@ -629,12 +663,12 @@ void cgroup_base_stat_cputime_show(struct seq_file 
*seq) u64 usage, utime, stime, ntime; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush_hold(css); + cgroup_rstat_base_flush_hold(css); usage = cgrp->bstat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); ntime = cgrp->bstat.ntime; - cgroup_rstat_flush_release(css); + cgroup_rstat_base_flush_release(css); } else { /* cgrp->bstat of root is not actually used, reuse it */ root_cgroup_cputime(&cgrp->bstat); -- 2.47.1