Like global slab shrink, this commit also uses the refcount+RCU method to make memcg slab shrink lockless.

We can reproduce the down_read_trylock() hotspot through the following script:

```
DIR="/root/shrinker/memcg/mnt"

do_create()
{
    mkdir -p /sys/fs/cgroup/memory/test
    mkdir -p /sys/fs/cgroup/perf_event/test
    echo 4G > /sys/fs/cgroup/memory/test/memory.limit_in_bytes
    for i in `seq 0 $1`;
    do
        mkdir -p /sys/fs/cgroup/memory/test/$i;
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
        mkdir -p $DIR/$i;
    done
}

do_mount()
{
    for i in `seq $1 $2`;
    do
        mount -t tmpfs $i $DIR/$i;
    done
}

do_touch()
{
    for i in `seq $1 $2`;
    do
        echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs;
        echo $$ > /sys/fs/cgroup/perf_event/test/cgroup.procs;
        dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 &
    done
}

case "$1" in
  touch)
    do_touch $2 $3
    ;;
  test)
    do_create 4000
    do_mount 0 4000
    do_touch 0 3000
    ;;
  *)
    exit 1
    ;;
esac
```

Save the above script, then run the test and touch commands. We can then use the following perf command to view hotspots:

perf top -U -F 999 [-g]

1) Before applying this patchset:

```
  35.34%  [kernel]  [k] down_read_trylock
  18.44%  [kernel]  [k] shrink_slab
  15.98%  [kernel]  [k] pv_native_safe_halt
  15.08%  [kernel]  [k] up_read
   5.33%  [kernel]  [k] idr_find
   2.71%  [kernel]  [k] _find_next_bit
   2.21%  [kernel]  [k] shrink_node
   1.29%  [kernel]  [k] shrink_lruvec
   0.66%  [kernel]  [k] do_shrink_slab
   0.33%  [kernel]  [k] list_lru_count_one
   0.33%  [kernel]  [k] __radix_tree_lookup
   0.25%  [kernel]  [k] mem_cgroup_iter

-   82.19%    19.49%  [kernel]  [k] shrink_slab
   - 62.00% shrink_slab
        36.37% down_read_trylock
        15.52% up_read
        5.48% idr_find
        3.38% _find_next_bit
      + 0.98% do_shrink_slab
```

2) After applying this patchset:

```
  46.83%  [kernel]  [k] shrink_slab
  20.52%  [kernel]  [k] pv_native_safe_halt
   8.85%  [kernel]  [k] do_shrink_slab
   7.71%  [kernel]  [k] _find_next_bit
   1.72%  [kernel]  [k] xas_descend
   1.70%  [kernel]  [k] shrink_node
   1.44%  [kernel]  [k] shrink_lruvec
   1.43%  [kernel]  [k] mem_cgroup_iter
   1.28%  [kernel]  [k] xas_load
   0.89%  [kernel]  [k] super_cache_count
   0.84%  [kernel]  [k] xas_start
   0.66%  [kernel]  [k] list_lru_count_one

-   65.50%    40.44%  [kernel]  [k] shrink_slab
   - 22.96% shrink_slab
        13.11% _find_next_bit
      - 9.91% do_shrink_slab
         - 1.59% super_cache_count
              0.92% list_lru_count_one
```

We can see that the first perf hotspot becomes shrink_slab, which is what we expect.
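For context, the post-patch traversal in shrink_slab_memcg() condenses to the sketch below. This is a simplified illustration, not the exact kernel code: it assumes the shrinker_try_get()/shrinker_put() refcounting helpers introduced elsewhere in this series, and it omits the bit-clearing, SHRINK_EMPTY, and SHRINKER_NONSLAB handling visible in the diff:

```c
static unsigned long shrink_slab_memcg_sketch(gfp_t gfp_mask, int nid,
					      struct mem_cgroup *memcg,
					      int priority)
{
	struct shrinker_info *info;
	unsigned long freed = 0;
	int i = 0;

again:
	rcu_read_lock();
	info = shrinker_info_rcu(memcg, nid);	/* RCU-protected lookup */
	if (unlikely(!info))
		goto unlock;

	for_each_set_bit_from(i, info->map, info->map_nr_max) {
		struct shrink_control sc = {
			.gfp_mask = gfp_mask,
			.nid = nid,
			.memcg = memcg,
		};
		struct shrinker *shrinker = idr_find(&shrinker_idr, i);

		/* Skip shrinkers that are mid-unregistration. */
		if (!shrinker || !shrinker_try_get(shrinker))
			continue;

		/*
		 * The refcount now pins the shrinker, so it is safe to
		 * leave the RCU read-side critical section before the
		 * (possibly sleeping) do_shrink_slab() call.
		 */
		rcu_read_unlock();

		freed += do_shrink_slab(&sc, shrinker, priority);

		shrinker_put(shrinker);

		/*
		 * The old shrinker_info may have been freed by
		 * expand_one_shrinker_info() while we were outside the
		 * RCU section, so restart the walk from the next bit.
		 */
		i++;
		goto again;
	}
unlock:
	rcu_read_unlock();
	return freed;
}
```

The trade-off of dropping shrinker_rwsem is that both the shrinker and the shrinker_info can now disappear underneath us: the per-shrinker refcount handles the former, and restarting the for_each_set_bit_from() walk after every do_shrink_slab() call handles the latter.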
Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx>
---
 mm/vmscan.c | 58 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 767569698946..357a1f2ad690 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -213,6 +213,12 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
 					 lockdep_is_held(&shrinker_rwsem));
 }
 
+static struct shrinker_info *shrinker_info_rcu(struct mem_cgroup *memcg,
+					       int nid)
+{
+	return rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+}
+
 static int expand_one_shrinker_info(struct mem_cgroup *memcg,
 				    int map_size, int defer_size,
 				    int old_map_size, int old_defer_size,
@@ -339,7 +345,7 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 		struct shrinker_info *info;
 
 		rcu_read_lock();
-		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+		info = shrinker_info_rcu(memcg, nid);
 		if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) {
 			/* Pairs with smp mb in shrink_slab() */
 			smp_mb__before_atomic();
@@ -359,7 +365,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 		return -ENOSYS;
 
 	down_write(&shrinker_rwsem);
-	/* This may call shrinker, so it must use down_read_trylock() */
 	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
 	if (id < 0)
 		goto unlock;
@@ -392,18 +397,28 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
 				   struct mem_cgroup *memcg)
 {
 	struct shrinker_info *info;
+	long nr_deferred;
 
-	info = shrinker_info_protected(memcg, nid);
-	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+	rcu_read_lock();
+	info = shrinker_info_rcu(memcg, nid);
+	nr_deferred = atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+	rcu_read_unlock();
+
+	return nr_deferred;
 }
 
 static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
 				  struct mem_cgroup *memcg)
 {
 	struct shrinker_info *info;
+	long nr_deferred;
+
+	rcu_read_lock();
+	info = shrinker_info_rcu(memcg, nid);
+	nr_deferred = atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+	rcu_read_unlock();
 
-	info = shrinker_info_protected(memcg, nid);
-	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+	return nr_deferred;
 }
 
 void reparent_shrinker_deferred(struct mem_cgroup *memcg)
@@ -955,19 +970,18 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 {
 	struct shrinker_info *info;
 	unsigned long ret, freed = 0;
-	int i;
+	int i = 0;
 
 	if (!mem_cgroup_online(memcg))
 		return 0;
 
-	if (!down_read_trylock(&shrinker_rwsem))
-		return 0;
-
-	info = shrinker_info_protected(memcg, nid);
+again:
+	rcu_read_lock();
+	info = shrinker_info_rcu(memcg, nid);
 	if (unlikely(!info))
 		goto unlock;
 
-	for_each_set_bit(i, info->map, info->map_nr_max) {
+	for_each_set_bit_from(i, info->map, info->map_nr_max) {
 		struct shrink_control sc = {
 			.gfp_mask = gfp_mask,
 			.nid = nid,
@@ -982,6 +996,10 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 			continue;
 		}
 
+		if (!shrinker_try_get(shrinker))
+			continue;
+		rcu_read_unlock();
+
 		/* Call non-slab shrinkers even though kmem is disabled */
 		if (!memcg_kmem_online() &&
 		    !(shrinker->flags & SHRINKER_NONSLAB))
@@ -1014,13 +1032,19 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 		}
 		freed += ret;
 
-		if (rwsem_is_contended(&shrinker_rwsem)) {
-			freed = freed ? : 1;
-			break;
-		}
+		shrinker_put(shrinker);
+
+		/*
+		 * We have already exited the read-side of rcu critical section
+		 * before calling do_shrink_slab(), the shrinker_info may be
+		 * released in expand_one_shrinker_info(), so restart the
+		 * iteration.
+		 */
+		i++;
+		goto again;
 	}
 unlock:
-	up_read(&shrinker_rwsem);
+	rcu_read_unlock();
 	return freed;
 }
 #else /* CONFIG_MEMCG */
-- 
2.30.2