On 20.02.2023 12:16, Qi Zheng wrote: > Like global slab shrink, since commit 1cd0bd06093c > ("rcu: Remove CONFIG_SRCU"), it's time to use SRCU > to protect readers who previously held shrinker_rwsem. > > We can test with the following script: > > ``` > DIR="/root/shrinker/memcg/mnt" > > do_create() > { > mkdir /sys/fs/cgroup/memory/test > echo 200M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes > for i in `seq 0 $1`; > do > mkdir /sys/fs/cgroup/memory/test/$i; > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > mkdir -p $DIR/$i; > done > } > > do_mount() > { > for i in `seq $1 $2`; > do > mount -t tmpfs $i $DIR/$i; > done > } > > do_touch() > { > for i in `seq $1 $2`; > do > echo $$ > /sys/fs/cgroup/memory/test/$i/cgroup.procs; > dd if=/dev/zero of=$DIR/$i/file$i bs=1M count=1 & > done > } > > do_create 2000 > do_mount 0 2000 > do_touch 0 1000 > ``` > > Before applying: > > 46.60% [kernel] [k] down_read_trylock > 18.70% [kernel] [k] up_read > 15.44% [kernel] [k] shrink_slab > 4.37% [kernel] [k] _find_next_bit > 2.75% [kernel] [k] xa_load > 2.07% [kernel] [k] idr_find > 1.73% [kernel] [k] do_shrink_slab > 1.42% [kernel] [k] shrink_lruvec > 0.74% [kernel] [k] shrink_node > 0.60% [kernel] [k] list_lru_count_one > > After applying: > > 19.53% [kernel] [k] _find_next_bit > 14.63% [kernel] [k] do_shrink_slab > 14.58% [kernel] [k] shrink_slab > 11.83% [kernel] [k] shrink_lruvec > 9.33% [kernel] [k] __blk_flush_plug > 6.67% [kernel] [k] mem_cgroup_iter > 3.73% [kernel] [k] list_lru_count_one > 2.43% [kernel] [k] shrink_node > 1.96% [kernel] [k] super_cache_count > 1.78% [kernel] [k] __rcu_read_unlock > 1.38% [kernel] [k] __srcu_read_lock > 1.30% [kernel] [k] xas_descend > > We can see that the readers is no longer blocked. > > Signed-off-by: Qi Zheng <zhengqi.arch@xxxxxxxxxxxxx> > --- > mm/vmscan.c | 56 ++++++++++++++++++++++++++++++----------------------- > 1 file changed, 32 insertions(+), 24 deletions(-) > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 95a3d6ddc6c1..dc47396ecd0e 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -57,6 +57,7 @@ > #include <linux/khugepaged.h> > #include <linux/rculist_nulls.h> > #include <linux/random.h> > +#include <linux/srcu.h> > > #include <asm/tlbflush.h> > #include <asm/div64.h> > @@ -221,8 +222,21 @@ static inline int shrinker_defer_size(int nr_items) > static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, > int nid) > { > - return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, > - lockdep_is_held(&shrinker_rwsem)); > + return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, > + &shrinker_srcu, > + lockdep_is_held(&shrinker_rwsem)); > +} > + > +static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, > + int nid) > +{ > + return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, > + &shrinker_srcu); > +} > + > +static void free_shrinker_info_rcu(struct rcu_head *head) > +{ > + kvfree(container_of(head, struct shrinker_info, rcu)); > } > > static int expand_one_shrinker_info(struct mem_cgroup *memcg, > @@ -257,7 +271,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, > defer_size - old_defer_size); > > rcu_assign_pointer(pn->shrinker_info, new); > - kvfree_rcu(old, rcu); > + call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); > } > > return 0; > @@ -350,13 +364,14 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) > { > if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { > struct shrinker_info *info; > + int srcu_idx; > > - rcu_read_lock(); > - info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > + info = shrinker_info_srcu(memcg, nid); > /* Pairs with smp mb in shrink_slab() */ > smp_mb__before_atomic(); > set_bit(shrinker_id, info->map); > - rcu_read_unlock(); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > } > } > > @@ -370,7 +385,6 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) > return -ENOSYS; > > down_write(&shrinker_rwsem); > - /* This may call shrinker, so it must use down_read_trylock() */ > id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); > if (id < 0) > goto unlock; > @@ -404,7 +418,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, > { > struct shrinker_info *info; > > - info = shrinker_info_protected(memcg, nid); > + info = shrinker_info_srcu(memcg, nid); > return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); > } > > @@ -413,13 +427,13 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, > { > struct shrinker_info *info; > > - info = shrinker_info_protected(memcg, nid); > + info = shrinker_info_srcu(memcg, nid); > return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); > } > > void reparent_shrinker_deferred(struct mem_cgroup *memcg) > { > - int i, nid; > + int i, nid, srcu_idx; > long nr; > struct mem_cgroup *parent; > struct shrinker_info *child_info, *parent_info; > @@ -429,16 +443,16 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) > parent = root_mem_cgroup; > > /* Prevent from concurrent shrinker_info expand */ > - down_read(&shrinker_rwsem); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > for_each_node(nid) { > - child_info = shrinker_info_protected(memcg, nid); > - parent_info = shrinker_info_protected(parent, nid); > + child_info = shrinker_info_srcu(memcg, nid); > + parent_info = shrinker_info_srcu(parent, nid); > for (i = 0; i < shrinker_nr_max; i++) { > nr = atomic_long_read(&child_info->nr_deferred[i]); > atomic_long_add(nr, &parent_info->nr_deferred[i]); > } > } > - up_read(&shrinker_rwsem); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > } > > static bool cgroup_reclaim(struct scan_control *sc) > @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, > { > struct shrinker_info *info; > unsigned long ret, freed = 0; > + int srcu_idx; > int i; > > if (!mem_cgroup_online(memcg)) > return 0; > > - if (!down_read_trylock(&shrinker_rwsem)) > - return 0; > - > - info = shrinker_info_protected(memcg, nid); > + srcu_idx = srcu_read_lock(&shrinker_srcu); > + info = shrinker_info_srcu(memcg, nid); > if (unlikely(!info)) > goto unlock; There is shrinker_nr_max dereference under this hunk. It's not in the patch: for_each_set_bit(i, info->map, shrinker_nr_max) { Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :( It looks like we should save size of info->map as a new member of struct shrinker_info. > > @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, > set_shrinker_bit(memcg, nid, i); > } > freed += ret; > - > - if (rwsem_is_contended(&shrinker_rwsem)) { > - freed = freed ? : 1; > - break; > - } > } > unlock: > - up_read(&shrinker_rwsem); > + srcu_read_unlock(&shrinker_srcu, srcu_idx); > return freed; > } > #else /* CONFIG_MEMCG */