On 22.02.2023 11:21, Qi Zheng wrote: > > > On 2023/2/22 16:16, Qi Zheng wrote: >> Hi Kirill, >> >> On 2023/2/22 05:43, Kirill Tkhai wrote: >>> On 20.02.2023 12:16, Qi Zheng wrote: >>>> Like global slab shrink, since commit 1cd0bd06093c<...> >>>> static bool cgroup_reclaim(struct scan_control *sc) >>>> @@ -891,15 +905,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>>> { >>>> struct shrinker_info *info; >>>> unsigned long ret, freed = 0; >>>> + int srcu_idx; >>>> int i; >>>> if (!mem_cgroup_online(memcg)) >>>> return 0; >>>> - if (!down_read_trylock(&shrinker_rwsem)) >>>> - return 0; >>>> - >>>> - info = shrinker_info_protected(memcg, nid); >>>> + srcu_idx = srcu_read_lock(&shrinker_srcu); >>>> + info = shrinker_info_srcu(memcg, nid); >>>> if (unlikely(!info)) >>>> goto unlock; >>> >>> There is shrinker_nr_max dereference under this hunk. It's not in the patch: >>> >>> for_each_set_bit(i, info->map, shrinker_nr_max) { >>> >>> Since shrinker_nr_max may grow in parallel, this leads to access beyond allocated memory :( >> >> Oh, indeed. >> >>> >>> It looks like we should save size of info->map as a new member of struct shrinker_info. >> >> Agree, then we only traverse info->map_size each time. Like below: >> >> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h >> index b6eda2ab205d..f1b5d2803007 100644 >> --- a/include/linux/memcontrol.h >> +++ b/include/linux/memcontrol.h >> @@ -97,6 +97,7 @@ struct shrinker_info { >> struct rcu_head rcu; >> atomic_long_t *nr_deferred; >> unsigned long *map; >> + int map_size; Sure, like this. The only thing (after rethinking) I want to say is that although "size" was my suggestion, it now occurs to me that the name "size" implies a size in bytes. Possibly something like map_nr_max would be better here. 
>> }; >> >> struct lruvec_stats_percpu { >> diff --git a/mm/vmscan.c b/mm/vmscan.c >> index f94bfe540675..dd07eb107915 100644 >> --- a/mm/vmscan.c >> +++ b/mm/vmscan.c >> @@ -239,14 +239,20 @@ static void free_shrinker_info_rcu(struct rcu_head *head) >> kvfree(container_of(head, struct shrinker_info, rcu)); >> } >> >> -static int expand_one_shrinker_info(struct mem_cgroup *memcg, >> - int map_size, int defer_size, >> - int old_map_size, int old_defer_size) >> +static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_nr_max, >> + int old_nr_max) >> { >> + int map_size, defer_size, old_map_size, old_defer_size; >> struct shrinker_info *new, *old; >> struct mem_cgroup_per_node *pn; >> int nid; >> - int size = map_size + defer_size; >> + int size; >> + >> + map_size = shrinker_map_size(new_nr_max); >> + defer_size = shrinker_defer_size(new_nr_max); >> + old_map_size = shrinker_map_size(shrinker_nr_max); >> + old_defer_size = shrinker_defer_size(shrinker_nr_max); > > Perhaps these should still be calculated outside the loop as before. Yeah, for me it's also better. 
>> + size = map_size + defer_size; >> >> for_each_node(nid) { >> pn = memcg->nodeinfo[nid]; >> @@ -261,6 +267,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, >> >> new->nr_deferred = (atomic_long_t *)(new + 1); >> new->map = (void *)new->nr_deferred + defer_size; >> + new->map_size = new_nr_max; >> >> /* map: set all old bits, clear all new bits */ >> memset(new->map, (int)0xff, old_map_size); >> @@ -310,6 +317,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) >> } >> info->nr_deferred = (atomic_long_t *)(info + 1); >> info->map = (void *)info->nr_deferred + defer_size; >> + info->map_size = shrinker_nr_max; >> rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); >> } >> mutex_unlock(&shrinker_mutex); >> @@ -327,8 +335,6 @@ static int expand_shrinker_info(int new_id) >> { >> int ret = 0; >> int new_nr_max = new_id + 1; >> - int map_size, defer_size = 0; >> - int old_map_size, old_defer_size = 0; >> struct mem_cgroup *memcg; >> >> if (!need_expand(new_nr_max)) >> @@ -339,15 +345,9 @@ static int expand_shrinker_info(int new_id) >> >> lockdep_assert_held(&shrinker_mutex); >> >> - map_size = shrinker_map_size(new_nr_max); >> - defer_size = shrinker_defer_size(new_nr_max); >> - old_map_size = shrinker_map_size(shrinker_nr_max); >> - old_defer_size = shrinker_defer_size(shrinker_nr_max); >> - >> memcg = mem_cgroup_iter(NULL, NULL, NULL); >> do { >> - ret = expand_one_shrinker_info(memcg, map_size, defer_size, >> - old_map_size, old_defer_size); >> + ret = expand_one_shrinker_info(memcg, new_nr_max, shrinker_nr_max); >> if (ret) { >> mem_cgroup_iter_break(NULL, memcg); >> goto out; >> @@ -912,7 +912,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >> if (unlikely(!info)) >> goto unlock; >> >> - for_each_set_bit(i, info->map, shrinker_nr_max) { >> + for_each_set_bit(i, info->map, info->map_size) { >> struct shrink_control sc = { >> .gfp_mask = gfp_mask, >> .nid = nid, >> >> I will send the v2. 
>> >> Thanks, >> Qi >> >>> >>>> @@ -949,14 +962,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, >>>> set_shrinker_bit(memcg, nid, i); >>>> } >>>> freed += ret; >>>> - >>>> - if (rwsem_is_contended(&shrinker_rwsem)) { >>>> - freed = freed ? : 1; >>>> - break; >>>> - } >>>> } >>>> unlock: >>>> - up_read(&shrinker_rwsem); >>>> + srcu_read_unlock(&shrinker_srcu, srcu_idx); >>>> return freed; >>>> } >>>> #else /* CONFIG_MEMCG */ >>> >> >