On 13.05.2018 19:47, Vladimir Davydov wrote:
> On Thu, May 10, 2018 at 12:52:36PM +0300, Kirill Tkhai wrote:
>> Imagine a big node with many cpus, memory cgroups and containers.
>> Suppose we have 200 containers, every container has 10 mounts
>> and 10 cgroups. Container tasks don't touch foreign containers'
>> mounts. If there is intensive page writing, and global reclaim
>> happens, a writing task has to iterate over all memcgs to shrink
>> slab, before it's able to go to shrink_page_list().
>>
>> Iteration over all the memcg slabs is very expensive:
>> the task has to visit 200 * 10 = 2000 shrinkers
>> for every memcg, and since there are 2000 memcgs,
>> the total calls are 2000 * 2000 = 4000000.
>>
>> So, the shrinker makes 4 million do_shrink_slab() calls
>> just to try to isolate SWAP_CLUSTER_MAX pages in one
>> of the actively writing memcgs via shrink_page_list().
>> I've observed a node spending almost 100% of its time in
>> the kernel, making useless iterations over already shrunk slabs.
>>
>> This patch adds a bitmap of memcg-aware shrinkers to the memcg.
>> The size of the bitmap depends on bitmap_nr_ids, and during
>> the memcg's life it's maintained to be big enough to fit
>> bitmap_nr_ids shrinkers. Every bit in the map corresponds to
>> a shrinker id.
>>
>> Next patches will maintain a set bit only for really charged
>> memcgs. This will allow shrink_slab() to improve its
>> performance in a significant way. See the last patch for
>> the numbers.
>>
>> Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
>> ---
>>  include/linux/memcontrol.h |   21 ++++
>>  mm/memcontrol.c            |  116 ++++++++++++++++++++++++++++++++++++++++++++
>>  mm/vmscan.c                |   16 ++++++
>>  3 files changed, 152 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 6cbea2f25a87..e5e7e0fc7158 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -105,6 +105,17 @@ struct lruvec_stat {
>>  	long count[NR_VM_NODE_STAT_ITEMS];
>>  };
>>
>> +#ifdef CONFIG_MEMCG_SHRINKER
>> +/*
>> + * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
>> + * which have elements charged to this memcg.
>> + */
>> +struct memcg_shrinker_map {
>> +	struct rcu_head rcu;
>> +	unsigned long map[0];
>> +};
>> +#endif /* CONFIG_MEMCG_SHRINKER */
>> +
>
> AFAIR we don't normally ifdef structure definitions.
>
>>  /*
>>   * per-zone information in memory controller.
>>   */
>> @@ -118,6 +129,9 @@ struct mem_cgroup_per_node {
>>
>>  	struct mem_cgroup_reclaim_iter	iter[DEF_PRIORITY + 1];
>>
>> +#ifdef CONFIG_MEMCG_SHRINKER
>> +	struct memcg_shrinker_map __rcu	*shrinker_map;
>> +#endif
>>  	struct rb_node		tree_node;	/* RB tree node */
>>  	unsigned long		usage_in_excess;/* Set to the value by which */
>>  					/* the soft limit is exceeded*/
>> @@ -1255,4 +1269,11 @@ static inline void memcg_put_cache_ids(void)
>>
>>  #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
>>
>> +#ifdef CONFIG_MEMCG_SHRINKER
>
>> +#define MEMCG_SHRINKER_MAP(memcg, nid)	(memcg->nodeinfo[nid]->shrinker_map)
>
> I don't really like this helper macro. Accessing shrinker_map directly
> looks cleaner IMO.
>
>> +
>> +extern int memcg_shrinker_nr_max;
>
> As I've mentioned before, the capacity of the shrinker map should be a
> private business of memcontrol.c IMHO. We shouldn't use it in vmscan.c
> as the max shrinker id; instead we should introduce another variable
> for this, private to vmscan.c.
>
>> +extern int memcg_expand_shrinker_maps(int old_id, int id);
>
> ...
> Then this function would take just one argument, max id, and would
> update the shrinker_map capacity if necessary in memcontrol.c under the
> corresponding mutex, which would look much more readable IMHO, as all
> shrinker_map related manipulations would be isolated in memcontrol.c.
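
For concreteness, the single-argument interface suggested here could
look something like this (only a sketch, using the names suggested
further down in this mail; the body reuses memcg_expand_one_shrinker_map()
from the patch, everything else is assumed):

	/* mm/memcontrol.c */
	static int memcg_shrinker_map_capacity;	/* map size in bytes, private */
	static DEFINE_MUTEX(memcg_shrinker_map_mutex);

	int memcg_expand_shrinker_maps(int new_id)
	{
		int size, old_size, ret = 0;
		struct mem_cgroup *memcg;

		/* Round the capacity up to whole unsigned longs */
		size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
		old_size = memcg_shrinker_map_capacity;
		if (size <= old_size)
			return 0;

		mutex_lock(&memcg_shrinker_map_mutex);
		if (!root_mem_cgroup)
			goto unlock;	/* too early, no memcgs exist yet */

		for_each_mem_cgroup(memcg) {
			if (mem_cgroup_is_root(memcg))
				continue;
			ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
			if (ret) {
				mem_cgroup_iter_break(NULL, memcg);
				goto unlock;
			}
		}
	unlock:
		if (!ret)
			memcg_shrinker_map_capacity = size;
		mutex_unlock(&memcg_shrinker_map_mutex);
		return ret;
	}

vmscan.c would then only call memcg_expand_shrinker_maps(id) after a
successful idr_alloc() and would know nothing about the map capacity.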
>
>> +#endif /* CONFIG_MEMCG_SHRINKER */
>> +
>>  #endif /* _LINUX_MEMCONTROL_H */
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 3df3efa7ff40..18e0fdf302a9 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -322,6 +322,116 @@ struct workqueue_struct *memcg_kmem_cache_wq;
>>
>>  #endif /* !CONFIG_SLOB */
>>
>> +#ifdef CONFIG_MEMCG_SHRINKER
>> +int memcg_shrinker_nr_max;
>
> memcg_shrinker_map_capacity, maybe?
>
>> +static DEFINE_MUTEX(shrinkers_nr_max_mutex);
>
> memcg_shrinker_map_mutex?
>
>> +
>> +static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
>> +{
>> +	kvfree(container_of(head, struct memcg_shrinker_map, rcu));
>> +}
>> +
>> +static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
>> +					 int size, int old_size)
>
> If you followed my advice and made the shrinker_map_capacity private to
> memcontrol.c, you wouldn't need to pass old_size here either, just max
> shrinker id.
>
>> +{
>> +	struct memcg_shrinker_map *new, *old;
>> +	int nid;
>> +
>> +	lockdep_assert_held(&shrinkers_nr_max_mutex);
>> +
>> +	for_each_node(nid) {
>> +		old = rcu_dereference_protected(MEMCG_SHRINKER_MAP(memcg, nid), true);
>> +		/* Not yet online memcg */
>> +		if (old_size && !old)
>> +			return 0;
>> +
>> +		new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
>> +		if (!new)
>> +			return -ENOMEM;
>> +
>> +		/* Set all old bits, clear all new bits */
>> +		memset(new->map, (int)0xff, old_size);
>> +		memset((void *)new->map + old_size, 0, size - old_size);
>> +
>> +		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
>> +		if (old)
>> +			call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
>> +{
>> +	struct mem_cgroup_per_node *pn;
>> +	struct memcg_shrinker_map *map;
>> +	int nid;
>> +
>> +	if (memcg == root_mem_cgroup)
>> +		return;
>
> Nit: there's mem_cgroup_is_root() helper.
>
>> +
>> +	mutex_lock(&shrinkers_nr_max_mutex);
>
> Why do you need to take the mutex here? You don't access shrinker map
> capacity here AFAICS.

Allocation of the shrinker maps happens in css_online() now, and this
has its price. memcg_expand_one_shrinker_map() must be able to
distinguish between memcgs with allocated maps, memcgs whose maps are
not allocated yet, and memcgs whose css_online() has failed or is
failing. So the mutex is used for synchronization with expansion: see
the "old_size && !old" check in memcg_expand_one_shrinker_map().

>> +	for_each_node(nid) {
>> +		pn = mem_cgroup_nodeinfo(memcg, nid);
>> +		map = rcu_dereference_protected(pn->shrinker_map, true);
>> +		if (map)
>> +			call_rcu(&map->rcu, memcg_free_shrinker_map_rcu);
>> +		rcu_assign_pointer(pn->shrinker_map, NULL);
>> +	}
>> +	mutex_unlock(&shrinkers_nr_max_mutex);
>> +}
>> +
>> +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
>> +{
>> +	int ret, size = memcg_shrinker_nr_max/BITS_PER_BYTE;
>> +
>> +	if (memcg == root_mem_cgroup)
>> +		return 0;
>
> Nit: mem_cgroup_is_root().
>
>> +
>> +	mutex_lock(&shrinkers_nr_max_mutex);
>
>> +	ret = memcg_expand_one_shrinker_map(memcg, size, 0);
>
> I don't think it's worth reusing the function designed for reallocating
> shrinker maps for initial allocation. Please just fold the code here -
> it will make both 'alloc' and 'expand' easier to follow IMHO.

These functions would share 80% of their code. What is the reason to
duplicate the same functionality? Two functions are more difficult to
maintain, and everywhere in the kernel we try to avoid that, IMHO.
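
To make the shared path concrete, here is the per-node loop of
memcg_expand_one_shrinker_map() from above, condensed, with the three
cases that the mutex and the "old_size && !old" check have to tell
apart spelled out as comments (pure annotation of the patch, no new
logic):

	for_each_node(nid) {
		old = rcu_dereference_protected(MEMCG_SHRINKER_MAP(memcg, nid), true);
		/*
		 * Case 1 -- old_size != 0 && old == NULL: this memcg has
		 * not passed css_online() yet, or its css_online() failed,
		 * so it has no maps to expand; bail out for it.
		 */
		if (old_size && !old)
			return 0;

		new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
		if (!new)
			return -ENOMEM;

		/*
		 * Case 2 -- old_size == 0: initial allocation on behalf of
		 * memcg_alloc_shrinker_maps(); the first memset is a no-op
		 * and the whole new map starts out clear.
		 *
		 * Case 3 -- old != NULL: a real expansion; all old bits are
		 * conservatively set (a spuriously set bit only costs an
		 * extra do_shrink_slab() call later in the series), the new
		 * tail is cleared, and the new map is published under RCU.
		 */
		memset(new->map, (int)0xff, old_size);
		memset((void *)new->map + old_size, 0, size - old_size);
		...
	}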
>> +	mutex_unlock(&shrinkers_nr_max_mutex);
>> +
>> +	if (ret)
>> +		memcg_free_shrinker_maps(memcg);
>> +
>> +	return ret;
>> +}
>> +
>
>> +static struct idr mem_cgroup_idr;
>
> Stray change.
>
>> +
>> +int memcg_expand_shrinker_maps(int old_nr, int nr)
>> +{
>> +	int size, old_size, ret = 0;
>> +	struct mem_cgroup *memcg;
>> +
>> +	old_size = old_nr / BITS_PER_BYTE;
>> +	size = nr / BITS_PER_BYTE;
>> +
>> +	mutex_lock(&shrinkers_nr_max_mutex);
>> +
>
>> +	if (!root_mem_cgroup)
>> +		goto unlock;
>
> This wants a comment.

Which comment does it want? Something like "root_mem_cgroup is not
initialized yet, so it does not have child mem cgroups"?

>> +
>> +	for_each_mem_cgroup(memcg) {
>> +		if (memcg == root_mem_cgroup)
>
> Nit: mem_cgroup_is_root().
>
>> +			continue;
>> +		ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
>> +		if (ret)
>> +			goto unlock;
>> +	}
>> +unlock:
>> +	mutex_unlock(&shrinkers_nr_max_mutex);
>> +	return ret;
>> +}
>> +#else /* CONFIG_MEMCG_SHRINKER */
>> +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
>> +{
>> +	return 0;
>> +}
>> +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
>> +#endif /* CONFIG_MEMCG_SHRINKER */
>> +
>>  /**
>>   * mem_cgroup_css_from_page - css of the memcg associated with a page
>>   * @page: page of interest
>> @@ -4471,6 +4581,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
>>  {
>>  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>>
>> +	if (memcg_alloc_shrinker_maps(memcg)) {
>> +		mem_cgroup_id_remove(memcg);
>> +		return -ENOMEM;
>> +	}
>> +
>>  	/* Online state pins memcg ID, memcg ID pins CSS */
>>  	atomic_set(&memcg->id.ref, 1);
>>  	css_get(css);
>> @@ -4522,6 +4637,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
>>  	vmpressure_cleanup(&memcg->vmpressure);
>>  	cancel_work_sync(&memcg->high_work);
>>  	mem_cgroup_remove_from_trees(memcg);
>> +	memcg_free_shrinker_maps(memcg);
>>  	memcg_free_kmem(memcg);
>>  	mem_cgroup_free(memcg);
>>  }
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index d691beac1048..d8a2870710e0 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -174,12 +174,26 @@ static DEFINE_IDR(shrinker_idr);
>>
>>  static int prealloc_memcg_shrinker(struct shrinker *shrinker)
>>  {
>> -	int id, ret;
>> +	int id, nr, ret;
>>
>>  	down_write(&shrinker_rwsem);
>>  	ret = id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
>>  	if (ret < 0)
>>  		goto unlock;
>> +
>> +	if (id >= memcg_shrinker_nr_max) {
>> +		nr = memcg_shrinker_nr_max * 2;
>> +		if (nr == 0)
>> +			nr = BITS_PER_BYTE;
>> +		BUG_ON(id >= nr);
>
> The logic defining shrinker map capacity growth should be private to
> memcontrol.c IMHO.
>
>> +
>> +		if (memcg_expand_shrinker_maps(memcg_shrinker_nr_max, nr)) {
>> +			idr_remove(&shrinker_idr, id);
>> +			goto unlock;
>> +		}
>> +		memcg_shrinker_nr_max = nr;
>> +	}
>> +
>>  	shrinker->id = id;
>>  	ret = 0;
>> unlock:
>>
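
For reference, the growth policy in the hunk above doubles the capacity,
starting from BITS_PER_BYTE, so registering N shrinker ids triggers only
O(log N) whole-map reallocations. A tiny userspace model of just that
sequence (plain C for illustration, not kernel code; the numbers are
only an example):

	#include <stdio.h>

	int main(void)
	{
		int nr_max = 0, grows = 0, id;

		for (id = 0; id < 10000; id++) {
			if (id >= nr_max) {
				/* same policy as prealloc_memcg_shrinker() above */
				nr_max = nr_max ? nr_max * 2 : 8; /* 8 == BITS_PER_BYTE */
				grows++;
			}
		}
		printf("10000 ids -> capacity %d after %d reallocations\n",
		       nr_max, grows);	/* capacity 16384 after 12 reallocations */
		return 0;
	}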