On 22.04.2018 20:59, Vladimir Davydov wrote: > On Tue, Apr 17, 2018 at 09:53:31PM +0300, Kirill Tkhai wrote: >> Imagine a big node with many cpus, memory cgroups and containers. >> Let we have 200 containers, every container has 10 mounts, >> and 10 cgroups. All container tasks don't touch foreign >> containers mounts. If there is intensive pages write, >> and global reclaim happens, a writing task has to iterate >> over all memcgs to shrink slab, before it's able to go >> to shrink_page_list(). >> >> Iteration over all the memcg slabs is very expensive: >> the task has to visit 200 * 10 = 2000 shrinkers >> for every memcg, and since there are 2000 memcgs, >> the total calls are 2000 * 2000 = 4000000. >> >> So, the shrinker makes 4 million do_shrink_slab() calls >> just to try to isolate SWAP_CLUSTER_MAX pages in one >> of the actively writing memcg via shrink_page_list(). >> I've observed a node spending almost 100% in kernel, >> making useless iteration over already shrinked slab. >> >> This patch adds bitmap of memcg-aware shrinkers to memcg. >> The size of the bitmap depends on bitmap_nr_ids, and during >> memcg life it's maintained to be enough to fit bitmap_nr_ids >> shrinkers. Every bit in the map is related to corresponding >> shrinker id. >> >> Next patches will maintain set bit only for really charged >> memcg. This will allow shrink_slab() to increase its >> performance in significant way. See the last patch for >> the numbers. >> >> Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx> >> --- >> include/linux/memcontrol.h | 15 +++++ >> mm/memcontrol.c | 125 ++++++++++++++++++++++++++++++++++++++++++++ >> mm/vmscan.c | 21 +++++++ >> 3 files changed, 160 insertions(+), 1 deletion(-) >> >> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h >> index af9eed2e3e04..2ec96ab46b01 100644 >> --- a/include/linux/memcontrol.h >> +++ b/include/linux/memcontrol.h >> @@ -115,6 +115,7 @@ struct mem_cgroup_per_node { >> unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; >> >> struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; > >> + struct memcg_shrinker_map __rcu *shrinkers_map; > > shrinker_map > >> >> struct rb_node tree_node; /* RB tree node */ >> unsigned long usage_in_excess;/* Set to the value by which */ >> @@ -153,6 +154,11 @@ struct mem_cgroup_thresholds { >> struct mem_cgroup_threshold_ary *spare; >> }; >> >> +struct memcg_shrinker_map { >> + struct rcu_head rcu; >> + unsigned long map[0]; >> +}; >> + > > This struct should be defined before struct mem_cgroup_per_node. > > A comment explaining what this map is for and what it maps would be > really helpful. > >> enum memcg_kmem_state { >> KMEM_NONE, >> KMEM_ALLOCATED, >> @@ -1200,6 +1206,8 @@ extern int memcg_nr_cache_ids; >> void memcg_get_cache_ids(void); >> void memcg_put_cache_ids(void); >> >> +extern int shrinkers_max_nr; >> + > > memcg_shrinker_id_max? memcg_shrinker_id_max sounds like an includive value, doesn't it? While shrinker->id < shrinker_max_nr. Let's better use memcg_shrinker_nr_max. >> /* >> * Helper macro to loop through all memcg-specific caches. Callers must still >> * check if the cache is valid (it is either valid or NULL). >> @@ -1223,6 +1231,13 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) >> return memcg ? memcg->kmemcg_id : -1; >> } >> >> +extern struct memcg_shrinker_map __rcu *root_shrinkers_map[]; >> +#define SHRINKERS_MAP(memcg, nid) \ >> + (memcg == root_mem_cgroup || !memcg ? \ >> + root_shrinkers_map[nid] : memcg->nodeinfo[nid]->shrinkers_map) >> + >> +extern int expand_shrinker_maps(int old_id, int id); >> + > > I'm strongly against using a special map for the root cgroup. I'd prefer > to disable this optimization for the root cgroup altogether and simply > iterate over all registered shrinkers when it comes to global reclaim. > It shouldn't degrade performance as the root cgroup is singular. > >> #else >> #define for_each_memcg_cache_index(_idx) \ >> for (; NULL; ) >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c >> index 2959a454a072..562dfb1be9ef 100644 >> --- a/mm/memcontrol.c >> +++ b/mm/memcontrol.c >> @@ -305,6 +305,113 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); >> >> struct workqueue_struct *memcg_kmem_cache_wq; >> >> +static DECLARE_RWSEM(shrinkers_max_nr_rwsem); > > Why rwsem? AFAIU you want to synchronize between two code paths: when a > memory cgroup is allocated (or switched online?) to allocate a shrinker > map for it and in the function growing shrinker maps for all cgroups. > A mutex should be enough for this. > >> +struct memcg_shrinker_map __rcu *root_shrinkers_map[MAX_NUMNODES] = { 0 }; >> + >> +static void get_shrinkers_max_nr(void) >> +{ >> + down_read(&shrinkers_max_nr_rwsem); >> +} >> + >> +static void put_shrinkers_max_nr(void) >> +{ >> + up_read(&shrinkers_max_nr_rwsem); >> +} >> + >> +static void kvfree_map_rcu(struct rcu_head *head) > > free_shrinker_map_rcu > >> +{ >> + kvfree(container_of(head, struct memcg_shrinker_map, rcu)); >> +} >> + >> +static int memcg_expand_maps(struct mem_cgroup *memcg, int nid, > > Bad name: the function reallocates just one map, not many maps; the name > doesn't indicate that it is about shrinkers; the name is inconsistent > with alloc_shrinker_maps and free_shrinker_maps. Please fix. > >> + int size, int old_size) >> +{ >> + struct memcg_shrinker_map *new, *old; >> + >> + lockdep_assert_held(&shrinkers_max_nr_rwsem); >> + >> + new = kvmalloc(sizeof(*new) + size, GFP_KERNEL); >> + if (!new) >> + return -ENOMEM; >> + >> + /* Set all old bits, clear all new bits */ >> + memset(new->map, (int)0xff, old_size); >> + memset((void *)new->map + old_size, 0, size - old_size); >> + >> + old = rcu_dereference_protected(SHRINKERS_MAP(memcg, nid), true); >> + >> + if (memcg) >> + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinkers_map, new); >> + else >> + rcu_assign_pointer(root_shrinkers_map[nid], new); >> + >> + if (old) >> + call_rcu(&old->rcu, kvfree_map_rcu); >> + >> + return 0; >> +} >> + >> +static int alloc_shrinker_maps(struct mem_cgroup *memcg, int nid) >> +{ >> + /* Skip allocation, when we're initializing root_mem_cgroup */ >> + if (!root_mem_cgroup) >> + return 0; >> + >> + return memcg_expand_maps(memcg, nid, shrinkers_max_nr/BITS_PER_BYTE, 0); >> +} >> + >> +static void free_shrinker_maps(struct mem_cgroup *memcg, >> + struct mem_cgroup_per_node *pn) >> +{ >> + struct memcg_shrinker_map *map; >> + >> + if (memcg == root_mem_cgroup) >> + return; >> + >> + /* IDR unhashed long ago, and expand_shrinker_maps can't race with us */ >> + map = rcu_dereference_protected(pn->shrinkers_map, true); >> + kvfree_map_rcu(&map->rcu); >> +} >> + >> +static struct idr mem_cgroup_idr; >> + >> +int expand_shrinker_maps(int old_nr, int nr) >> +{ >> + int id, size, old_size, node, ret; >> + struct mem_cgroup *memcg; >> + >> + old_size = old_nr / BITS_PER_BYTE; >> + size = nr / BITS_PER_BYTE; >> + >> + down_write(&shrinkers_max_nr_rwsem); >> + for_each_node(node) { > > Iterating over cgroups first, numa nodes second seems like a better idea > to me. I think you should fold for_each_node in memcg_expand_maps. > >> + idr_for_each_entry(&mem_cgroup_idr, memcg, id) { > > Iterating over mem_cgroup_idr looks strange. Why don't you use > for_each_mem_cgroup? We want to allocate shrinkers maps in mem_cgroup_css_alloc(), since mem_cgroup_css_online() mustn't fail (it's a requirement of currently existing design of memcg_cgroup::id). A new memcg is added to parent's list between two of these calls: css_create() ss->css_alloc() list_add_tail_rcu(&css->sibling, &parent_css->children) ss->css_online() for_each_mem_cgroup() does not see allocated, but not linked children. >> + if (id == 1) >> + memcg = NULL; >> + ret = memcg_expand_maps(memcg, node, size, old_size); >> + if (ret) >> + goto unlock; >> + } >> + >> + /* root_mem_cgroup is not initialized yet */ >> + if (id == 0) >> + ret = memcg_expand_maps(NULL, node, size, old_size); >> + } >> +unlock: >> + up_write(&shrinkers_max_nr_rwsem); >> + return ret; >> +} >> +#else /* CONFIG_SLOB */ >> +static void get_shrinkers_max_nr(void) { } >> +static void put_shrinkers_max_nr(void) { } >> + >> +static int alloc_shrinker_maps(struct mem_cgroup *memcg, int nid) >> +{ >> + return 0; >> +} >> +static void free_shrinker_maps(struct mem_cgroup *memcg, >> + struct mem_cgroup_per_node *pn) { } >> + >> #endif /* !CONFIG_SLOB */ >> >> /** >> @@ -3002,6 +3109,8 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, >> } >> >> #ifndef CONFIG_SLOB >> +int shrinkers_max_nr; >> + >> static int memcg_online_kmem(struct mem_cgroup *memcg) >> { >> int memcg_id; >> @@ -4266,7 +4375,10 @@ static DEFINE_IDR(mem_cgroup_idr); >> static void mem_cgroup_id_remove(struct mem_cgroup *memcg) >> { >> if (memcg->id.id > 0) { >> + /* Removing IDR must be visible for expand_shrinker_maps() */ >> + get_shrinkers_max_nr(); >> idr_remove(&mem_cgroup_idr, memcg->id.id); >> + put_shrinkers_max_nr(); >> memcg->id.id = 0; >> } >> } >> @@ -4333,12 +4445,17 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) >> if (!pn->lruvec_stat_cpu) >> goto err_pcpu; >> >> + if (alloc_shrinker_maps(memcg, node)) >> + goto err_maps; >> + >> lruvec_init(&pn->lruvec); >> pn->usage_in_excess = 0; >> pn->on_tree = false; >> pn->memcg = memcg; >> return 0; >> >> +err_maps: >> + free_percpu(pn->lruvec_stat_cpu); >> err_pcpu: >> memcg->nodeinfo[node] = NULL; >> kfree(pn); >> @@ -4352,6 +4469,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) >> if (!pn) >> return; >> >> + free_shrinker_maps(memcg, pn); >> free_percpu(pn->lruvec_stat_cpu); >> kfree(pn); >> } >> @@ -4407,13 +4525,18 @@ static struct mem_cgroup *mem_cgroup_alloc(void) >> #ifdef CONFIG_CGROUP_WRITEBACK >> INIT_LIST_HEAD(&memcg->cgwb_list); >> #endif >> + >> + get_shrinkers_max_nr(); >> for_each_node(node) >> - if (alloc_mem_cgroup_per_node_info(memcg, node)) >> + if (alloc_mem_cgroup_per_node_info(memcg, node)) { >> + put_shrinkers_max_nr(); >> goto fail; >> + } >> >> memcg->id.id = idr_alloc(&mem_cgroup_idr, memcg, >> 1, MEM_CGROUP_ID_MAX, >> GFP_KERNEL); >> + put_shrinkers_max_nr(); >> if (memcg->id.id < 0) >> goto fail; >> >> diff --git a/mm/vmscan.c b/mm/vmscan.c >> index 4f02fe83537e..f63eb5596c35 100644 >> --- a/mm/vmscan.c >> +++ b/mm/vmscan.c >> @@ -172,6 +172,22 @@ static DECLARE_RWSEM(shrinker_rwsem); >> #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) >> static DEFINE_IDR(shrinkers_id_idr); >> >> +static int expand_shrinker_id(int id) >> +{ >> + if (likely(id < shrinkers_max_nr)) >> + return 0; >> + >> + id = shrinkers_max_nr * 2; >> + if (id == 0) >> + id = BITS_PER_BYTE; >> + >> + if (expand_shrinker_maps(shrinkers_max_nr, id)) >> + return -ENOMEM; >> + >> + shrinkers_max_nr = id; >> + return 0; >> +} >> + > > I think this function should live in memcontrol.c and shrinkers_max_nr > should be private to memcontrol.c. > >> static int add_memcg_shrinker(struct shrinker *shrinker) >> { >> int id, ret; >> @@ -180,6 +196,11 @@ static int add_memcg_shrinker(struct shrinker *shrinker) >> ret = id = idr_alloc(&shrinkers_id_idr, shrinker, 0, 0, GFP_KERNEL); >> if (ret < 0) >> goto unlock; >> + ret = expand_shrinker_id(id); >> + if (ret < 0) { >> + idr_remove(&shrinkers_id_idr, id); >> + goto unlock; >> + } >> shrinker->id = id; >> ret = 0; >> unlock: >>