The patch titled Subject: memcg: reduce memcg tree traversals for stats collection has been added to the -mm tree. Its filename is memcg-reduce-memcg-tree-traversals-for-stats-collection.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/memcg-reduce-memcg-tree-traversals-for-stats-collection.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/memcg-reduce-memcg-tree-traversals-for-stats-collection.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Shakeel Butt <shakeelb@xxxxxxxxxx> Subject: memcg: reduce memcg tree traversals for stats collection Currently cgroup-v1's memcg_stat_show traverses the memcg tree ~17 times to collect the stats while cgroup-v2's memory_stat_show traverses the memcg tree thrice. On a large machine, a couple thousand memcgs is very normal and if the churn is high and memcgs stick around during to several reasons, tens of thousands of nodes in memcg tree can exist. This patch has refactored and shared the stat collection code between cgroup-v1 and cgroup-v2 and has reduced the tree traversal to just one. I ran a simple benchmark which reads the root_mem_cgroup's stat file 1000 times in the presense of 2500 memcgs on cgroup-v1. The results are: Without the patch: $ time ./read-root-stat-1000-times real 0m1.663s user 0m0.000s sys 0m1.660s With the patch: $ time ./read-root-stat-1000-times real 0m0.468s user 0m0.000s sys 0m0.467s Link: http://lkml.kernel.org/r/20180724224635.143944-1-shakeelb@xxxxxxxxxx Signed-off-by: Shakeel Butt <shakeelb@xxxxxxxxxx> Acked-by: Michal Hocko <mhocko@xxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: Vladimir Davydov <vdavydov.dev@xxxxxxxxx> Cc: Greg Thelen <gthelen@xxxxxxxxxx> Cc: Bruce Merry <bmerry@xxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- mm/memcontrol.c | 152 ++++++++++++++++++++++------------------------ 1 file changed, 74 insertions(+), 78 deletions(-) --- a/mm/memcontrol.c~memcg-reduce-memcg-tree-traversals-for-stats-collection +++ a/mm/memcontrol.c @@ -2905,29 +2905,34 @@ static int mem_cgroup_hierarchy_write(st return retval; } -static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) -{ - struct mem_cgroup *iter; - int i; - - memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += memcg_page_state(iter, i); - } -} +struct accumulated_stats { + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long lru_pages[NR_LRU_LISTS]; + const unsigned int *stats_array; + const unsigned int *events_array; + int stats_size; + int events_size; +}; -static void tree_events(struct mem_cgroup *memcg, unsigned long *events) +static void accumulate_memcg_tree(struct mem_cgroup *memcg, + struct accumulated_stats *acc) { - struct mem_cgroup *iter; + struct mem_cgroup *mi; int i; - memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); - - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] += memcg_sum_events(iter, i); + for_each_mem_cgroup_tree(mi, memcg) { + for (i = 0; i < acc->stats_size; i++) + acc->stat[i] += memcg_page_state(mi, + acc->stats_array ? acc->stats_array[i] : i); + + for (i = 0; i < acc->events_size; i++) + acc->events[i] += memcg_sum_events(mi, + acc->events_array ? acc->events_array[i] : i); + + for (i = 0; i < NR_LRU_LISTS; i++) + acc->lru_pages[i] += + mem_cgroup_nr_lru_pages(mi, BIT(i)); } } @@ -3338,6 +3343,7 @@ static int memcg_stat_show(struct seq_fi unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; + struct accumulated_stats acc; BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3370,32 +3376,27 @@ static int memcg_stat_show(struct seq_fi seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { - unsigned long long val = 0; + memset(&acc, 0, sizeof(acc)); + acc.stats_size = ARRAY_SIZE(memcg1_stats); + acc.stats_array = memcg1_stats; + acc.events_size = ARRAY_SIZE(memcg1_events); + acc.events_array = memcg1_events; + accumulate_memcg_tree(memcg, &acc); + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_page_state(mi, memcg1_stats[i]) * - PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); - } - - for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { - unsigned long long val = 0; - - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_sum_events(mi, memcg1_events[i]); - seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], + (u64)acc.stat[i] * PAGE_SIZE); } - for (i = 0; i < NR_LRU_LISTS; i++) { - unsigned long long val = 0; + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], + (u64)acc.events[i]); - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -5487,8 +5488,7 @@ static int memory_events_show(struct seq static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + struct accumulated_stats acc; int i; /* @@ -5502,66 +5502,62 @@ static int memory_stat_show(struct seq_f * Current memory state: */ - tree_stat(memcg, stat); - tree_events(memcg, events); + memset(&acc, 0, sizeof(acc)); + acc.stats_size = MEMCG_NR_STAT; + acc.events_size = NR_VM_EVENT_ITEMS; + accumulate_memcg_tree(memcg, &acc); seq_printf(m, "anon %llu\n", - (u64)stat[MEMCG_RSS] * PAGE_SIZE); + (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)stat[MEMCG_CACHE] * PAGE_SIZE); + (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); + (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[NR_SLAB_RECLAIMABLE] + - stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + + acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "shmem %llu\n", - (u64)stat[NR_SHMEM] * PAGE_SIZE); + (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); + (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); + (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)stat[NR_WRITEBACK] * PAGE_SIZE); + (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); - for (i = 0; i < NR_LRU_LISTS; i++) { - struct mem_cgroup *mi; - unsigned long val = 0; - - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)); - seq_printf(m, "%s %llu\n", - mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", events[PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); + seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); - seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); - seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + - events[PGSCAN_DIRECT]); - seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + - events[PGSTEAL_DIRECT]); - seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); - seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); - seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); - seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + + acc.events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + + acc.events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); seq_printf(m, "workingset_refault %lu\n", - stat[WORKINGSET_REFAULT]); + acc.stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", - stat[WORKINGSET_ACTIVATE]); + acc.stat[WORKINGSET_ACTIVATE]); seq_printf(m, "workingset_nodereclaim %lu\n", - stat[WORKINGSET_NODERECLAIM]); + acc.stat[WORKINGSET_NODERECLAIM]); return 0; } _ Patches currently in -mm which might be from shakeelb@xxxxxxxxxx are kvm-mm-account-shadow-page-tables-to-kmemcg.patch slub-track-number-of-slabs-irrespective-of-config_slub_debug.patch fs-fsnotify-account-fsnotify-metadata-to-kmemcg.patch fs-fsnotify-account-fsnotify-metadata-to-kmemcg-fix.patch fs-mm-account-buffer_head-to-kmemcg.patch fs-mm-account-buffer_head-to-kmemcgpatchfix.patch memcg-reduce-memcg-tree-traversals-for-stats-collection.patch -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html