In the next patch I need a quick way to get a value of MEM_CGROUP_STAT_RSS. The current procedure (mem_cgroup_read_stat) is slow (iterates over all cpus) and may sleep (uses get/put_online_cpus), so it's a no-go. This patch converts memory cgroup statistics to use percpu_counter so that percpu_counter_read will do the trick. Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx> --- mm/memcontrol.c | 217 ++++++++++++++++++------------------------------------- 1 file changed, 69 insertions(+), 148 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 085dc6d2f876..7e8d65e0608a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -136,9 +136,7 @@ enum mem_cgroup_events_target { #define SOFTLIMIT_EVENTS_TARGET 1024 #define NUMAINFO_EVENTS_TARGET 1024 -struct mem_cgroup_stat_cpu { - long count[MEM_CGROUP_STAT_NSTATS]; - unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; +struct mem_cgroup_ratelimit_state { unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; @@ -341,16 +339,10 @@ struct mem_cgroup { atomic_t moving_account; /* taken only while moving_account > 0 */ spinlock_t move_lock; - /* - * percpu counter. - */ - struct mem_cgroup_stat_cpu __percpu *stat; - /* - * used when a cpu is offlined or other synchronizations - * See mem_cgroup_read_stat(). - */ - struct mem_cgroup_stat_cpu nocpu_base; - spinlock_t pcp_counter_lock; + + struct percpu_counter stat[MEM_CGROUP_STAT_NSTATS]; + struct percpu_counter events[MEM_CGROUP_EVENTS_NSTATS]; + struct mem_cgroup_ratelimit_state __percpu *ratelimit; atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) @@ -849,59 +841,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) return mz; } -/* - * Implementation Note: reading percpu statistics for memcg. - * - * Both of vmstat[] and percpu_counter has threshold and do periodic - * synchronization to implement "quick" read. There are trade-off between - * reading cost and precision of value. Then, we may have a chance to implement - * a periodic synchronizion of counter in memcg's counter. - * - * But this _read() function is used for user interface now. The user accounts - * memory usage by memory cgroup and he _always_ requires exact value because - * he accounts memory. Even if we provide quick-and-fuzzy read, we always - * have to visit all online cpus and make sum. So, for now, unnecessary - * synchronization is not implemented. (just implemented for cpu hotplug) - * - * If there are kernel internal actions which can make use of some not-exact - * value, and reading all cpu value can be performance bottleneck in some - * common workload, threashold and synchonization as vmstat[] should be - * implemented. - */ static long mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { - long val = 0; - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - val += per_cpu(memcg->stat->count[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.count[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - put_online_cpus(); - return val; + return percpu_counter_read(&memcg->stat[idx]); } static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { - unsigned long val = 0; - int cpu; - - get_online_cpus(); - for_each_online_cpu(cpu) - val += per_cpu(memcg->stat->events[idx], cpu); -#ifdef CONFIG_HOTPLUG_CPU - spin_lock(&memcg->pcp_counter_lock); - val += memcg->nocpu_base.events[idx]; - spin_unlock(&memcg->pcp_counter_lock); -#endif - put_online_cpus(); - return val; + return percpu_counter_read(&memcg->events[idx]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -913,25 +862,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, * counted as CACHE even if it's on ANON LRU. */ if (PageAnon(page)) - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS], nr_pages); else - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_CACHE], nr_pages); if (PageTransHuge(page)) - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], nr_pages); /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); - else { - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); - nr_pages = -nr_pages; /* for event */ - } - - __this_cpu_add(memcg->stat->nr_page_events, nr_pages); + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGIN]); + else + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT]); } unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -981,8 +926,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next; - val = __this_cpu_read(memcg->stat->nr_page_events); - next = __this_cpu_read(memcg->stat->targets[target]); + val = __this_cpu_read(memcg->ratelimit->nr_page_events); + next = __this_cpu_read(memcg->ratelimit->targets[target]); /* from time_after() in jiffies.h */ if ((long)next - (long)val < 0) { switch (target) { @@ -998,7 +943,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, default: break; } - __this_cpu_write(memcg->stat->targets[target], next); + __this_cpu_write(memcg->ratelimit->targets[target], next); return true; } return false; @@ -1006,10 +951,15 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, /* * Check events in order. - * */ -static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) +static void memcg_check_events(struct mem_cgroup *memcg, struct page *page, + unsigned long nr_pages) { + unsigned long flags; + + local_irq_save(flags); + __this_cpu_add(memcg->ratelimit->nr_page_events, nr_pages); + /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@ -1030,6 +980,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) atomic_inc(&memcg->numainfo_events); #endif } + local_irq_restore(flags); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -1294,10 +1245,10 @@ void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) switch (idx) { case PGFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGFAULT]); break; case PGMAJFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); break; default: BUG(); @@ -2306,7 +2257,7 @@ void mem_cgroup_update_page_stat(struct page *page, if (unlikely(!memcg || !PageCgroupUsed(pc))) return; - this_cpu_add(memcg->stat->count[idx], val); + percpu_counter_add(&memcg->stat[idx], val); } /* @@ -2476,37 +2427,12 @@ static void drain_all_stock_sync(struct mem_cgroup *root_memcg) mutex_unlock(&percpu_charge_mutex); } -/* - * This function drains percpu counter value from DEAD cpu and - * move it to local cpu. Note that this function can be preempted. - */ -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) -{ - int i; - - spin_lock(&memcg->pcp_counter_lock); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - long x = per_cpu(memcg->stat->count[i], cpu); - - per_cpu(memcg->stat->count[i], cpu) = 0; - memcg->nocpu_base.count[i] += x; - } - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { - unsigned long x = per_cpu(memcg->stat->events[i], cpu); - - per_cpu(memcg->stat->events[i], cpu) = 0; - memcg->nocpu_base.events[i] += x; - } - spin_unlock(&memcg->pcp_counter_lock); -} - static int memcg_cpu_hotplug_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { int cpu = (unsigned long)hcpu; struct memcg_stock_pcp *stock; - struct mem_cgroup *iter; if (action == CPU_ONLINE) return NOTIFY_OK; @@ -2514,9 +2440,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) return NOTIFY_OK; - for_each_mem_cgroup(iter) - mem_cgroup_drain_pcp_counter(iter, cpu); - stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); return NOTIFY_OK; @@ -3419,8 +3342,8 @@ void mem_cgroup_split_huge_fixup(struct page *head) pc->mem_cgroup = memcg; pc->flags = head_pc->flags; } - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], - HPAGE_PMD_NR); + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], + HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -3475,17 +3398,17 @@ static int mem_cgroup_move_account(struct page *page, move_lock_mem_cgroup(from, &flags); if (!PageAnon(page) && page_mapped(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], - nr_pages); + percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); + percpu_counter_add(&to->stat[MEM_CGROUP_STAT_FILE_MAPPED], + nr_pages); } if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], - nr_pages); + percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); + percpu_counter_add(&to->stat[MEM_CGROUP_STAT_WRITEBACK], + nr_pages); } /* @@ -3499,12 +3422,10 @@ static int mem_cgroup_move_account(struct page *page, move_unlock_mem_cgroup(from, &flags); ret = 0; - local_irq_disable(); mem_cgroup_charge_statistics(to, page, nr_pages); - memcg_check_events(to, page); + memcg_check_events(to, page, nr_pages); mem_cgroup_charge_statistics(from, page, -nr_pages); - memcg_check_events(from, page); - local_irq_enable(); + memcg_check_events(from, page, nr_pages); out_unlock: unlock_page(page); out: @@ -3582,7 +3503,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_SWAP], val); } /** @@ -5413,25 +5334,11 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) static struct mem_cgroup *mem_cgroup_alloc(void) { - struct mem_cgroup *memcg; size_t size; size = sizeof(struct mem_cgroup); size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); - - memcg = kzalloc(size, GFP_KERNEL); - if (!memcg) - return NULL; - - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) - goto out_free; - spin_lock_init(&memcg->pcp_counter_lock); - return memcg; - -out_free: - kfree(memcg); - return NULL; + return kzalloc(size, GFP_KERNEL); } /* @@ -5448,13 +5355,20 @@ out_free: static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + int i; mem_cgroup_remove_from_trees(memcg); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); - free_percpu(memcg->stat); + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) + percpu_counter_destroy(&memcg->stat[i]); + + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) + percpu_counter_destroy(&memcg->events[i]); + + free_percpu(memcg->ratelimit); /* * We need to make sure that (at least for now), the jump label @@ -5511,11 +5425,24 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) struct mem_cgroup *memcg; long error = -ENOMEM; int node; + int i; memcg = mem_cgroup_alloc(); if (!memcg) return ERR_PTR(error); + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) + if (percpu_counter_init(&memcg->stat[i], 0, GFP_KERNEL)) + goto free_out; + + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) + if (percpu_counter_init(&memcg->events[i], 0, GFP_KERNEL)) + goto free_out; + + memcg->ratelimit = alloc_percpu(struct mem_cgroup_ratelimit_state); + if (!memcg->ratelimit) + goto free_out; + for_each_node(node) if (alloc_mem_cgroup_per_zone_info(memcg, node)) goto free_out; @@ -6507,10 +6434,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, VM_BUG_ON_PAGE(!PageTransHuge(page), page); } - local_irq_disable(); mem_cgroup_charge_statistics(memcg, page, nr_pages); - memcg_check_events(memcg, page); - local_irq_enable(); + memcg_check_events(memcg, page, nr_pages); if (do_swap_account && PageSwapCache(page)) { swp_entry_t entry = { .val = page_private(page) }; @@ -6557,8 +6482,6 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_anon, unsigned long nr_file, unsigned long nr_huge, struct page *dummy_page) { - unsigned long flags; - if (!mem_cgroup_is_root(memcg)) { if (nr_mem) res_counter_uncharge(&memcg->res, @@ -6569,14 +6492,12 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, memcg_oom_recover(memcg); } - local_irq_save(flags); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); - __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); - __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); - memcg_check_events(memcg, dummy_page); - local_irq_restore(flags); + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS], nr_anon); + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_CACHE], nr_file); + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + percpu_counter_add(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + + memcg_check_events(memcg, dummy_page, nr_anon + nr_file); } static void uncharge_list(struct list_head *page_list) -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html