(2014/09/12 0:41), Vladimir Davydov wrote: > In the next patch I need a quick way to get a value of > MEM_CGROUP_STAT_RSS. The current procedure (mem_cgroup_read_stat) is > slow (iterates over all cpus) and may sleep (uses get/put_online_cpus), > so it's a no-go. > > This patch converts memory cgroup statistics to use percpu_counter so > that percpu_counter_read will do the trick. > > Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx> I have no strong objections but you need performance comparison to go with this. I thought percpu counter was messy to be used for "array". I can't understand why you started from fixing future performance problem before merging new feature. Thanks, -Kame > --- > mm/memcontrol.c | 217 ++++++++++++++++++------------------------------------- > 1 file changed, 69 insertions(+), 148 deletions(-) > > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 085dc6d2f876..7e8d65e0608a 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -136,9 +136,7 @@ enum mem_cgroup_events_target { > #define SOFTLIMIT_EVENTS_TARGET 1024 > #define NUMAINFO_EVENTS_TARGET 1024 > > -struct mem_cgroup_stat_cpu { > - long count[MEM_CGROUP_STAT_NSTATS]; > - unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; > +struct mem_cgroup_ratelimit_state { > unsigned long nr_page_events; > unsigned long targets[MEM_CGROUP_NTARGETS]; > }; > @@ -341,16 +339,10 @@ struct mem_cgroup { > atomic_t moving_account; > /* taken only while moving_account > 0 */ > spinlock_t move_lock; > - /* > - * percpu counter. > - */ > - struct mem_cgroup_stat_cpu __percpu *stat; > - /* > - * used when a cpu is offlined or other synchronizations > - * See mem_cgroup_read_stat(). > - */ > - struct mem_cgroup_stat_cpu nocpu_base; > - spinlock_t pcp_counter_lock; > + > + struct percpu_counter stat[MEM_CGROUP_STAT_NSTATS]; > + struct percpu_counter events[MEM_CGROUP_EVENTS_NSTATS]; > + struct mem_cgroup_ratelimit_state __percpu *ratelimit; > > atomic_t dead_count; > #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) > @@ -849,59 +841,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) > return mz; > } > > -/* > - * Implementation Note: reading percpu statistics for memcg. > - * > - * Both of vmstat[] and percpu_counter has threshold and do periodic > - * synchronization to implement "quick" read. There are trade-off between > - * reading cost and precision of value. Then, we may have a chance to implement > - * a periodic synchronizion of counter in memcg's counter. > - * > - * But this _read() function is used for user interface now. The user accounts > - * memory usage by memory cgroup and he _always_ requires exact value because > - * he accounts memory. Even if we provide quick-and-fuzzy read, we always > - * have to visit all online cpus and make sum. So, for now, unnecessary > - * synchronization is not implemented. (just implemented for cpu hotplug) > - * > - * If there are kernel internal actions which can make use of some not-exact > - * value, and reading all cpu value can be performance bottleneck in some > - * common workload, threashold and synchonization as vmstat[] should be > - * implemented. > - */ > static long mem_cgroup_read_stat(struct mem_cgroup *memcg, > enum mem_cgroup_stat_index idx) > { > - long val = 0; > - int cpu; > - > - get_online_cpus(); > - for_each_online_cpu(cpu) > - val += per_cpu(memcg->stat->count[idx], cpu); > -#ifdef CONFIG_HOTPLUG_CPU > - spin_lock(&memcg->pcp_counter_lock); > - val += memcg->nocpu_base.count[idx]; > - spin_unlock(&memcg->pcp_counter_lock); > -#endif > - put_online_cpus(); > - return val; > + return percpu_counter_read(&memcg->stat[idx]); > } > > static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, > enum mem_cgroup_events_index idx) > { > - unsigned long val = 0; > - int cpu; > - > - get_online_cpus(); > - for_each_online_cpu(cpu) > - val += per_cpu(memcg->stat->events[idx], cpu); > -#ifdef CONFIG_HOTPLUG_CPU > - spin_lock(&memcg->pcp_counter_lock); > - val += memcg->nocpu_base.events[idx]; > - spin_unlock(&memcg->pcp_counter_lock); > -#endif > - put_online_cpus(); > - return val; > + return percpu_counter_read(&memcg->events[idx]); > } > > static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, > @@ -913,25 +862,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, > * counted as CACHE even if it's on ANON LRU. > */ > if (PageAnon(page)) > - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], > + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS], > nr_pages); > else > - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], > + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_CACHE], > nr_pages); > > if (PageTransHuge(page)) > - __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], > + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], > nr_pages); > > /* pagein of a big page is an event. So, ignore page size */ > if (nr_pages > 0) > - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); > - else { > - __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); > - nr_pages = -nr_pages; /* for event */ > - } > - > - __this_cpu_add(memcg->stat->nr_page_events, nr_pages); > + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGIN]); > + else > + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT]); > } > > unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) > @@ -981,8 +926,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, > { > unsigned long val, next; > > - val = __this_cpu_read(memcg->stat->nr_page_events); > - next = __this_cpu_read(memcg->stat->targets[target]); > + val = __this_cpu_read(memcg->ratelimit->nr_page_events); > + next = __this_cpu_read(memcg->ratelimit->targets[target]); > /* from time_after() in jiffies.h */ > if ((long)next - (long)val < 0) { > switch (target) { > @@ -998,7 +943,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, > default: > break; > } > - __this_cpu_write(memcg->stat->targets[target], next); > + __this_cpu_write(memcg->ratelimit->targets[target], next); > return true; > } > return false; > @@ -1006,10 +951,15 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, > > /* > * Check events in order. > - * > */ > -static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) > +static void memcg_check_events(struct mem_cgroup *memcg, struct page *page, > + unsigned long nr_pages) > { > + unsigned long flags; > + > + local_irq_save(flags); > + __this_cpu_add(memcg->ratelimit->nr_page_events, nr_pages); > + > /* threshold event is triggered in finer grain than soft limit */ > if (unlikely(mem_cgroup_event_ratelimit(memcg, > MEM_CGROUP_TARGET_THRESH))) { > @@ -1030,6 +980,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) > atomic_inc(&memcg->numainfo_events); > #endif > } > + local_irq_restore(flags); > } > > struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) > @@ -1294,10 +1245,10 @@ void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) > > switch (idx) { > case PGFAULT: > - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); > + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGFAULT]); > break; > case PGMAJFAULT: > - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); > + percpu_counter_inc(&memcg->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); > break; > default: > BUG(); > @@ -2306,7 +2257,7 @@ void mem_cgroup_update_page_stat(struct page *page, > if (unlikely(!memcg || !PageCgroupUsed(pc))) > return; > > - this_cpu_add(memcg->stat->count[idx], val); > + percpu_counter_add(&memcg->stat[idx], val); > } > > /* > @@ -2476,37 +2427,12 @@ static void drain_all_stock_sync(struct mem_cgroup *root_memcg) > mutex_unlock(&percpu_charge_mutex); > } > > -/* > - * This function drains percpu counter value from DEAD cpu and > - * move it to local cpu. Note that this function can be preempted. > - */ > -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) > -{ > - int i; > - > - spin_lock(&memcg->pcp_counter_lock); > - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { > - long x = per_cpu(memcg->stat->count[i], cpu); > - > - per_cpu(memcg->stat->count[i], cpu) = 0; > - memcg->nocpu_base.count[i] += x; > - } > - for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { > - unsigned long x = per_cpu(memcg->stat->events[i], cpu); > - > - per_cpu(memcg->stat->events[i], cpu) = 0; > - memcg->nocpu_base.events[i] += x; > - } > - spin_unlock(&memcg->pcp_counter_lock); > -} > - > static int memcg_cpu_hotplug_callback(struct notifier_block *nb, > unsigned long action, > void *hcpu) > { > int cpu = (unsigned long)hcpu; > struct memcg_stock_pcp *stock; > - struct mem_cgroup *iter; > > if (action == CPU_ONLINE) > return NOTIFY_OK; > @@ -2514,9 +2440,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, > if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) > return NOTIFY_OK; > > - for_each_mem_cgroup(iter) > - mem_cgroup_drain_pcp_counter(iter, cpu); > - > stock = &per_cpu(memcg_stock, cpu); > drain_stock(stock); > return NOTIFY_OK; > @@ -3419,8 +3342,8 @@ void mem_cgroup_split_huge_fixup(struct page *head) > pc->mem_cgroup = memcg; > pc->flags = head_pc->flags; > } > - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], > - HPAGE_PMD_NR); > + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], > + HPAGE_PMD_NR); > } > #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ > > @@ -3475,17 +3398,17 @@ static int mem_cgroup_move_account(struct page *page, > move_lock_mem_cgroup(from, &flags); > > if (!PageAnon(page) && page_mapped(page)) { > - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], > - nr_pages); > - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], > - nr_pages); > + percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_FILE_MAPPED], > + nr_pages); > + percpu_counter_add(&to->stat[MEM_CGROUP_STAT_FILE_MAPPED], > + nr_pages); > } > > if (PageWriteback(page)) { > - __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK], > - nr_pages); > - __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK], > - nr_pages); > + percpu_counter_sub(&from->stat[MEM_CGROUP_STAT_WRITEBACK], > + nr_pages); > + percpu_counter_add(&to->stat[MEM_CGROUP_STAT_WRITEBACK], > + nr_pages); > } > > /* > @@ -3499,12 +3422,10 @@ static int mem_cgroup_move_account(struct page *page, > move_unlock_mem_cgroup(from, &flags); > ret = 0; > > - local_irq_disable(); > mem_cgroup_charge_statistics(to, page, nr_pages); > - memcg_check_events(to, page); > + memcg_check_events(to, page, nr_pages); > mem_cgroup_charge_statistics(from, page, -nr_pages); > - memcg_check_events(from, page); > - local_irq_enable(); > + memcg_check_events(from, page, nr_pages); > out_unlock: > unlock_page(page); > out: > @@ -3582,7 +3503,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, > bool charge) > { > int val = (charge) ? 1 : -1; > - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); > + percpu_counter_add(&memcg->stat[MEM_CGROUP_STAT_SWAP], val); > } > > /** > @@ -5413,25 +5334,11 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) > > static struct mem_cgroup *mem_cgroup_alloc(void) > { > - struct mem_cgroup *memcg; > size_t size; > > size = sizeof(struct mem_cgroup); > size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); > - > - memcg = kzalloc(size, GFP_KERNEL); > - if (!memcg) > - return NULL; > - > - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); > - if (!memcg->stat) > - goto out_free; > - spin_lock_init(&memcg->pcp_counter_lock); > - return memcg; > - > -out_free: > - kfree(memcg); > - return NULL; > + return kzalloc(size, GFP_KERNEL); > } > > /* > @@ -5448,13 +5355,20 @@ out_free: > static void __mem_cgroup_free(struct mem_cgroup *memcg) > { > int node; > + int i; > > mem_cgroup_remove_from_trees(memcg); > > for_each_node(node) > free_mem_cgroup_per_zone_info(memcg, node); > > - free_percpu(memcg->stat); > + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) > + percpu_counter_destroy(&memcg->stat[i]); > + > + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) > + percpu_counter_destroy(&memcg->events[i]); > + > + free_percpu(memcg->ratelimit); > > /* > * We need to make sure that (at least for now), the jump label > @@ -5511,11 +5425,24 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) > struct mem_cgroup *memcg; > long error = -ENOMEM; > int node; > + int i; > > memcg = mem_cgroup_alloc(); > if (!memcg) > return ERR_PTR(error); > > + for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) > + if (percpu_counter_init(&memcg->stat[i], 0, GFP_KERNEL)) > + goto free_out; > + > + for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) > + if (percpu_counter_init(&memcg->events[i], 0, GFP_KERNEL)) > + goto free_out; > + > + memcg->ratelimit = alloc_percpu(struct mem_cgroup_ratelimit_state); > + if (!memcg->ratelimit) > + goto free_out; > + > for_each_node(node) > if (alloc_mem_cgroup_per_zone_info(memcg, node)) > goto free_out; > @@ -6507,10 +6434,8 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, > VM_BUG_ON_PAGE(!PageTransHuge(page), page); > } > > - local_irq_disable(); > mem_cgroup_charge_statistics(memcg, page, nr_pages); > - memcg_check_events(memcg, page); > - local_irq_enable(); > + memcg_check_events(memcg, page, nr_pages); > > if (do_swap_account && PageSwapCache(page)) { > swp_entry_t entry = { .val = page_private(page) }; > @@ -6557,8 +6482,6 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, > unsigned long nr_anon, unsigned long nr_file, > unsigned long nr_huge, struct page *dummy_page) > { > - unsigned long flags; > - > if (!mem_cgroup_is_root(memcg)) { > if (nr_mem) > res_counter_uncharge(&memcg->res, > @@ -6569,14 +6492,12 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, > memcg_oom_recover(memcg); > } > > - local_irq_save(flags); > - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); > - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); > - __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); > - __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); > - __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); > - memcg_check_events(memcg, dummy_page); > - local_irq_restore(flags); > + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS], nr_anon); > + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_CACHE], nr_file); > + percpu_counter_sub(&memcg->stat[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); > + percpu_counter_add(&memcg->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); > + > + memcg_check_events(memcg, dummy_page, nr_anon + nr_file); > } > > static void uncharge_list(struct list_head *page_list) > -- To unsubscribe from this list: send the line "unsubscribe cgroups" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html