This patch is a trial at making the memory cgroup statistics NUMA-aware.

 * divide the per-cpu stat counters per node, so each counter is indexed
   by nid as well.
 * add nid information to page_cgroup.
 * because the init routine now has to call kmalloc(), early_init is set
   to 0.

This is just a trial at this stage; any comments are welcome. It works well
on my fake-NUMA system. I think we can add a "numastat" file based on this.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

 mm/memcontrol.c |   97 ++++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 71 insertions(+), 26 deletions(-)

Index: devel-2.6.23-mm1/mm/memcontrol.c
===================================================================
--- devel-2.6.23-mm1.orig/mm/memcontrol.c
+++ devel-2.6.23-mm1/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/spinlock.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
+#include <linux/vmalloc.h>
 
 #include <asm/uaccess.h>
 
@@ -59,12 +60,18 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_NSTATS,
 };
 
+#ifndef CONFIG_NUMA
 struct mem_cgroup_stat_cpu {
-	s64 count[MEM_CGROUP_STAT_NSTATS];
+	s64 count[1][MEM_CGROUP_STAT_NSTATS];
 } ____cacheline_aligned_in_smp;
+#else
+struct mem_cgroup_stat_cpu {
+	s64 count[MAX_NUMNODES][MEM_CGROUP_STAT_NSTATS];
+} ____cacheline_aligned_in_smp;
+#endif
 
 struct mem_cgroup_stat {
-	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+	struct mem_cgroup_stat_cpu *cpustat[NR_CPUS];
 };
 
 /*
@@ -72,28 +79,28 @@ struct mem_cgroup_stat {
  * MUST be called under preempt_disable().
  */
 static inline void __mem_cgroup_stat_add(struct mem_cgroup_stat *stat,
-		enum mem_cgroup_stat_index idx, int val)
+		enum mem_cgroup_stat_index idx, int nid, int val)
 {
 	int cpu = smp_processor_id();
 #ifdef CONFIG_PREEMPT
 	VM_BUG_ON(preempt_count() == 0);
 #endif
-	stat->cpustat[cpu].count[idx] += val;
+	stat->cpustat[cpu]->count[nid][idx] += val;
 }
 
 static inline void mem_cgroup_stat_inc(struct mem_cgroup_stat *stat,
-		enum mem_cgroup_stat_index idx)
+		enum mem_cgroup_stat_index idx, int nid)
 {
 	preempt_disable();
-	__mem_cgroup_stat_add(stat, idx, 1);
+	__mem_cgroup_stat_add(stat, idx, nid, 1);
 	preempt_enable();
 }
 
 static inline void mem_cgroup_stat_dec(struct mem_cgroup_stat *stat,
-		enum mem_cgroup_stat_index idx)
+		enum mem_cgroup_stat_index idx, int nid)
 {
 	preempt_disable();
-	__mem_cgroup_stat_add(stat, idx, -1);
+	__mem_cgroup_stat_add(stat, idx, nid, -1);
 	preempt_enable();
 }
 
@@ -149,6 +156,7 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
+	int nid;
 	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
 					/* mapped and cached states     */
 	int	 flags;
@@ -169,21 +177,23 @@ enum {
  * We have to modify several values at charge/uncharge..
  */
 static inline void
-mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, int charge)
+mem_cgroup_charge_statistics(struct mem_cgroup *mem, int nid,
+				int flags, int charge)
 {
 	int val = (charge)? 1 : -1;
 	struct mem_cgroup_stat *stat = &mem->stat;
 
 	preempt_disable();
 	if (flags & PCGF_PAGECACHE)
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_PAGECACHE, val);
+		__mem_cgroup_stat_add(stat,
+				MEM_CGROUP_STAT_PAGECACHE, nid, val);
 	else
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_RSS, val);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_RSS, nid, val);
 
 	if (flags & PCGF_ACTIVE)
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE, val);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE, nid, val);
 	else
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE, val);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE, nid, val);
 	preempt_enable();
 }
 
@@ -315,8 +325,10 @@ static void __mem_cgroup_move_lists(stru
 	if (moved) {
 		struct mem_cgroup_stat *stat = &mem->stat;
 		preempt_disable();
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE, moved);
-		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE, -moved);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE,
+					pc->nid, moved);
+		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE,
+					pc->nid, -moved);
 		preempt_enable();
 	}
 	if (active) {
@@ -493,10 +505,10 @@ retry:
 		bool is_atomic = gfp_mask & GFP_ATOMIC;
 		if (is_cache)
 			mem_cgroup_stat_inc(&mem->stat,
-					MEM_CGROUP_STAT_FAIL_CACHE);
+				MEM_CGROUP_STAT_FAIL_CACHE, page_to_nid(page));
 		else
 			mem_cgroup_stat_inc(&mem->stat,
-					MEM_CGROUP_STAT_FAIL_RSS);
+				MEM_CGROUP_STAT_FAIL_RSS, page_to_nid(page));
 		/*
 		 * We cannot reclaim under GFP_ATOMIC, fail the charge
 		 */
@@ -537,6 +549,7 @@ noreclaim:
 	atomic_set(&pc->ref_cnt, 1);
 	pc->mem_cgroup = mem;
 	pc->page = page;
+	pc->nid = page_to_nid(page);
 	if (is_cache)
 		pc->flags = PCGF_PAGECACHE | PCGF_ACTIVE;
 	else
@@ -554,7 +567,7 @@ noreclaim:
 	}
 
 	/* Update statistics vector */
-	mem_cgroup_charge_statistics(mem, pc->flags, true);
+	mem_cgroup_charge_statistics(mem, pc->nid, pc->flags, true);
 
 	spin_lock_irqsave(&mem->lru_lock, flags);
 	list_add(&pc->lru, &mem->active_list);
@@ -621,7 +634,8 @@ void mem_cgroup_uncharge(struct page_cgr
 		spin_lock_irqsave(&mem->lru_lock, flags);
 		list_del_init(&pc->lru);
 		spin_unlock_irqrestore(&mem->lru_lock, flags);
-		mem_cgroup_charge_statistics(mem, pc->flags, false);
+		mem_cgroup_charge_statistics(mem, pc->nid, pc->flags,
+						false);
 		kfree(pc);
 	}
 }
@@ -657,6 +671,7 @@ void mem_cgroup_end_migration(struct pag
 void mem_cgroup_page_migration(struct page *page, struct page *newpage)
 {
 	struct page_cgroup *pc;
+	struct mem_cgroup *mem;
 retry:
 	pc = page_get_page_cgroup(page);
 	if (!pc)
@@ -664,6 +679,11 @@ retry:
 	if (clear_page_cgroup(page, pc) != pc)
 		goto retry;
 	pc->page = newpage;
+	pc->nid = page_to_nid(newpage);
+	mem = pc->mem_cgroup;
+	mem_cgroup_charge_statistics(mem, page_to_nid(page), pc->flags, false);
+	mem_cgroup_charge_statistics(mem,
+			page_to_nid(newpage), pc->flags, true);
 	lock_page_cgroup(newpage);
 	page_assign_page_cgroup(newpage, pc);
 	unlock_page_cgroup(newpage);
@@ -697,7 +717,8 @@ retry:
 			css_put(&mem->css);
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			list_del_init(&pc->lru);
-			mem_cgroup_charge_statistics(mem, pc->flags, false);
+			mem_cgroup_charge_statistics(mem, pc->nid, pc->flags,
+							false);
 			kfree(pc);
 		} else	/* being uncharged ? ...do relax */
 			break;
@@ -872,13 +893,16 @@ static int mem_control_stat_show(struct
 	struct mem_cgroup_stat *stat = &mem_cont->stat;
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
+	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		unsigned int cpu;
+		int node;
 		s64 val;
 
 		val = 0;
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			val += stat->cpustat[cpu].count[i];
+		for_each_possible_cpu(cpu)
+			for_each_node_state(node, N_POSSIBLE)
+				val += stat->cpustat[cpu]->count[node][i];
+
 		val *= mem_cgroup_stat_desc[i].unit;
 		seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, val);
 	}
@@ -941,12 +965,14 @@ static struct cgroup_subsys_state *
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct mem_cgroup *mem;
+	int cpu;
 
 	if (unlikely((cont->parent) == NULL)) {
 		mem = &init_mem_cgroup;
 		init_mm.mem_cgroup = mem;
-	} else
+	} else {
 		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
+	}
 
 	if (mem == NULL)
 		return NULL;
@@ -956,6 +982,17 @@ mem_cgroup_create(struct cgroup_subsys *
 	INIT_LIST_HEAD(&mem->inactive_list);
 	spin_lock_init(&mem->lru_lock);
 	mem->control_type = MEM_CGROUP_TYPE_ALL;
+
+	for_each_possible_cpu(cpu) {
+		int nid = cpu_to_node(cpu);
+		struct mem_cgroup_stat_cpu *mcsc;
+		if (sizeof(*mcsc) < PAGE_SIZE)
+			mcsc = kmalloc_node(sizeof(*mcsc), GFP_KERNEL, nid);
+		else
+			mcsc = vmalloc_node(sizeof(*mcsc), nid);
+		memset(mcsc, 0, sizeof(*mcsc));
+		mem->stat.cpustat[cpu] = mcsc;
+	}
 	return &mem->css;
 }
 
@@ -969,7 +1006,15 @@ static void mem_cgroup_pre_destroy(struc
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 				struct cgroup *cont)
 {
-	kfree(mem_cgroup_from_cont(cont));
+	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+	int cpu;
+	for_each_possible_cpu(cpu) {
+		if (sizeof(struct mem_cgroup_stat_cpu) < PAGE_SIZE)
+			kfree(mem->stat.cpustat[cpu]);
+		else
+			vfree(mem->stat.cpustat[cpu]);
+	}
+	kfree(mem);
}
 
 static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -1021,5 +1066,5 @@ struct cgroup_subsys mem_cgroup_subsys =
 	.destroy = mem_cgroup_destroy,
 	.populate = mem_cgroup_populate,
 	.attach = mem_cgroup_move_task,
-	.early_init = 1,
+	.early_init = 0,
 };
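P.S. For readers following along, the core of the change is that each CPU now
owns a [node][stat] counter block: writers only touch their own CPU's block
under preempt_disable(), and a reader folds everything together. Below is a
minimal standalone sketch of that scheme (illustrative userspace C, not part
of the patch; NCPUS, NNODES, NSTATS and the function names are made-up
stand-ins for NR_CPUS, MAX_NUMNODES, MEM_CGROUP_STAT_NSTATS and the real
accessors):

/*
 * Sketch of the per-cpu, per-node counter layout used by the patch.
 * Each CPU owns one block, indexed [node][stat].
 */
#include <stdio.h>

#define NCPUS	4
#define NNODES	2
#define NSTATS	2		/* e.g. 0 = PAGECACHE, 1 = RSS */

struct stat_cpu {
	long long count[NNODES][NSTATS];
};

static struct stat_cpu cpustat[NCPUS];

/* Writer: a CPU updates only its own block, so no lock is needed. */
static void stat_add(int cpu, int nid, int idx, int val)
{
	cpustat[cpu].count[nid][idx] += val;
}

/* Reader: fold over all CPUs and nodes, as mem_control_stat_show() does. */
static long long stat_total(int idx)
{
	long long val = 0;
	int cpu, nid;

	for (cpu = 0; cpu < NCPUS; cpu++)
		for (nid = 0; nid < NNODES; nid++)
			val += cpustat[cpu].count[nid][idx];
	return val;
}

int main(void)
{
	stat_add(0, 0, 1, 1);	/* CPU 0 charges an RSS page on node 0 */
	stat_add(1, 1, 1, 1);	/* CPU 1 charges an RSS page on node 1 */
	printf("rss %lld\n", stat_total(1));	/* prints "rss 2" */
	return 0;
}

A per-node "numastat" reader would do the same fold but keep the node index,
summing over CPUs only.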