KAMEZAWA Hiroyuki wrote:
> This patch is a trial for making the memory_cgroup statistics NUMA-aware.
>
> * divides the per-cpu stat so it can carry nid information.
> * adds nid information to page_cgroup.
> * Because the init routine has to call kmalloc(), early_init is set to 0.
>
> This is just a trial at this stage. Any comments are welcome.
> Works well on my fake NUMA system.
> I think we can add "numastat" based on this.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
>
>
>  mm/memcontrol.c |   97 ++++++++++++++++++++++++++++++++++++++++----------------
>  1 file changed, 71 insertions(+), 26 deletions(-)
>
> Index: devel-2.6.23-mm1/mm/memcontrol.c
> ===================================================================
> --- devel-2.6.23-mm1.orig/mm/memcontrol.c
> +++ devel-2.6.23-mm1/mm/memcontrol.c
> @@ -29,6 +29,7 @@
>  #include <linux/spinlock.h>
>  #include <linux/fs.h>
>  #include <linux/seq_file.h>
> +#include <linux/vmalloc.h>
>
>  #include <asm/uaccess.h>
>
> @@ -59,12 +60,18 @@ enum mem_cgroup_stat_index {
>  	MEM_CGROUP_STAT_NSTATS,
>  };
>
> +#ifndef CONFIG_NUMA
>  struct mem_cgroup_stat_cpu {
> -	s64 count[MEM_CGROUP_STAT_NSTATS];
> +	s64 count[1][MEM_CGROUP_STAT_NSTATS];
>  } ____cacheline_aligned_in_smp;
> +#else
> +struct mem_cgroup_stat_cpu {
> +	s64 count[MAX_NUMNODES][MEM_CGROUP_STAT_NSTATS];
> +} ____cacheline_aligned_in_smp;
> +#endif
>
>  struct mem_cgroup_stat {
> -	struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
> +	struct mem_cgroup_stat_cpu *cpustat[NR_CPUS];
>  };
>
> @@ -72,28 +79,28 @@ struct mem_cgroup_stat {
>   * MUST be called under preempt_disable().
>   */
>  static inline void __mem_cgroup_stat_add(struct mem_cgroup_stat *stat,
> -		enum mem_cgroup_stat_index idx, int val)
> +		enum mem_cgroup_stat_index idx, int nid, int val)
>  {
>  	int cpu = smp_processor_id();
>  #ifdef CONFIG_PREEMPT
>  	VM_BUG_ON(preempt_count() == 0);
>  #endif
> -	stat->cpustat[cpu].count[idx] += val;
> +	stat->cpustat[cpu]->count[nid][idx] += val;
>  }
>
>  static inline void mem_cgroup_stat_inc(struct mem_cgroup_stat *stat,
> -		enum mem_cgroup_stat_index idx)
> +		enum mem_cgroup_stat_index idx, int nid)
>  {
>  	preempt_disable();
> -	__mem_cgroup_stat_add(stat, idx, 1);
> +	__mem_cgroup_stat_add(stat, idx, nid, 1);
>  	preempt_enable();
>  }
>
>  static inline void mem_cgroup_stat_dec(struct mem_cgroup_stat *stat,
> -		enum mem_cgroup_stat_index idx)
> +		enum mem_cgroup_stat_index idx, int nid)
>  {
>  	preempt_disable();
> -	__mem_cgroup_stat_add(stat, idx, -1);
> +	__mem_cgroup_stat_add(stat, idx, nid, -1);
>  	preempt_enable();
>  }
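
The new count[nid][idx] layout makes the "numastat" idea cheap to build on
top of these accessors. A hypothetical helper, just to illustrate the
indexing (not part of the patch; on !CONFIG_NUMA builds the array has a
single slot, so the only valid nid is 0):

	static s64 mem_cgroup_read_stat_node(struct mem_cgroup_stat *stat,
			enum mem_cgroup_stat_index idx, int nid)
	{
		s64 val = 0;
		unsigned int cpu;

		/* Sum this node's counter over all possible CPUs. */
		for_each_possible_cpu(cpu)
			val += stat->cpustat[cpu]->count[nid][idx];
		return val;
	}

This is essentially the inner loop of the mem_control_stat_show() change
further down, restricted to a single node.
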
> @@ -149,6 +156,7 @@ struct page_cgroup {
>  	struct list_head lru;		/* per cgroup LRU list */
>  	struct page *page;
>  	struct mem_cgroup *mem_cgroup;
> +	int nid;
>  	atomic_t ref_cnt;		/* Helpful when pages move b/w */
>  					/* mapped and cached states    */
>  	int flags;
> @@ -169,21 +177,23 @@ enum {
>   * We have to modify several values at charge/uncharge..
>   */
>  static inline void
> -mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, int charge)
> +mem_cgroup_charge_statistics(struct mem_cgroup *mem, int nid,
> +		int flags, int charge)
>  {
>  	int val = (charge)? 1 : -1;
>  	struct mem_cgroup_stat *stat = &mem->stat;
>  	preempt_disable();
>
>  	if (flags & PCGF_PAGECACHE)
> -		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_PAGECACHE, val);
> +		__mem_cgroup_stat_add(stat,
> +				MEM_CGROUP_STAT_PAGECACHE, nid, val);
>  	else
> -		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_RSS, val);
> +		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_RSS, nid, val);
>
>  	if (flags & PCGF_ACTIVE)
> -		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE, val);
> +		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE, nid, val);
>  	else
> -		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE, val);
> +		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE, nid, val);
>
>  	preempt_enable();
>  }
> @@ -315,8 +325,10 @@ static void __mem_cgroup_move_lists(stru
>  	if (moved) {
>  		struct mem_cgroup_stat *stat = &mem->stat;
>  		preempt_disable();
> -		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE, moved);
> -		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE, -moved);
> +		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_ACTIVE,
> +				pc->nid, moved);
> +		__mem_cgroup_stat_add(stat, MEM_CGROUP_STAT_INACTIVE,
> +				pc->nid, -moved);
>  		preempt_enable();
>  	}
>  	if (active) {
> @@ -493,10 +505,10 @@ retry:
>  		bool is_atomic = gfp_mask & GFP_ATOMIC;
>  		if (is_cache)
>  			mem_cgroup_stat_inc(&mem->stat,
> -					MEM_CGROUP_STAT_FAIL_CACHE);
> +				MEM_CGROUP_STAT_FAIL_CACHE, page_to_nid(page));
>  		else
>  			mem_cgroup_stat_inc(&mem->stat,
> -					MEM_CGROUP_STAT_FAIL_RSS);
> +				MEM_CGROUP_STAT_FAIL_RSS, page_to_nid(page));
>  		/*
>  		 * We cannot reclaim under GFP_ATOMIC, fail the charge
>  		 */
> @@ -537,6 +549,7 @@ noreclaim:
>  	atomic_set(&pc->ref_cnt, 1);
>  	pc->mem_cgroup = mem;
>  	pc->page = page;
> +	pc->nid = page_to_nid(page);
>  	if (is_cache)
>  		pc->flags = PCGF_PAGECACHE | PCGF_ACTIVE;
>  	else
> @@ -554,7 +567,7 @@ noreclaim:
>  	}
>
>  	/* Update statistics vector */
> -	mem_cgroup_charge_statistics(mem, pc->flags, true);
> +	mem_cgroup_charge_statistics(mem, pc->nid, pc->flags, true);
>
>  	spin_lock_irqsave(&mem->lru_lock, flags);
>  	list_add(&pc->lru, &mem->active_list);
> @@ -621,7 +634,8 @@ void mem_cgroup_uncharge(struct page_cgr
>  		spin_lock_irqsave(&mem->lru_lock, flags);
>  		list_del_init(&pc->lru);
>  		spin_unlock_irqrestore(&mem->lru_lock, flags);
> -		mem_cgroup_charge_statistics(mem, pc->flags, false);
> +		mem_cgroup_charge_statistics(mem, pc->nid, pc->flags,
> +				false);
>  		kfree(pc);
>  	}
>  }
> @@ -657,6 +671,7 @@ void mem_cgroup_end_migration(struct pag
>  void mem_cgroup_page_migration(struct page *page, struct page *newpage)
>  {
>  	struct page_cgroup *pc;
> +	struct mem_cgroup *mem;
>  retry:
>  	pc = page_get_page_cgroup(page);
>  	if (!pc)
> @@ -664,6 +679,11 @@ retry:
>  	if (clear_page_cgroup(page, pc) != pc)
>  		goto retry;
>  	pc->page = newpage;
> +	pc->nid = page_to_nid(newpage);
> +	mem = pc->mem_cgroup;
> +	mem_cgroup_charge_statistics(mem, page_to_nid(page), pc->flags, false);
> +	mem_cgroup_charge_statistics(mem,
> +			page_to_nid(newpage), pc->flags, true);
>  	lock_page_cgroup(newpage);
>  	page_assign_page_cgroup(newpage, pc);
>  	unlock_page_cgroup(newpage);
> @@ -697,7 +717,8 @@ retry:
>  			css_put(&mem->css);
>  			res_counter_uncharge(&mem->res, PAGE_SIZE);
>  			list_del_init(&pc->lru);
> -			mem_cgroup_charge_statistics(mem, pc->flags, false);
> +			mem_cgroup_charge_statistics(mem, pc->nid, pc->flags,
> +					false);
>  			kfree(pc);
>  		} else	/* being uncharged ? ...do relax */
>  			break;
> @@ -872,13 +893,16 @@ static int mem_control_stat_show(struct
>  	struct mem_cgroup_stat *stat = &mem_cont->stat;
>  	int i;
>
> -	for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) {
> +	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
>  		unsigned int cpu;
> +		int node;
>  		s64 val;
>
>  		val = 0;
> -		for (cpu = 0; cpu < NR_CPUS; cpu++)
> -			val += stat->cpustat[cpu].count[i];
> +		for_each_possible_cpu(cpu)
> +			for_each_node_state(node, N_POSSIBLE)
> +				val += stat->cpustat[cpu]->count[node][i];
> +
>  		val *= mem_cgroup_stat_desc[i].unit;
>  		seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, val);
>  	}
> @@ -941,12 +965,14 @@ static struct cgroup_subsys_state *
>  mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>  {
>  	struct mem_cgroup *mem;
> +	int cpu;
>
>  	if (unlikely((cont->parent) == NULL)) {
>  		mem = &init_mem_cgroup;
>  		init_mm.mem_cgroup = mem;
> -	} else
> +	} else {
>  		mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
> +	}
>
>  	if (mem == NULL)
>  		return NULL;
> @@ -956,6 +982,17 @@ mem_cgroup_create(struct cgroup_subsys *
>  	INIT_LIST_HEAD(&mem->inactive_list);
>  	spin_lock_init(&mem->lru_lock);
>  	mem->control_type = MEM_CGROUP_TYPE_ALL;
> +
> +	for_each_possible_cpu(cpu) {
> +		int nid = cpu_to_node(cpu);
> +		struct mem_cgroup_stat_cpu *mcsc;
> +		if (sizeof(*mcsc) < PAGE_SIZE)
> +			mcsc = kmalloc_node(sizeof(*mcsc), GFP_KERNEL, nid);
> +		else
> +			mcsc = vmalloc_node(sizeof(*mcsc), nid);

Do we need to use the vmalloc() pool? I think we might be better off
using a dedicated slab for ourselves -- see the sketch after this hunk.

> +		memset(mcsc, 0, sizeof(*mcsc));
> +		mem->stat.cpustat[cpu] = mcsc;
> +	}
>  	return &mem->css;
>  }
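
To make that concrete, here is a rough sketch of the dedicated-slab
variant. The cache and helper names are made up for illustration, the
caller-side unwinding is not shown, and it assumes the stat block stays
small enough for a slab object. Note that the posted loop never checks
the kmalloc_node()/vmalloc_node() result before the memset(); a single
allocation path makes that check easy to add:

	static struct kmem_cache *memcg_stat_cachep;

	static int __init memcg_stat_cache_init(void)
	{
		/* One cache sized for the per-cpu/per-node counter block. */
		memcg_stat_cachep = kmem_cache_create("mem_cgroup_stat_cpu",
					sizeof(struct mem_cgroup_stat_cpu),
					0, 0, NULL);
		return memcg_stat_cachep ? 0 : -ENOMEM;
	}

	static int mem_cgroup_alloc_stat(struct mem_cgroup *mem)
	{
		int cpu;

		for_each_possible_cpu(cpu) {
			struct mem_cgroup_stat_cpu *mcsc;

			/* Node-local placement, as kmalloc_node() gives us today. */
			mcsc = kmem_cache_alloc_node(memcg_stat_cachep,
						GFP_KERNEL, cpu_to_node(cpu));
			if (!mcsc)
				return -ENOMEM;	/* caller frees the earlier cpus */
			memset(mcsc, 0, sizeof(*mcsc));
			mem->stat.cpustat[cpu] = mcsc;
		}
		return 0;
	}

A named cache also shows up in /proc/slabinfo, which makes the footprint
easier to track than anonymous kmalloc()/vmalloc() blocks.
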
> @@ -969,7 +1006,15 @@ static void mem_cgroup_pre_destroy(struc
>  static void mem_cgroup_destroy(struct cgroup_subsys *ss,
>  				struct cgroup *cont)
>  {
> -	kfree(mem_cgroup_from_cont(cont));
> +	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
> +	int cpu;
> +	for_each_possible_cpu(cpu) {
> +		if (sizeof(struct mem_cgroup_stat_cpu) < PAGE_SIZE)
> +			kfree(mem->stat.cpustat[cpu]);
> +		else
> +			vfree(mem->stat.cpustat[cpu]);
> +	}
> +	kfree(mem);
>  }
>
>  static int mem_cgroup_populate(struct cgroup_subsys *ss,
> @@ -1021,5 +1066,5 @@ struct cgroup_subsys mem_cgroup_subsys =
>  	.destroy = mem_cgroup_destroy,
>  	.populate = mem_cgroup_populate,
>  	.attach = mem_cgroup_move_task,
> -	.early_init = 1,
> +	.early_init = 0,

I don't understand why this change is required here?

>  };

--
Warm Regards,
Balbir Singh
Linux Technology Center
IBM, ISTL

_______________________________________________
Containers mailing list
Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx
https://lists.linux-foundation.org/mailman/listinfo/containers