On Thu, 9 Jun 2011 19:10:31 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:

> A scan node selection logic for memcg rather than round-robin.
>
> This patch is what I'm testing now, but I don't have a big NUMA machine.
> Please review if you have time. This is against linux-3.0-rc2.

I'm sorry: I've since noticed that I can remove all the "noswap" code,
so the size of the patch will be cut in half. I'll post a more
thoroughly tested version next week.

Thanks,
-Kame

> ==
> From fedf64c8b7e42a8d03974456f01fa3774df94afe Mon Sep 17 00:00:00 2001
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> Date: Thu, 9 Jun 2011 18:53:14 +0900
> Subject: [PATCH] weighted node scanning for memory cgroup.
>
> When I pushed commit 889976dbcb1218119fdd950fb7819084e37d7d37,
> "memcg: reclaim memory from nodes in round-robin order",
> I mentioned that "a better algorithm is needed."
>
> This patch is that better algorithm. It implements:
>
> 1. a per-node reclaim-target probability (called a weight.)
> 2. selecting a node from all nodes in a proportionally fair way,
>    based on the weights.
>
> For 1, this patch simply checks the number of pages on the LRUs, with
> some bias for the active/inactive ratio and swappiness. A node's
> weight is calculated as
>
>   weight = ((inactive_file + active_file * file_active_ratio) * (200 - swappiness)
>           + (inactive_anon + active_anon * anon_active_ratio) * swappiness) / 200
>
> In general, a node containing more inactive file pages gets a higher
> weight. The weights can be seen via memory.numa_stat:
>
> [root@bluextal linux-2.6]# cat /cgroup/memory/A/memory.numa_stat
> total=31863 N0=14235 N1=17628
> file=27912 N0=12814 N1=15098
> anon=3951 N0=1421 N1=2530
> unevictable=0 N0=0 N1=0
> lru_scan_weight=35798 N0=19886 N1=15912
>
> For 2, this patch uses lottery scheduling over the node weights. This
> is a proportionally fair logic.
>
> For example, assume nodes 0,1,2,3 with weights 1000,2000,1000,6000.
> The node selection probabilities are then 1:2:1:6. So even a node that
> contains only a small amount of memory has a chance to be reclaimed
> from. In the long run, this prevents the starvation that occurs when
> the system ignores a small amount of file cache on one node.
>
> I used bsearch() for finding a victim node, so you may find the code
> complicated.
>
> TO-BE-CHECKED:
>  - of course, the calculation of "weight" is the heart of the logic.
>  - which parts are hard to read?
>  - a cleverer implementation?
>  - needs more tests.
>  - choose better variable names and add more comments ;)
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
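As a quick illustration of the lottery selection described above, here is a
minimal userspace C sketch of the same idea: build cumulative [start, end)
weight ranges, draw a random ticket, and find the winner with bsearch(3).
The weights are the 1000/2000/1000/6000 example from the changelog;
everything else (the names, the use of rand()) is illustrative and not the
kernel code.

/* Userspace sketch of lottery selection over cumulative weight ranges.
 * Illustrative only: node weights are hard-coded, and rand() stands in
 * for the kernel's random32().
 */
#include <stdio.h>
#include <stdlib.h>

struct range {
        unsigned long start, end;       /* half-open interval [start, end) */
        int nid;
};

/* Three-way compare: 0 means "ticket falls inside this range". */
static int cmp_ticket(const void *key, const void *elt)
{
        const struct range *r = elt;
        unsigned long ticket = (unsigned long)key;

        if (ticket < r->start)
                return -1;
        if (ticket >= r->end)
                return 1;
        return 0;
}

int main(void)
{
        /* The weights from the changelog example, nodes 0..3. */
        unsigned long weights[] = { 1000, 2000, 1000, 6000 };
        struct range ranges[4];
        unsigned long total = 0;
        int hits[4] = { 0 };
        int i;

        /* Build contiguous cumulative ranges, sorted by construction. */
        for (i = 0; i < 4; i++) {
                ranges[i].nid = i;
                ranges[i].start = total;
                total += weights[i];
                ranges[i].end = total;
        }

        /* Draw many tickets; hit counts should approach 1:2:1:6. */
        for (i = 0; i < 100000; i++) {
                unsigned long ticket = (unsigned long)rand() % total;
                struct range *r = bsearch((void *)ticket, ranges, 4,
                                          sizeof(ranges[0]), cmp_ticket);
                hits[r->nid]++;
        }
        for (i = 0; i < 4; i++)
                printf("node %d: %d\n", i, hits[i]);
        return 0;
}

Run long enough, the counts should settle near 10%/20%/10%/60%, matching
the 1:2:1:6 ratio above. Passing the scalar ticket through the bsearch()
key pointer, rather than a pointer to it, is the same cast trick the patch
uses below.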
> ---
>  include/linux/memcontrol.h |    2 +-
>  mm/memcontrol.c            |  226 +++++++++++++++++++++++++++++++++++++------
>  mm/vmscan.c                |    2 +-
>  3 files changed, 196 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 9724a38..95d18b6 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -108,7 +108,7 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
>   */
>  int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
>  int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
> +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg, bool noswap);
>  unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
>                                         struct zone *zone,
>                                         enum lru_list lru);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 06825be..2ed3749 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -48,6 +48,8 @@
>  #include <linux/page_cgroup.h>
>  #include <linux/cpu.h>
>  #include <linux/oom.h>
> +#include <linux/random.h>
> +#include <linux/bsearch.h>
>  #include "internal.h"
>
>  #include <asm/uaccess.h>
> @@ -141,10 +143,22 @@ struct mem_cgroup_per_zone {
>
>  struct mem_cgroup_per_node {
>         struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
> +       unsigned long weight;
> +};
> +
> +struct node_schedinfo {
> +       unsigned long weight_start, weight_end;
> +       unsigned long weight_noswap_start, weight_noswap_end;
> +       int nid;
>  };
>
>  struct mem_cgroup_lru_info {
>         struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
> +       struct node_schedinfo *schedinfo;
> +       struct rw_semaphore updating;
> +       int nr_nodes;
> +       unsigned long total_weight;
> +       unsigned long total_weight_noswap;
>  };
>
>  /*
> @@ -1559,36 +1573,157 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
>  }
>
>  #if MAX_NUMNODES > 1
> +/*
> + * Weight calculation for node scanning.
> + * Basically, a node that holds inactive pages is guilty. But if we never
> + * scan nodes that have only active pages, LRU rotation on those nodes
> + * never occurs. So, we need to take active pages into account to some
> + * extent.
> + */
> +static void mem_cgroup_node_sched_weight(struct mem_cgroup *mem,
> +               int nid, unsigned long usage,
> +               int file_active_ratio, int anon_active_ratio,
> +               unsigned long *weight, unsigned long *weight_noswap)
> +{
> +       unsigned long file, anon;
> +
> +       file = mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE);
> +       /* discount this node's active pages by the memcg-wide ratio */
> +       file = file * file_active_ratio / 100;
> +       file += mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE);
> +
> +       anon = mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON);
> +       /* discount this node's active pages by the memcg-wide ratio */
> +       anon = anon * anon_active_ratio / 100;
> +       anon += mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON);
> +
> +       *weight_noswap = file;
> +       *weight = file * (200 - mem->swappiness) + anon * (mem->swappiness + 1);
> +       *weight /= 200;
> +
> +       return;
> +}
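To see the weight arithmetic above with made-up numbers: take swappiness = 60
and a node holding 6000 inactive + 4000 active file pages and 900 inactive +
100 active anon pages, with file_active_ratio = 40 and anon_active_ratio = 10.
Then file = 6000 + 4000 * 40 / 100 = 7600 and anon = 900 + 100 * 10 / 100 = 910,
so weight = (7600 * (200 - 60) + 910 * (60 + 1)) / 200
          = (1064000 + 55510) / 200 = 5597 (integer division),
and weight_noswap = 7600. Note that the code multiplies anon by
swappiness + 1, not swappiness, so anon pages still contribute a little to
the weight even at swappiness == 0.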
> +
> +/*
> + * Calculate the active ratio for one page type (file or anon), with
> + * regard to the memcg's current total active/inactive balance.
> + */
> +static unsigned int
> +mem_cgroup_active_ratio(struct mem_cgroup *mem, enum lru_list l)
> +{
> +       unsigned long active, inactive;
> +
> +       inactive = mem_cgroup_get_local_zonestat(mem, l);
> +       active = mem_cgroup_get_local_zonestat(mem, l + LRU_ACTIVE);
> +
> +       return (active * 100) / (active + inactive);
> +}
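As a concrete example of this helper: if the memcg holds 27912 file pages
overall and 11165 of them are active, mem_cgroup_active_ratio() returns
11165 * 100 / 27912 = 40, so each node's active file pages count at 40%
toward that node's weight. (The numbers here are illustrative. One thing
worth double-checking: if a memcg has no pages at all on a given LRU pair,
active + inactive is 0 and this divides by zero.)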
>
>  /*
>   * Always updating the nodemask is not very good - even if we have an empty
>   * list or the wrong list here, we can start from some node and traverse all
>   * nodes based on the zonelist. So update the list loosely once per 10 secs.
> - *
>   */
> -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> +static void mem_cgroup_may_update_nodeweight(struct mem_cgroup *mem)
>  {
>         int nid;
> +       int file_active_ratio, anon_active_ratio;
> +       unsigned long total_weight, total_weight_noswap;
> +       struct node_schedinfo *ns;
> +       unsigned long usage;
>
>         if (time_after(mem->next_scan_node_update, jiffies))
>                 return;
> -
> +       down_write(&mem->info.updating);
> +       /* double check for a race. */
> +       if (time_after(mem->next_scan_node_update, jiffies)) {
> +               up_write(&mem->info.updating);
> +               return;
> +       }
>         mem->next_scan_node_update = jiffies + 10*HZ;
> +
>         /* make a nodemask where this memcg uses memory from */
> -       mem->scan_nodes = node_states[N_HIGH_MEMORY];
> +       total_weight = 0;
> +       total_weight_noswap = 0;
>
> -       for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
>
> -               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
> -                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
> -                       continue;
> +       file_active_ratio = mem_cgroup_active_ratio(mem, LRU_INACTIVE_FILE);
> +       anon_active_ratio = mem_cgroup_active_ratio(mem, LRU_INACTIVE_ANON);
> +       usage = res_counter_read_u64(&mem->res, RES_USAGE) >> PAGE_SHIFT;
> +
> +       /*
> +        * Calculate each node's weight and put the score into the array
> +        * of node_schedinfo. Each node_schedinfo contains a nid and
> +        * [weight_start, weight_end). Later, a thread will get a ticket
> +        * with a number < total_weight and find the node_schedinfo which
> +        * has weight_start <= ticket number < weight_end.
> +        *
> +        * If the ticket number is random enough, we get proportionally
> +        * fair victim selection with regard to the "weight" of each node.
> +        *
> +        * Why do we use "schedinfo" rather than adding members to the
> +        * per-node structure? There are 2 reasons: 1) the node array can
> +        * have holes, which is not good for bsearch(), and 2) we can skip
> +        * nodes which have no memory. Using another structure, like a
> +        * prio-tree, is a possible idea, of course.
> +        */
> +       ns = &mem->info.schedinfo[0];
> +
> +       for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
> +               unsigned long weight, weight_noswap;
>
> -               if (total_swap_pages &&
> -                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
> -                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
> +               mem_cgroup_node_sched_weight(mem, nid, usage,
> +                               file_active_ratio,
> +                               anon_active_ratio,
> +                               &weight, &weight_noswap);
> +
> +               if (weight == 0)
>                         continue;
> -               node_clear(nid, mem->scan_nodes);
> +               ns->nid = nid;
> +               /* For a noswap scan, we take care of file caches only */
> +               ns->weight_noswap_start = total_weight_noswap;
> +               ns->weight_noswap_end = ns->weight_noswap_start + weight_noswap;
> +               total_weight_noswap += weight_noswap;
> +
> +               ns->weight_start = total_weight;
> +               ns->weight_end = ns->weight_start + weight;
> +               total_weight += weight;
> +               ns++;
> +               /* for numa stat */
> +               mem->info.nodeinfo[nid]->weight = weight;
>         }
> +
> +       mem->info.total_weight = total_weight;
> +       mem->info.total_weight_noswap = total_weight_noswap;
> +       mem->info.nr_nodes = ns - mem->info.schedinfo;
> +
> +       up_write(&mem->info.updating);
> +}
> +
> +/* routines for finding the lottery winner by bsearch. */
> +static int cmp_node_lottery_weight(const void *key, const void *elt)
> +{
> +       struct node_schedinfo *ns = (struct node_schedinfo *)elt;
> +       unsigned long lottery = (unsigned long)key;
> +
> +       if (lottery < ns->weight_start)
> +               return -1;
> +       if (lottery >= ns->weight_end)
> +               return 1;
> +       return 0;
> +}
> +
> +static int cmp_node_lottery_weight_noswap(const void *key, const void *elt)
> +{
> +       struct node_schedinfo *ns = (struct node_schedinfo *)elt;
> +       unsigned long lottery = (unsigned long)key;
> +
> +       if (lottery < ns->weight_noswap_start)
> +               return -1;
> +       if (lottery >= ns->weight_noswap_end)
> +               return 1;
> +       return 0;
> +}
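The locking in mem_cgroup_may_update_nodeweight() above — a cheap unlocked
staleness check, then down_write() and a re-check — is a classic
double-checked refresh; the reader side in mem_cgroup_select_victim_node()
below uses down_read_trylock() so it never blocks on an update. Here is a
minimal userspace sketch of the same pattern, with a pthreads rwlock
standing in for the kernel rwsem; all names and the 10-second period are
illustrative, not the kernel's.

/* Userspace sketch of the "cheap check, write-lock, re-check" refresh
 * pattern, plus a non-blocking reader with a fallback value.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_rwlock_t updating = PTHREAD_RWLOCK_INITIALIZER;
static time_t next_update;              /* refresh deadline */
static unsigned long table_version;     /* stands in for the schedinfo array */

static void may_update_table(void)
{
        if (time(NULL) < next_update)   /* unlocked staleness check */
                return;
        pthread_rwlock_wrlock(&updating);
        /* re-check: another thread may have refreshed while we waited */
        if (time(NULL) >= next_update) {
                next_update = time(NULL) + 10;  /* like jiffies + 10*HZ */
                table_version++;        /* rebuild the weight table here */
        }
        pthread_rwlock_unlock(&updating);
}

static long lookup_or_fallback(long fallback)
{
        long v;

        may_update_table();
        /* readers never block: use the fallback if an update is in flight */
        if (pthread_rwlock_tryrdlock(&updating) != 0)
                return fallback;
        v = (long)table_version;        /* consult the table here */
        pthread_rwlock_unlock(&updating);
        return v;
}

int main(void)
{
        printf("%ld\n", lookup_or_fallback(-1));
        return 0;
}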
>
>  /*
> @@ -1601,29 +1736,42 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
>   * hit limits, it will see a contention on a node. But freeing from remote
>   * node means more costs for memory reclaim because of memory latency.
>   *
> - * Now, we use round-robin. Better algorithm is welcomed.
> + * This selects a node by a lottery scheduling algorithm with regard to
> + * each node's weight under the whole memcg usage. By this we'll scan
> + * nodes in a way proportionally fair to the number of reclaim-candidate
> + * pages per node.
>   */
> -int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem, bool noswap)
>  {
> -       int node;
> -
> -       mem_cgroup_may_update_nodemask(mem);
> -       node = mem->last_scanned_node;
> -
> -       node = next_node(node, mem->scan_nodes);
> -       if (node == MAX_NUMNODES)
> -               node = first_node(mem->scan_nodes);
> -       /*
> -        * We call this when we hit limit, not when pages are added to LRU.
> -        * No LRU may hold pages because all pages are UNEVICTABLE or
> -        * memcg is too small and all pages are not on LRU. In that case,
> -        * we use curret node.
> -        */
> -       if (unlikely(node == MAX_NUMNODES))
> -               node = numa_node_id();
> +       struct node_schedinfo *ns;
> +       unsigned long lottery;
> +       int nid;
>
> -       mem->last_scanned_node = node;
> -       return node;
> +       mem_cgroup_may_update_nodeweight(mem);
> +       /* use a pseudo nid while the schedule information is being updated. */
> +       if (!down_read_trylock(&mem->info.updating))
> +               return numa_node_id();
> +
> +       if (!mem->info.nr_nodes) {
> +               /* if every node has only a very small amount of memory... */
> +               ns = NULL;
> +       } else if (noswap) {
> +               lottery = random32() % mem->info.total_weight_noswap;
> +               ns = bsearch((void *)lottery, mem->info.schedinfo,
> +                               mem->info.nr_nodes,
> +                               sizeof(*ns), cmp_node_lottery_weight_noswap);
> +       } else {
> +               lottery = random32() % mem->info.total_weight;
> +               ns = bsearch((void *)lottery, mem->info.schedinfo,
> +                               mem->info.nr_nodes,
> +                               sizeof(*ns), cmp_node_lottery_weight);
> +       }
> +       if (ns)
> +               nid = ns->nid;
> +       else
> +               nid = numa_node_id();
> +       up_read(&mem->info.updating);
> +       return nid;
>  }
>
>  #else
> @@ -4135,6 +4283,11 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
>                 seq_printf(m, " N%d=%lu", nid, node_nr);
>         }
>         seq_putc(m, '\n');
> +       seq_printf(m, "lru_scan_weight=%lu", mem_cont->info.total_weight);
> +       for_each_node_state(nid, N_HIGH_MEMORY)
> +               seq_printf(m, " N%d=%lu", nid,
> +                               mem_cont->info.nodeinfo[nid]->weight);
> +       seq_putc(m, '\n');
>         return 0;
>  }
>  #endif /* CONFIG_NUMA */
> @@ -4789,6 +4942,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
>         for_each_node_state(node, N_POSSIBLE)
>                 free_mem_cgroup_per_zone_info(mem, node);
>
> +       kfree(mem->info.schedinfo);
> +
>         free_percpu(mem->stat);
>         if (sizeof(struct mem_cgroup) < PAGE_SIZE)
>                 kfree(mem);
> @@ -4878,6 +5033,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>                 if (alloc_mem_cgroup_per_zone_info(mem, node))
>                         goto free_out;
>
> +       node = nodes_weight(node_states[N_POSSIBLE]);
> +       mem->info.schedinfo =
> +               kzalloc(sizeof(struct node_schedinfo) * node, GFP_KERNEL);
> +       if (!mem->info.schedinfo)
> +               goto free_out;
> +       init_rwsem(&mem->info.updating);
> +
>         /* root ? */
>         if (cont->parent == NULL) {
>                 int cpu;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index faa0a08..dd1823b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2254,7 +2254,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>          * take care of from where we get pages. So the node where we start the
>          * scan does not need to be the current node.
>          */
> -       nid = mem_cgroup_select_victim_node(mem_cont);
> +       nid = mem_cgroup_select_victim_node(mem_cont, noswap);
>
>         zonelist = NODE_DATA(nid)->node_zonelists;
>
> --
> 1.7.4.1