This patch implements node selection logic based on each node's scan weight.

It adds a new array, numascan_tickets[]. This array holds each node's scan
weight as a tuple of two values, [start, tickets], built roughly as:

	for (i = 0, total_weight = 0; i < nodes; i++) {
		weight = node->weight;
		numascan_tickets[i].start = total_weight;
		numascan_tickets[i].tickets = weight;
		total_weight += weight;
	}

After this, a lottery draw such as 'ticket = random32() % total_weight'
(in the patch, weights are scaled so that total_weight is about 0xffff and
the draw is 'random32() & 0xffff') makes a ticket, and
bsearch(ticket, numascan_tickets[]) finds the node whose [start, start+tickets)
range contains the ticket. (This is lottery scheduling.) In this way, a node
is selected fairly, in proportion to its weight.

This patch improves scan time. Below is a test result of a kernel make on
4-node fake NUMA under a 500M limit, with 8 CPUs, 2 CPUs per node.
(A standalone sketch of the selection logic is appended after the patch
for illustration.)

[Before patch]
772.52user 305.67system 4:11.48elapsed 428%CPU (0avgtext+0avgdata 1457264maxresident)k
4797592inputs+5483240outputs (12550major+35707629minor)pagefaults 0swaps

[After patch]
773.73user 305.09system 3:51.28elapsed 466%CPU (0avgtext+0avgdata 1458464maxresident)k
4400264inputs+4797056outputs (5578major+35690202minor)pagefaults 0swaps

Elapsed time and major faults are reduced.

Here, vmscan_stat shows:

[Before patch]
scanned_pages_by_limit 3926782
scanned_anon_pages_by_limit 1511090
scanned_file_pages_by_limit 2415692
elapsed_ns_by_limit 69528714562

[After patch]
scanned_pages_by_limit 4326462
scanned_anon_pages_by_limit 1310619
scanned_file_pages_by_limit 3015843
elapsed_ns_by_limit 42495200307

With this patch, reclaim scans file caches rather than anon pages, and the
elapsed time is much reduced.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
 include/linux/memcontrol.h |    3 
 mm/memcontrol.c            |  150 ++++++++++++++++++++++++++++++++++++++-------
 mm/vmscan.c                |    4 -
 3 files changed, 131 insertions(+), 26 deletions(-)

Index: mmotm-Aug3/mm/memcontrol.c
===================================================================
--- mmotm-Aug3.orig/mm/memcontrol.c
+++ mmotm-Aug3/mm/memcontrol.c
@@ -49,6 +49,9 @@
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
+#include <linux/random.h>
+#include <linux/bsearch.h>
+#include <linux/cpuset.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -151,6 +154,11 @@ struct mem_cgroup_lru_info {
 	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
 };
 
+struct numascan_ticket {
+	int nid;
+	unsigned int start, tickets;
+};
+
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -287,7 +295,10 @@ struct mem_cgroup {
 	atomic_t	numainfo_events;
 	atomic_t	numainfo_updating;
 	struct work_struct	numainfo_update_work;
-	unsigned long	total_weight;
+	unsigned long	total_weight;
+	int		numascan_generation;
+	int		numascan_tickets_num[2];
+	struct numascan_ticket	*numascan_tickets[2];
 #endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
@@ -1660,6 +1671,46 @@ mem_cgroup_calc_numascan_weight(struct m
 }
 
 /*
+ * For lottery scheduling, this routine distributes "tickets" for
+ * scanning to each node. Tickets are recorded into the numascan_ticket
+ * array and this array will be used for scheduling later.
+ * To make the lottery fair, we limit the sum of tickets to almost 0xffff.
+ * Later, random32() & 0xffff will do a proportionally fair lottery.
+ */
+#define NUMA_TICKET_SHIFT	(16)
+#define NUMA_TICKET_FACTOR	((1 << NUMA_TICKET_SHIFT) - 1)
+static void mem_cgroup_update_numascan_tickets(struct mem_cgroup *memcg)
+{
+	struct numascan_ticket *nt;
+	unsigned int node_ticket, assigned_tickets;
+	u64 weight;
+	int nid, assigned_num, generation;
+
+	/* update ticket information by double buffering */
+	generation = memcg->numascan_generation ^ 0x1;
+
+	nt = memcg->numascan_tickets[generation];
+	assigned_tickets = 0;
+	assigned_num = 0;
+	for_each_node_mask(nid, memcg->scan_nodes) {
+		weight = memcg->info.nodeinfo[nid]->weight;
+		node_ticket = div64_u64(weight << NUMA_TICKET_SHIFT,
+					memcg->total_weight + 1);
+		if (!node_ticket)
+			node_ticket = 1;
+		nt->nid = nid;
+		nt->start = assigned_tickets;
+		nt->tickets = node_ticket;
+		assigned_tickets += node_ticket;
+		nt++;
+		assigned_num++;
+	}
+	memcg->numascan_tickets_num[generation] = assigned_num;
+	smp_wmb();
+	memcg->numascan_generation = generation;
+}
+
+/*
  * Update all node's scan weight in background.
  */
 static void mem_cgroup_numainfo_update_work(struct work_struct *work)
@@ -1672,6 +1723,8 @@ static void mem_cgroup_numainfo_update_w
 
 	memcg->total_weight = mem_cgroup_calc_numascan_weight(memcg);
 
+	synchronize_rcu();
+	mem_cgroup_update_numascan_tickets(memcg);
 	atomic_set(&memcg->numainfo_updating, 0);
 	css_put(&memcg->css);
 }
@@ -1698,6 +1751,18 @@ static void mem_cgroup_may_update_nodema
 	schedule_work(&mem->numainfo_update_work);
 }
 
+static int node_weight_compare(const void *key, const void *elt)
+{
+	unsigned long lottery = (unsigned long)key;
+	struct numascan_ticket *nt = (struct numascan_ticket *)elt;
+
+	if (lottery < nt->start)
+		return -1;
+	if (lottery > (nt->start + nt->tickets))
+		return 1;
+	return 0;
+}
+
 /*
  * Selecting a node where we start reclaim from. Because what we need is just
  * reducing usage counter, start from anywhere is O,K. Considering
@@ -1707,32 +1772,38 @@ static void mem_cgroup_may_update_nodema
  * we'll use or we've used. So, it may make LRU bad. And if several threads
  * hit limits, it will see a contention on a node. But freeing from remote
 * node means more costs for memory reclaim because of memory latency.
- *
- * Now, we use round-robin. Better algorithm is welcomed.
 */
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem, nodemask_t **mask)
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg, nodemask_t **mask,
+				struct memcg_scanrecord *rec)
 {
-	int node;
+	int node = MAX_NUMNODES;
+	struct numascan_ticket *nt;
+	unsigned long lottery;
+	int generation;
 
+	if (rec->context == SCAN_BY_SHRINK)
+		goto out;
+
+	mem_cgroup_may_update_nodemask(memcg);
 	*mask = NULL;
-	mem_cgroup_may_update_nodemask(mem);
-	node = mem->last_scanned_node;
+	lottery = random32() & NUMA_TICKET_FACTOR;
 
-	node = next_node(node, mem->scan_nodes);
-	if (node == MAX_NUMNODES)
-		node = first_node(mem->scan_nodes);
-	/*
-	 * We call this when we hit limit, not when pages are added to LRU.
-	 * No LRU may hold pages because all pages are UNEVICTABLE or
-	 * memcg is too small and all pages are not on LRU. In that case,
-	 * we use curret node.
-	 */
-	if (unlikely(node == MAX_NUMNODES))
+	rcu_read_lock();
+	generation = memcg->numascan_generation;
+	nt = bsearch((void *)lottery,
+			memcg->numascan_tickets[generation],
+			memcg->numascan_tickets_num[generation],
+			sizeof(struct numascan_ticket), node_weight_compare);
+	rcu_read_unlock();
+	if (nt)
+		node = nt->nid;
+out:
+	if (unlikely(node == MAX_NUMNODES)) {
 		node = numa_node_id();
-	else
-		*mask = &mem->scan_nodes;
+		*mask = NULL;
+	} else
+		*mask = &memcg->scan_nodes;
 
-	mem->last_scanned_node = node;
 	return node;
 }
 
@@ -1771,14 +1842,42 @@ bool mem_cgroup_reclaimable(struct mem_c
 	return false;
 }
 
-static void mem_cgroup_numascan_init(struct mem_cgroup *memcg)
+static bool mem_cgroup_numascan_init(struct mem_cgroup *memcg)
 {
+	struct numascan_ticket *nt;
+	int nr_nodes;
+
 	INIT_WORK(&memcg->numainfo_update_work,
 		mem_cgroup_numainfo_update_work);
+
+	nr_nodes = num_possible_nodes();
+	nt = kmalloc(sizeof(struct numascan_ticket) * nr_nodes,
+			GFP_KERNEL);
+	if (!nt)
+		return false;
+	memcg->numascan_tickets[0] = nt;
+	nt = kmalloc(sizeof(struct numascan_ticket) * nr_nodes,
+			GFP_KERNEL);
+	if (!nt) {
+		kfree(memcg->numascan_tickets[0]);
+		memcg->numascan_tickets[0] = NULL;
+		return false;
+	}
+	memcg->numascan_tickets[1] = nt;
+	memcg->numascan_tickets_num[0] = 0;
+	memcg->numascan_tickets_num[1] = 0;
+	return true;
+}
+
+static void mem_cgroup_numascan_free(struct mem_cgroup *memcg)
+{
+	kfree(memcg->numascan_tickets[0]);
+	kfree(memcg->numascan_tickets[1]);
 }
 #else
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem, nodemask_t **mask)
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem, nodemask_t **mask,
+			struct memcg_scanrecord *rec)
 {
 	*mask = NULL;
 	return 0;
 }
@@ -1791,6 +1890,9 @@ bool mem_cgroup_reclaimable(struct mem_c
 static void mem_cgroup_numascan_init(struct mem_cgroup *memcg)
 {
 }
+static void mem_cgroup_numascan_free(struct mem_cgroup *memcg)
+{
+}
 #endif
 
 static void __mem_cgroup_record_scanstat(unsigned long *stats,
@@ -5080,6 +5182,7 @@ static void __mem_cgroup_free(struct mem
 	int node;
 
 	mem_cgroup_remove_from_trees(mem);
+	mem_cgroup_numascan_free(mem);
 	free_css_id(&mem_cgroup_subsys, &mem->css);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -5218,7 +5321,8 @@ mem_cgroup_create(struct cgroup_subsys *
 	mem->move_charge_at_immigrate = 0;
 	mutex_init(&mem->thresholds_lock);
 	spin_lock_init(&mem->scanstat.lock);
-	mem_cgroup_numascan_init(mem);
+	if (!mem_cgroup_numascan_init(mem))
+		goto free_out;
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);
Index: mmotm-Aug3/mm/vmscan.c
===================================================================
--- mmotm-Aug3.orig/mm/vmscan.c
+++ mmotm-Aug3/mm/vmscan.c
@@ -2378,9 +2378,9 @@ unsigned long try_to_free_mem_cgroup_pag
 	 * take care of from where we get pages. So the node where we start the
 	 * scan does not need to be the current node.
 	 */
-	nid = mem_cgroup_select_victim_node(mem_cont, &sc.nodemask);
+	nid = mem_cgroup_select_victim_node(mem_cont, &sc.nodemask, rec);
 
-	zonelist = NODE_DATA(nid)->node_zonelists;
+	zonelist = &NODE_DATA(nid)->node_zonelists[0];
 
 	trace_mm_vmscan_memcg_reclaim_begin(0,
 						sc.may_writepage,
Index: mmotm-Aug3/include/linux/memcontrol.h
===================================================================
--- mmotm-Aug3.orig/include/linux/memcontrol.h
+++ mmotm-Aug3/include/linux/memcontrol.h
@@ -118,7 +118,8 @@ extern void mem_cgroup_end_migration(str
  */
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg, nodemask_t **mask);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg, nodemask_t **mask,
+				struct memcg_scanrecord *rec);
 unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
 					int nid, int zid, unsigned int lrumask);
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
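
For illustration only (not part of the patch): the program below is a minimal
userspace sketch of the ticket assignment and lottery selection described
above. It stands in libc rand() and bsearch() for the kernel's random32() and
lib/bsearch.c, and the node count, the weight[] values, the hits[] counter and
the number of draws are made-up demo assumptions.

/* gcc -O2 -o lottery lottery.c ; illustrative sketch only */
#include <stdio.h>
#include <stdlib.h>

#define NUMA_TICKET_SHIFT	16
#define NUMA_TICKET_FACTOR	((1 << NUMA_TICKET_SHIFT) - 1)

struct numascan_ticket {
	int nid;
	unsigned int start, tickets;
};

/* the key is the lottery value itself, smuggled through the pointer argument */
static int node_weight_compare(const void *key, const void *elt)
{
	unsigned long lottery = (unsigned long)key;
	const struct numascan_ticket *nt = elt;

	if (lottery < nt->start)
		return -1;
	if (lottery >= nt->start + nt->tickets)	/* half-open [start, start+tickets) */
		return 1;
	return 0;
}

int main(void)
{
	unsigned long weight[] = { 400, 100, 300, 200 };	/* made-up node weights */
	int nodes = 4, nid, hits[4] = { 0 };
	unsigned long total_weight = 0;
	struct numascan_ticket nt[4];
	unsigned int assigned = 0;
	int i;

	for (nid = 0; nid < nodes; nid++)
		total_weight += weight[nid];

	/* distribute ~0xffff tickets proportionally to each node's weight */
	for (nid = 0; nid < nodes; nid++) {
		unsigned int t = (weight[nid] << NUMA_TICKET_SHIFT) /
						(total_weight + 1);
		if (!t)
			t = 1;
		nt[nid].nid = nid;
		nt[nid].start = assigned;
		nt[nid].tickets = t;
		assigned += t;
	}

	/* draw many lotteries; rand() stands in for random32(),
	 * assuming RAND_MAX >= NUMA_TICKET_FACTOR */
	for (i = 0; i < 100000; i++) {
		unsigned long lottery = (unsigned long)rand() & NUMA_TICKET_FACTOR;
		struct numascan_ticket *win;

		win = bsearch((void *)lottery, nt, nodes, sizeof(nt[0]),
				node_weight_compare);
		if (win)	/* a draw past 'assigned' matches no node */
			hits[win->nid]++;
	}

	for (nid = 0; nid < nodes; nid++)
		printf("node %d: weight %lu -> picked %d times\n",
				nid, weight[nid], hits[nid]);
	return 0;
}

When run, the per-node pick counts come out roughly in the 4:1:3:2 ratio of
the weights, which is the fairness property the patch relies on. In the patch
itself the ticket array is rebuilt by mem_cgroup_update_numascan_tickets()
and consumed under rcu_read_lock() in mem_cgroup_select_victim_node().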