I may post this patch as a standalone patch later. == Check whether memcg has reclaimable pages at select_victim(). Now, with the help of the memcg->scan_nodes bitmap, we can check whether a memcg has reclaimable pages with a simple nodes_empty(mem->scan_nodes) test. mem->scan_nodes is a bitmap which shows whether the memcg contains reclaimable memory or not, and it is updated periodically. This patch makes use of scan_nodes and modifies the hierarchy walk at memory shrinking in the following way. - check scan_nodes in mem_cgroup_select_victim() - mem_cgroup_select_victim() returns NULL if no memcg is reclaimable. - force update of scan_nodes. - rename mem_cgroup_select_victim() to mem_cgroup_select_get_victim() to show that refcnt is +1. This will make the hierarchy walk better. And this allows us to remove the mem_cgroup_local_usage() check, which was used for the same purpose. But that function was wrong because it cannot take into account unevictable pages or the tmpfs vs. swapless situation. Changelog: - added since v3. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> --- mm/memcontrol.c | 165 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 110 insertions(+), 55 deletions(-) Index: memcg_async/mm/memcontrol.c =================================================================== --- memcg_async.orig/mm/memcontrol.c +++ memcg_async/mm/memcontrol.c @@ -584,15 +584,6 @@ static long mem_cgroup_read_stat(struct return val; } -static long mem_cgroup_local_usage(struct mem_cgroup *mem) -{ - long ret; - - ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); - ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); - return ret; -} - static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, bool charge) { @@ -1555,43 +1546,6 @@ u64 mem_cgroup_get_limit(struct mem_cgro return min(limit, memsw); } -/* - * Visit the first child (need not be the first child as per the ordering - * of the cgroup list, since we track last_scanned_child) of @mem and use - * that to reclaim free 
pages from. - */ -static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_mem) -{ - struct mem_cgroup *ret = NULL; - struct cgroup_subsys_state *css; - int nextid, found; - - if (!root_mem->use_hierarchy) { - css_get(&root_mem->css); - ret = root_mem; - } - - while (!ret) { - rcu_read_lock(); - nextid = root_mem->last_scanned_child + 1; - css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, - &found); - if (css && css_tryget(css)) - ret = container_of(css, struct mem_cgroup, css); - - rcu_read_unlock(); - /* Updates scanning parameter */ - if (!css) { - /* this means start scan from ID:1 */ - root_mem->last_scanned_child = 0; - } else - root_mem->last_scanned_child = found; - } - - return ret; -} - #if MAX_NUMNODES > 1 /* @@ -1600,11 +1554,11 @@ mem_cgroup_select_victim(struct mem_cgro * nodes based on the zonelist. So update the list loosely once per 10 secs. * */ -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force) { int nid; - if (time_after(mem->next_scan_node_update, jiffies)) + if (!force && time_after(mem->next_scan_node_update, jiffies)) return; mem->next_scan_node_update = jiffies + 10*HZ; @@ -1641,7 +1595,7 @@ int mem_cgroup_select_victim_node(struct { int node; - mem_cgroup_may_update_nodemask(mem); + mem_cgroup_may_update_nodemask(mem, false); node = mem->last_scanned_node; node = next_node(node, mem->scan_nodes); @@ -1660,13 +1614,117 @@ int mem_cgroup_select_victim_node(struct return node; } +/** + * mem_cgroup_has_reclaimable + * @mem_cgroup : the mem_cgroup + * + * The caller can test whether the memcg has reclaimable pages. + * + * This function checks memcg has reclaimable pages or not with bitmap of + * memcg->scan_nodes. This bitmap is updated periodically and indicates + * which node has reclaimable memcg memory or not. 
+ * This is a rough test and the result is not very precise, but we don't + * have to scan all nodes and don't have to use locks. + * + * For non-NUMA, this checks reclaimable pages on zones because we don't + * update scan_nodes. (see below) + */ +static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg) +{ + return !nodes_empty(memcg->scan_nodes); +} + #else + +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force) +{ +} + int mem_cgroup_select_victim_node(struct mem_cgroup *mem) { return 0; } + +static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg) +{ + unsigned long nr; + int zid; + + for (zid = NODE_DATA(0)->nr_zones - 1; zid >= 0; zid--) + if (mem_cgroup_zone_reclaimable_pages(memcg, 0, zid)) + break; + if (zid < 0) + return false; + return true; +} #endif +/** + * mem_cgroup_select_get_victim + * @root_mem: the root memcg of hierarchy which should be shrunk. + * + * Visit children of root_mem one by one. If the routine finds a memcg + * which contains reclaimable pages, returns it with refcnt +1. The + * scan is done in round-robin and 'the next start point' is saved into + * root_mem->last_scanned_child. If no reclaimable memcg is found, returns NULL. + */ +static struct mem_cgroup * +mem_cgroup_select_get_victim(struct mem_cgroup *root_mem) +{ + struct mem_cgroup *ret = NULL; + struct cgroup_subsys_state *css; + int nextid, found; + bool second_visit = false; + + if (!root_mem->use_hierarchy) + goto return_root; + + while (!ret) { + rcu_read_lock(); + nextid = root_mem->last_scanned_child + 1; + css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, + &found); + if (css && css_tryget(css)) + ret = container_of(css, struct mem_cgroup, css); + + rcu_read_unlock(); + /* Updates scanning parameter */ + if (!css) { /* Indicates we scanned the last node of tree */ + /* + * If no memcg has reclaimable pages, we may enter + * an infinite loop. Exit here if we reached the end + * of hierarchy tree twice. 
+ */ + if (second_visit) + return NULL; + /* this means start scan from ID:1 */ + root_mem->last_scanned_child = 0; + second_visit = true; + } else + root_mem->last_scanned_child = found; + if (css && ret) { + /* + * check memcg has reclaimable memory or not. Update + * information carefully if we might fail with cached + * bitmask information. + */ + if (second_visit) + mem_cgroup_may_update_nodemask(ret, true); + + if (!mem_cgroup_has_reclaimable(ret)) { + css_put(css); + ret = NULL; + } + } + } + + return ret; +return_root: + css_get(&root_mem->css); + return root_mem; +} + + /* * Scan the hierarchy if needed to reclaim memory. We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively @@ -1705,7 +1763,9 @@ static int mem_cgroup_hierarchical_recla is_kswapd = true; while (1) { - victim = mem_cgroup_select_victim(root_mem); + victim = mem_cgroup_select_get_victim(root_mem); + if (!victim) + return total; if (victim == root_mem) { loop++; if (loop >= 1) @@ -1733,11 +1793,6 @@ static int mem_cgroup_hierarchical_recla } } } - if (!mem_cgroup_local_usage(victim)) { - /* this cgroup's local usage == 0 */ - css_put(&victim->css); - continue; - } /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>