On Wed, May 25, 2011 at 10:15 PM, KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote: > > I may post this patch as stand alone, later. > == > Check memcg has reclaimable pages at select_victim(). > > Now, with help of bitmap as memcg->scan_node, we can check whether memcg has > reclaimable pages with easy test of node_empty(&mem->scan_nodes). > > mem->scan_nodes is a bitmap to show whether memcg contains reclaimable > memory or not, which is updated periodically. > > This patch makes use of scan_nodes and modify hierarchy walk at memory > shrinking in following way. > > - check scan_nodes in mem_cgroup_select_victim() > - mem_cgroup_select_victim() returns NULL if no memcg is reclaimable. > - force update of scan_nodes. > - rename mem_cgroup_select_victim() to be mem_cgroup_select_get_victim() > to show refcnt is +1. > > This will make hierarchy walk better. > > And this allows to remove mem_cgroup_local_pages() check which was used for > the same purpose. But this function was wrong because it cannot handle > information of unevictable pages and tmpfs v.s. swapless information. > > Changelog: > - added since v3. > > Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> > --- > mm/memcontrol.c | 165 +++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 110 insertions(+), 55 deletions(-) > > Index: memcg_async/mm/memcontrol.c > =================================================================== > --- memcg_async.orig/mm/memcontrol.c > +++ memcg_async/mm/memcontrol.c > @@ -584,15 +584,6 @@ static long mem_cgroup_read_stat(struct > return val; > } > > -static long mem_cgroup_local_usage(struct mem_cgroup *mem) > -{ > - long ret; > - > - ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); > - ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); > - return ret; > -} > - > static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, > bool charge) > { > @@ -1555,43 +1546,6 @@ u64 mem_cgroup_get_limit(struct mem_cgro > return min(limit, memsw); > } > > -/* > - * Visit the first child (need not be the first child as per the ordering > - * of the cgroup list, since we track last_scanned_child) of @mem and use > - * that to reclaim free pages from. > - */ > -static struct mem_cgroup * > -mem_cgroup_select_victim(struct mem_cgroup *root_mem) > -{ > - struct mem_cgroup *ret = NULL; > - struct cgroup_subsys_state *css; > - int nextid, found; > - > - if (!root_mem->use_hierarchy) { > - css_get(&root_mem->css); > - ret = root_mem; > - } > - > - while (!ret) { > - rcu_read_lock(); > - nextid = root_mem->last_scanned_child + 1; > - css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, > - &found); > - if (css && css_tryget(css)) > - ret = container_of(css, struct mem_cgroup, css); > - > - rcu_read_unlock(); > - /* Updates scanning parameter */ > - if (!css) { > - /* this means start scan from ID:1 */ > - root_mem->last_scanned_child = 0; > - } else > - root_mem->last_scanned_child = found; > - } > - > - return ret; > -} > - > #if MAX_NUMNODES > 1 > > /* > @@ -1600,11 +1554,11 @@ mem_cgroup_select_victim(struct mem_cgro > * nodes based on the zonelist. So update the list loosely once per 10 secs. > * > */ > -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) > +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force) > { > int nid; > > - if (time_after(mem->next_scan_node_update, jiffies)) > + if (!force && time_after(mem->next_scan_node_update, jiffies)) > return; > > mem->next_scan_node_update = jiffies + 10*HZ; > @@ -1641,7 +1595,7 @@ int mem_cgroup_select_victim_node(struct > { > int node; > > - mem_cgroup_may_update_nodemask(mem); > + mem_cgroup_may_update_nodemask(mem, false); > node = mem->last_scanned_node; > > node = next_node(node, mem->scan_nodes); > @@ -1660,13 +1614,117 @@ int mem_cgroup_select_victim_node(struct > return node; > } > > +/** > + * mem_cgroup_has_reclaimable > + * @mem_cgroup : the mem_cgroup > + * > + * The caller can test whether the memcg has reclaimable pages. > + * > + * This function checks memcg has reclaimable pages or not with bitmap of > + * memcg->scan_nodes. This bitmap is updated periodically and indicates > + * which node has reclaimable memcg memory or not. > + * Although this is a rough test and result is not very precise but we don't > + * have to scan all nodes and don't have to use locks. > + * > + * For non-NUMA, this cheks reclaimable pages on zones because we don't > + * update scan_nodes.(see below) > + */ > +static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg) > +{ > + return !nodes_empty(memcg->scan_nodes); > +} > + > #else > + > +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force) > +{ > +} > + > int mem_cgroup_select_victim_node(struct mem_cgroup *mem) > { > return 0; > } > + > +static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg) > +{ > + unsigned long nr; > + int zid; > + > + for (zid = NODE_DATA(0)->nr_zones - 1; zid >= 0; zid--) > + if (mem_cgroup_zone_reclaimable_pages(memcg, 0, zid)) > + break; > + if (zid < 0) > + return false; > + return true; > +} > #endif unused variable "nr". --Ying > > +/** > + * mem_cgroup_select_get_victim > + * @root_mem: the root memcg of hierarchy which should be shrinked. > + * > + * Visit children of root_mem ony by one. If the routine finds a memcg > + * which contains reclaimable pages, returns it with refcnt +1. The > + * scan is done in round-robin and 'the next start point' is saved into > + * mem->last_scanned_child. If no reclaimable memcg are found, returns NULL. > + */ > +static struct mem_cgroup * > +mem_cgroup_select_get_victim(struct mem_cgroup *root_mem) > +{ > + struct mem_cgroup *ret = NULL; > + struct cgroup_subsys_state *css; > + int nextid, found; > + bool second_visit = false; > + > + if (!root_mem->use_hierarchy) > + goto return_root; > + > + while (!ret) { > + rcu_read_lock(); > + nextid = root_mem->last_scanned_child + 1; > + css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, > + &found); > + if (css && css_tryget(css)) > + ret = container_of(css, struct mem_cgroup, css); > + > + rcu_read_unlock(); > + /* Updates scanning parameter */ > + if (!css) { /* Indicates we scanned the last node of tree */ > + /* > + * If all memcg has no reclaimable pages, we may enter > + * an infinite loop. Exit here if we reached the end > + * of hierarchy tree twice. > + */ > + if (second_visit) > + return NULL; > + /* this means start scan from ID:1 */ > + root_mem->last_scanned_child = 0; > + second_visit = true; > + } else > + root_mem->last_scanned_child = found; > + if (css && ret) { > + /* > + * check memcg has reclaimable memory or not. Update > + * information carefully if we might fail with cached > + * bitmask information. > + */ > + if (second_visit) > + mem_cgroup_may_update_nodemask(ret, true); > + > + if (!mem_cgroup_has_reclaimable(ret)) { > + css_put(css); > + ret = NULL; > + } > + } > + } > + > + return ret; > +return_root: > + css_get(&root_mem->css); > + return root_mem; > +} > + > + > /* > * Scan the hierarchy if needed to reclaim memory. We remember the last child > * we reclaimed from, so that we don't end up penalizing one child extensively > @@ -1705,7 +1763,9 @@ static int mem_cgroup_hierarchical_recla > is_kswapd = true; > > while (1) { > - victim = mem_cgroup_select_victim(root_mem); > + victim = mem_cgroup_select_get_victim(root_mem); > + if (!victim) > + return total; > if (victim == root_mem) { > loop++; > if (loop >= 1) > @@ -1733,11 +1793,6 @@ static int mem_cgroup_hierarchical_recla > } > } > } > - if (!mem_cgroup_local_usage(victim)) { > - /* this cgroup's local usage == 0 */ > - css_put(&victim->css); > - continue; > - } > /* we use swappiness of local cgroup */ > if (check_soft) { > ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, > > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ Don't email: <a href