Define a per-node red-black tree, soft_limit_toptier_tree, that sorts and tracks the cgroups by each group's excess over its toptier soft limit. A cgroup is added to the tree if it has exceeded its toptier soft limit and it has used pages on the node. Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx> --- mm/memcontrol.c | 68 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 52 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 68590f46fa76..90a78ff3fca8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -122,6 +122,7 @@ struct mem_cgroup_tree { }; static struct mem_cgroup_tree soft_limit_tree __read_mostly; +static struct mem_cgroup_tree soft_limit_toptier_tree __read_mostly; /* for OOM */ struct mem_cgroup_eventfd_list { @@ -590,17 +591,27 @@ mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) } static struct mem_cgroup_tree_per_node * -soft_limit_tree_node(int nid) -{ - return soft_limit_tree.rb_tree_per_node[nid]; +soft_limit_tree_node(int nid, enum node_states type) +{ + switch (type) { + case N_MEMORY: + return soft_limit_tree.rb_tree_per_node[nid]; + case N_TOPTIER: + if (node_state(nid, N_TOPTIER)) + return soft_limit_toptier_tree.rb_tree_per_node[nid]; + else + return NULL; + default: + return NULL; + } } static struct mem_cgroup_tree_per_node * -soft_limit_tree_from_page(struct page *page) +soft_limit_tree_from_page(struct page *page, enum node_states type) { int nid = page_to_nid(page); - return soft_limit_tree.rb_tree_per_node[nid]; + return soft_limit_tree_node(nid, type); } static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, @@ -661,12 +672,24 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, spin_unlock_irqrestore(&mctz->lock, flags); } -static unsigned long soft_limit_excess(struct mem_cgroup *memcg) +static unsigned long soft_limit_excess(struct mem_cgroup *memcg, enum node_states type) { - unsigned long nr_pages = 
page_counter_read(&memcg->memory); - unsigned long soft_limit = READ_ONCE(memcg->soft_limit); + unsigned long nr_pages; + unsigned long soft_limit; unsigned long excess = 0; + switch (type) { + case N_MEMORY: + nr_pages = page_counter_read(&memcg->memory); + soft_limit = READ_ONCE(memcg->soft_limit); + break; + case N_TOPTIER: + nr_pages = page_counter_read(&memcg->toptier); + soft_limit = READ_ONCE(memcg->toptier_soft_limit); + break; + default: + return 0; + } if (nr_pages > soft_limit) excess = nr_pages - soft_limit; @@ -679,7 +702,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; - mctz = soft_limit_tree_from_page(page); + mctz = soft_limit_tree_from_page(page, N_MEMORY); if (!mctz) return; /* @@ -688,7 +711,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { mz = mem_cgroup_page_nodeinfo(memcg, page); - excess = soft_limit_excess(memcg); + excess = soft_limit_excess(memcg, N_MEMORY); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. @@ -718,7 +741,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) for_each_node(nid) { mz = mem_cgroup_nodeinfo(memcg, nid); - mctz = soft_limit_tree_node(nid); + mctz = soft_limit_tree_node(nid, N_MEMORY); if (mctz) mem_cgroup_remove_exceeded(mz, mctz); } @@ -742,7 +765,7 @@ __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) * position in the tree. 
*/ __mem_cgroup_remove_exceeded(mz, mctz); - if (!soft_limit_excess(mz->memcg) || + if (!soft_limit_excess(mz->memcg, N_MEMORY) || !css_tryget(&mz->memcg->css)) goto retry; done: @@ -1805,7 +1828,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, .pgdat = pgdat, }; - excess = soft_limit_excess(root_memcg); + excess = soft_limit_excess(root_memcg, N_MEMORY); while (1) { victim = mem_cgroup_iter(root_memcg, victim, &reclaim); @@ -1834,7 +1857,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, total += mem_cgroup_shrink_node(victim, gfp_mask, false, pgdat, &nr_scanned); *total_scanned += nr_scanned; - if (!soft_limit_excess(root_memcg)) + if (!soft_limit_excess(root_memcg, N_MEMORY)) break; } mem_cgroup_iter_break(root_memcg, victim); @@ -3457,7 +3480,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, if (order > 0) return 0; - mctz = soft_limit_tree_node(pgdat->node_id); + mctz = soft_limit_tree_node(pgdat->node_id, N_MEMORY); /* * Do not even bother to check the largest node if the root @@ -3513,7 +3536,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, if (!reclaimed) next_mz = __mem_cgroup_largest_soft_limit_node(mctz); - excess = soft_limit_excess(mz->memcg); + excess = soft_limit_excess(mz->memcg, N_MEMORY); /* * One school of thought says that we should not add * back the node to the tree if reclaim returns 0. @@ -7189,6 +7212,19 @@ static int __init mem_cgroup_init(void) rtpn->rb_rightmost = NULL; spin_lock_init(&rtpn->lock); soft_limit_tree.rb_tree_per_node[node] = rtpn; + + if (!node_state(node, N_TOPTIER)) { + soft_limit_toptier_tree.rb_tree_per_node[node] = NULL; + continue; + } + + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, + node_online(node) ? node : NUMA_NO_NODE); + + rtpn->rb_root = RB_ROOT; + rtpn->rb_rightmost = NULL; + spin_lock_init(&rtpn->lock); + soft_limit_toptier_tree.rb_tree_per_node[node] = rtpn; } return 0; -- 2.20.1