On Wed, Jun 6, 2012 at 11:23 AM, Ying Han <yinghan@xxxxxxxxxx> wrote:
> This patch reverts all the existing softlimit reclaim implementations and
> instead integrates softlimit reclaim into the existing global reclaim logic.
>
> The new softlimit reclaim includes the following changes:
>
> 1. add function should_reclaim_mem_cgroup()
>
> Add the filter function should_reclaim_mem_cgroup() under the common function
> shrink_zone(). The latter is called both from per-memcg reclaim and from
> global reclaim.
>
> Today the softlimit takes effect only under global memory pressure. The memcgs
> get a free run above their softlimit until there is global memory contention.
> This patch doesn't change that semantic.
>
> Under global reclaim, we try to skip reclaiming from a memcg under its
> softlimit. To prevent reclaim from trying too hard on memcgs (above their
> softlimit) with only hard-to-reclaim pages, the reclaim priority is used to
> skip the softlimit check. This is a trade-off between system performance and
> resource isolation.
>
> 2. "hierarchical" softlimit reclaim
>
> This is consistent with how softlimit was previously implemented, where
> pressure is put on the whole hierarchy as long as the "root" of the hierarchy
> is over its softlimit.
>
> This part is not in my previous posts, and is quite different from my
> understanding of softlimit reclaim. After quite a lot of discussion with
> Johannes and Michal, I decided to go with it for now. It is designed to work
> with both trusted and untrusted setups.
>
> What are the trusted and untrusted setups?
>
> Case 1: The administrator is the only one setting up the limits, and he
> expects guarantees of memory under each cgroup's softlimit.
>
> Consider the following:
>
> root (soft: unlimited, use_hierarchy = 1)
> -- A (soft: unlimited, usage 22G)
>    -- A1 (soft: 10G, usage 17G)
>    -- A2 (soft: 6G, usage 5G)
> -- B (soft: 16G, usage 10G)
>
> Here A1 is above its softlimit and none of its ancestors are, so global
> reclaim will only pick A1 to reclaim from first.
>
> Case 2: An untrusted environment where cgroups change their own softlimit, or
> the administrator could make mistakes. In that case, we still want to attack
> the mis-configured child if its parent is above its softlimit.
>
> Consider the following:
>
> root (soft: unlimited, use_hierarchy = 1)
> -- A (soft: 16G, usage 22G)
>    -- A1 (soft: 10G, usage 17G)
>    -- A2 (soft: 1000G, usage 5G)
> -- B (soft: 16G, usage 10G)
>
> Here A2 sets its softlimit way higher than its parent's, but the current logic
> makes sure to still attack it when A exceeds its softlimit.
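To make the two cases above concrete: should_reclaim_mem_cgroup() effectively walks
from a memcg toward the root and reports it as reclaimable if the memcg itself or
any non-root ancestor exceeds its softlimit (the root is always reclaimable under
global pressure). Below is a minimal user-space sketch of that walk, not the kernel
implementation; the struct layout and helper names are invented for illustration only.

/*
 * Minimal user-space model of the hierarchical softlimit check described
 * above -- NOT the kernel code. Struct and function names are invented
 * for illustration only.
 */
#include <stdbool.h>
#include <stdio.h>

struct cgroup {
	const char *name;
	unsigned long long soft_limit;	/* bytes; UNLIMITED == no softlimit set */
	unsigned long long usage;	/* bytes */
	struct cgroup *parent;		/* NULL for the root cgroup */
};

/*
 * Mirrors the intent of should_reclaim_mem_cgroup(): walk up toward the
 * root and reclaim if this group or any non-root ancestor is over its
 * softlimit. The root itself is always reclaimable.
 */
static bool should_reclaim(struct cgroup *cg)
{
	if (!cg->parent)		/* root cgroup */
		return true;
	for (; cg && cg->parent; cg = cg->parent)
		if (cg->usage > cg->soft_limit)
			return true;
	return false;
}

#define G		(1ULL << 30)
#define UNLIMITED	(~0ULL)

int main(void)
{
	/*
	 * Case 2 from the changelog: A is over its softlimit, so both A1
	 * (over its own limit) and A2 (mis-configured 1000G limit) become
	 * eligible, while B stays protected.
	 */
	struct cgroup root = { "root", UNLIMITED,	  0, NULL  };
	struct cgroup A    = { "A",    16 * G,	    22 * G, &root };
	struct cgroup A1   = { "A1",   10 * G,	    17 * G, &A	  };
	struct cgroup A2   = { "A2",   1000 * G,     5 * G, &A	  };
	struct cgroup B    = { "B",    16 * G,	    10 * G, &root };
	struct cgroup *all[] = { &A, &A1, &A2, &B };

	for (unsigned int i = 0; i < sizeof(all) / sizeof(all[0]); i++)
		printf("%-3s reclaim? %s\n", all[i]->name,
		       should_reclaim(all[i]) ? "yes" : "no");
	return 0;
}

With the case 1 numbers (A soft-unlimited, A2 at 6G soft / 5G usage), the same walk
selects only A1; with the case 2 numbers shown in main(), it selects A, A1 and A2
while leaving B alone.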
> > Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> > Signed-off-by: Ying Han <yinghan@xxxxxxxxxx> > --- > include/linux/memcontrol.h | 19 +- > include/linux/swap.h | 4 - > mm/memcontrol.c | 454 +++----------------------------------------- > mm/vmscan.c | 84 ++------- > 4 files changed, 50 insertions(+), 511 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index f94efd2..7d47c7c 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -130,6 +130,8 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, > extern void mem_cgroup_replace_page_cache(struct page *oldpage, > struct page *newpage); > > +extern bool should_reclaim_mem_cgroup(struct mem_cgroup *memcg); > + > #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP > extern int do_swap_account; > #endif > @@ -185,9 +187,6 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, > mem_cgroup_update_page_stat(page, idx, -1); > } > > -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > - gfp_t gfp_mask, > - unsigned long *total_scanned); > u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); > > void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); > @@ -390,14 +389,6 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, > } > > static inline > -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > - gfp_t gfp_mask, > - unsigned long *total_scanned) > -{ > - return 0; > -} > - > -static inline > u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) > { > return 0; > @@ -415,6 +406,12 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, > struct page *newpage) > { > } > + > +bool should_reclaim_mem_cgroup(struct mem_cgroup *memcg) > +{ > + return true; > +} > + > #endif /* CONFIG_CGROUP_MEM_RES_CTLR */ > > #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) > diff --git a/include/linux/swap.h b/include/linux/swap.h > index b1fd5c7..c9e9279 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -254,10 +254,6 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, > extern int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file); > extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, > gfp_t gfp_mask, bool noswap); > -extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, > - gfp_t gfp_mask, bool noswap, > - struct zone *zone, > - unsigned long *nr_scanned); > extern unsigned long shrink_all_memory(unsigned long nr_pages); > extern int vm_swappiness; > extern int remove_mapping(struct address_space *mapping, struct page *page); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 7685d4a..2ee1532 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -35,7 +35,6 @@ > #include <linux/limits.h> > #include <linux/export.h> > #include <linux/mutex.h> > -#include <linux/rbtree.h> > #include <linux/slab.h> > #include <linux/swap.h> > #include <linux/swapops.h> > @@ -108,7 +107,6 @@ enum mem_cgroup_events_index { > */ > enum mem_cgroup_events_target { > MEM_CGROUP_TARGET_THRESH, > - MEM_CGROUP_TARGET_SOFTLIMIT, > MEM_CGROUP_TARGET_NUMAINFO, > MEM_CGROUP_NTARGETS, > }; > @@ -139,12 +137,6 @@ struct mem_cgroup_per_zone { > struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; > > struct zone_reclaim_stat reclaim_stat; > - struct rb_node tree_node; /* RB tree node */ > - unsigned long long usage_in_excess;/* Set to the value by which */ > - /* the soft 
limit is exceeded*/ > - bool on_tree; > - struct mem_cgroup *memcg; /* Back pointer, we cannot */ > - /* use container_of */ > }; > > struct mem_cgroup_per_node { > @@ -155,26 +147,6 @@ struct mem_cgroup_lru_info { > struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES]; > }; > > -/* > - * Cgroups above their limits are maintained in a RB-Tree, independent of > - * their hierarchy representation > - */ > - > -struct mem_cgroup_tree_per_zone { > - struct rb_root rb_root; > - spinlock_t lock; > -}; > - > -struct mem_cgroup_tree_per_node { > - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; > -}; > - > -struct mem_cgroup_tree { > - struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; > -}; > - > -static struct mem_cgroup_tree soft_limit_tree __read_mostly; > - > struct mem_cgroup_threshold { > struct eventfd_ctx *eventfd; > u64 threshold; > @@ -356,12 +328,7 @@ static bool move_file(void) > &mc.to->move_charge_at_immigrate); > } > > -/* > - * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft > - * limit reclaim to prevent infinite loops, if they ever occur. > - */ > #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) > -#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) > > enum charge_type { > MEM_CGROUP_CHARGE_TYPE_CACHE = 0, > @@ -394,12 +361,12 @@ enum charge_type { > static void mem_cgroup_get(struct mem_cgroup *memcg); > static void mem_cgroup_put(struct mem_cgroup *memcg); > > +static bool mem_cgroup_is_root(struct mem_cgroup *memcg); > /* Writing them here to avoid exposing memcg's inner layout */ > #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM > #include <net/sock.h> > #include <net/ip.h> > > -static bool mem_cgroup_is_root(struct mem_cgroup *memcg); > void sock_update_memcg(struct sock *sk) > { > if (mem_cgroup_sockets_enabled) { > @@ -476,164 +443,6 @@ page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) > return mem_cgroup_zoneinfo(memcg, nid, zid); > } > > -static struct mem_cgroup_tree_per_zone * > -soft_limit_tree_node_zone(int nid, int zid) > -{ > - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; > -} > - > -static struct mem_cgroup_tree_per_zone * > -soft_limit_tree_from_page(struct page *page) > -{ > - int nid = page_to_nid(page); > - int zid = page_zonenum(page); > - > - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; > -} > - > -static void > -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, > - struct mem_cgroup_per_zone *mz, > - struct mem_cgroup_tree_per_zone *mctz, > - unsigned long long new_usage_in_excess) > -{ > - struct rb_node **p = &mctz->rb_root.rb_node; > - struct rb_node *parent = NULL; > - struct mem_cgroup_per_zone *mz_node; > - > - if (mz->on_tree) > - return; > - > - mz->usage_in_excess = new_usage_in_excess; > - if (!mz->usage_in_excess) > - return; > - while (*p) { > - parent = *p; > - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, > - tree_node); > - if (mz->usage_in_excess < mz_node->usage_in_excess) > - p = &(*p)->rb_left; > - /* > - * We can't avoid mem cgroups that are over their soft > - * limit by the same amount > - */ > - else if (mz->usage_in_excess >= mz_node->usage_in_excess) > - p = &(*p)->rb_right; > - } > - rb_link_node(&mz->tree_node, parent, p); > - rb_insert_color(&mz->tree_node, &mctz->rb_root); > - mz->on_tree = true; > -} > - > -static void > -__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, > - struct mem_cgroup_per_zone *mz, > - struct mem_cgroup_tree_per_zone *mctz) > -{ > - if (!mz->on_tree) > - return; > - 
rb_erase(&mz->tree_node, &mctz->rb_root); > - mz->on_tree = false; > -} > - > -static void > -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, > - struct mem_cgroup_per_zone *mz, > - struct mem_cgroup_tree_per_zone *mctz) > -{ > - spin_lock(&mctz->lock); > - __mem_cgroup_remove_exceeded(memcg, mz, mctz); > - spin_unlock(&mctz->lock); > -} > - > - > -static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) > -{ > - unsigned long long excess; > - struct mem_cgroup_per_zone *mz; > - struct mem_cgroup_tree_per_zone *mctz; > - int nid = page_to_nid(page); > - int zid = page_zonenum(page); > - mctz = soft_limit_tree_from_page(page); > - > - /* > - * Necessary to update all ancestors when hierarchy is used. > - * because their event counter is not touched. > - */ > - for (; memcg; memcg = parent_mem_cgroup(memcg)) { > - mz = mem_cgroup_zoneinfo(memcg, nid, zid); > - excess = res_counter_soft_limit_excess(&memcg->res); > - /* > - * We have to update the tree if mz is on RB-tree or > - * mem is over its softlimit. > - */ > - if (excess || mz->on_tree) { > - spin_lock(&mctz->lock); > - /* if on-tree, remove it */ > - if (mz->on_tree) > - __mem_cgroup_remove_exceeded(memcg, mz, mctz); > - /* > - * Insert again. mz->usage_in_excess will be updated. > - * If excess is 0, no tree ops. > - */ > - __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); > - spin_unlock(&mctz->lock); > - } > - } > -} > - > -static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) > -{ > - int node, zone; > - struct mem_cgroup_per_zone *mz; > - struct mem_cgroup_tree_per_zone *mctz; > - > - for_each_node(node) { > - for (zone = 0; zone < MAX_NR_ZONES; zone++) { > - mz = mem_cgroup_zoneinfo(memcg, node, zone); > - mctz = soft_limit_tree_node_zone(node, zone); > - mem_cgroup_remove_exceeded(memcg, mz, mctz); > - } > - } > -} > - > -static struct mem_cgroup_per_zone * > -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) > -{ > - struct rb_node *rightmost = NULL; > - struct mem_cgroup_per_zone *mz; > - > -retry: > - mz = NULL; > - rightmost = rb_last(&mctz->rb_root); > - if (!rightmost) > - goto done; /* Nothing to reclaim from */ > - > - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); > - /* > - * Remove the node now but someone else can add it back, > - * we will to add it back at the end of reclaim to its correct > - * position in the tree. > - */ > - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); > - if (!res_counter_soft_limit_excess(&mz->memcg->res) || > - !css_tryget(&mz->memcg->css)) > - goto retry; > -done: > - return mz; > -} > - > -static struct mem_cgroup_per_zone * > -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) > -{ > - struct mem_cgroup_per_zone *mz; > - > - spin_lock(&mctz->lock); > - mz = __mem_cgroup_largest_soft_limit_node(mctz); > - spin_unlock(&mctz->lock); > - return mz; > -} > - > /* > * Implementation Note: reading percpu statistics for memcg. 
> * > @@ -778,9 +587,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, > case MEM_CGROUP_TARGET_THRESH: > next = val + THRESHOLDS_EVENTS_TARGET; > break; > - case MEM_CGROUP_TARGET_SOFTLIMIT: > - next = val + SOFTLIMIT_EVENTS_TARGET; > - break; > case MEM_CGROUP_TARGET_NUMAINFO: > next = val + NUMAINFO_EVENTS_TARGET; > break; > @@ -803,11 +609,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) > /* threshold event is triggered in finer grain than soft limit */ > if (unlikely(mem_cgroup_event_ratelimit(memcg, > MEM_CGROUP_TARGET_THRESH))) { > - bool do_softlimit; > bool do_numainfo __maybe_unused; > > - do_softlimit = mem_cgroup_event_ratelimit(memcg, > - MEM_CGROUP_TARGET_SOFTLIMIT); > #if MAX_NUMNODES > 1 > do_numainfo = mem_cgroup_event_ratelimit(memcg, > MEM_CGROUP_TARGET_NUMAINFO); > @@ -815,8 +618,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) > preempt_enable(); > > mem_cgroup_threshold(memcg); > - if (unlikely(do_softlimit)) > - mem_cgroup_update_tree(memcg, page); > #if MAX_NUMNODES > 1 > if (unlikely(do_numainfo)) > atomic_inc(&memcg->numainfo_events); > @@ -867,6 +668,31 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) > return memcg; > } > > +bool should_reclaim_mem_cgroup(struct mem_cgroup *memcg) > +{ > + if (mem_cgroup_disabled()) > + return true; > + > + /* > + * We treat the root cgroup special here to always reclaim pages. > + * Now root cgroup has its own lru, and the only chance to reclaim > + * pages from it is through global reclaim. note, root cgroup does > + * not trigger targeted reclaim. > + */ > + if (mem_cgroup_is_root(memcg)) > + return true; > + > + for (; memcg; memcg = parent_mem_cgroup(memcg)) { > + /* This is global reclaim, stop at root cgroup */ > + if (mem_cgroup_is_root(memcg)) > + break; > + if (res_counter_soft_limit_excess(&memcg->res)) > + return true; > + } > + > + return false; > +} > + > /** > * mem_cgroup_iter - iterate over memory cgroup hierarchy > * @root: hierarchy root > @@ -1628,106 +1454,13 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) > return node; > } > > -/* > - * Check all nodes whether it contains reclaimable pages or not. > - * For quick scan, we make use of scan_nodes. This will allow us to skip > - * unused nodes. But scan_nodes is lazily updated and may not cotain > - * enough new information. We need to do double check. > - */ > -bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) > -{ > - int nid; > - > - /* > - * quick check...making use of scan_node. > - * We can skip unused nodes. > - */ > - if (!nodes_empty(memcg->scan_nodes)) { > - for (nid = first_node(memcg->scan_nodes); > - nid < MAX_NUMNODES; > - nid = next_node(nid, memcg->scan_nodes)) { > - > - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) > - return true; > - } > - } > - /* > - * Check rest of nodes. 
> - */ > - for_each_node_state(nid, N_HIGH_MEMORY) { > - if (node_isset(nid, memcg->scan_nodes)) > - continue; > - if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) > - return true; > - } > - return false; > -} > - > #else > int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) > { > return 0; > } > - > -bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) > -{ > - return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); > -} > #endif > > -static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, > - struct zone *zone, > - gfp_t gfp_mask, > - unsigned long *total_scanned) > -{ > - struct mem_cgroup *victim = NULL; > - int total = 0; > - int loop = 0; > - unsigned long excess; > - unsigned long nr_scanned; > - struct mem_cgroup_reclaim_cookie reclaim = { > - .zone = zone, > - .priority = 0, > - }; > - > - excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; > - > - while (1) { > - victim = mem_cgroup_iter(root_memcg, victim, &reclaim); > - if (!victim) { > - loop++; > - if (loop >= 2) { > - /* > - * If we have not been able to reclaim > - * anything, it might because there are > - * no reclaimable pages under this hierarchy > - */ > - if (!total) > - break; > - /* > - * We want to do more targeted reclaim. > - * excess >> 2 is not to excessive so as to > - * reclaim too much, nor too less that we keep > - * coming back to reclaim from this cgroup > - */ > - if (total >= (excess >> 2) || > - (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) > - break; > - } > - continue; > - } > - if (!mem_cgroup_reclaimable(victim, false)) > - continue; > - total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, > - zone, &nr_scanned); > - *total_scanned += nr_scanned; > - if (!res_counter_soft_limit_excess(&root_memcg->res)) > - break; > - } > - mem_cgroup_iter_break(root_memcg, victim); > - return total; > -} > - > /* > * Check OOM-Killer is already running under our hierarchy. > * If someone is running, return false. > @@ -2539,8 +2272,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, > > /* > * "charge_statistics" updated event counter. Then, check it. > - * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. > - * if they exceeds softlimit. 
> */ > memcg_check_events(memcg, page); > } > @@ -3555,98 +3286,6 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, > return ret; > } > > -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > - gfp_t gfp_mask, > - unsigned long *total_scanned) > -{ > - unsigned long nr_reclaimed = 0; > - struct mem_cgroup_per_zone *mz, *next_mz = NULL; > - unsigned long reclaimed; > - int loop = 0; > - struct mem_cgroup_tree_per_zone *mctz; > - unsigned long long excess; > - unsigned long nr_scanned; > - > - if (order > 0) > - return 0; > - > - mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); > - /* > - * This loop can run a while, specially if mem_cgroup's continuously > - * keep exceeding their soft limit and putting the system under > - * pressure > - */ > - do { > - if (next_mz) > - mz = next_mz; > - else > - mz = mem_cgroup_largest_soft_limit_node(mctz); > - if (!mz) > - break; > - > - nr_scanned = 0; > - reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, > - gfp_mask, &nr_scanned); > - nr_reclaimed += reclaimed; > - *total_scanned += nr_scanned; > - spin_lock(&mctz->lock); > - > - /* > - * If we failed to reclaim anything from this memory cgroup > - * it is time to move on to the next cgroup > - */ > - next_mz = NULL; > - if (!reclaimed) { > - do { > - /* > - * Loop until we find yet another one. > - * > - * By the time we get the soft_limit lock > - * again, someone might have aded the > - * group back on the RB tree. Iterate to > - * make sure we get a different mem. > - * mem_cgroup_largest_soft_limit_node returns > - * NULL if no other cgroup is present on > - * the tree > - */ > - next_mz = > - __mem_cgroup_largest_soft_limit_node(mctz); > - if (next_mz == mz) > - css_put(&next_mz->memcg->css); > - else /* next_mz == NULL or other memcg */ > - break; > - } while (1); > - } > - __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); > - excess = res_counter_soft_limit_excess(&mz->memcg->res); > - /* > - * One school of thought says that we should not add > - * back the node to the tree if reclaim returns 0. > - * But our reclaim could return 0, simply because due > - * to priority we are exposing a smaller subset of > - * memory to reclaim from. Consider this as a longer > - * term TODO. > - */ > - /* If excess == 0, no tree ops */ > - __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); > - spin_unlock(&mctz->lock); > - css_put(&mz->memcg->css); > - loop++; > - /* > - * Could not reclaim anything and there are no more > - * mem cgroups to try or we seem to be looping without > - * reclaiming anything. > - */ > - if (!nr_reclaimed && > - (next_mz == NULL || > - loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) > - break; > - } while (!nr_reclaimed); > - if (next_mz) > - css_put(&next_mz->memcg->css); > - return nr_reclaimed; > -} > - > /* > * This routine traverse page_cgroup in given list and drop them all. > * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 
> @@ -4790,9 +4429,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) > mz = &pn->zoneinfo[zone]; > for_each_lru(lru) > INIT_LIST_HEAD(&mz->lruvec.lists[lru]); > - mz->usage_in_excess = 0; > - mz->on_tree = false; > - mz->memcg = memcg; > } > memcg->info.nodeinfo[node] = pn; > return 0; > @@ -4867,7 +4503,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) > { > int node; > > - mem_cgroup_remove_from_trees(memcg); > free_css_id(&mem_cgroup_subsys, &memcg->css); > > for_each_node(node) > @@ -4923,41 +4558,6 @@ static void __init enable_swap_cgroup(void) > } > #endif > > -static int mem_cgroup_soft_limit_tree_init(void) > -{ > - struct mem_cgroup_tree_per_node *rtpn; > - struct mem_cgroup_tree_per_zone *rtpz; > - int tmp, node, zone; > - > - for_each_node(node) { > - tmp = node; > - if (!node_state(node, N_NORMAL_MEMORY)) > - tmp = -1; > - rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); > - if (!rtpn) > - goto err_cleanup; > - > - soft_limit_tree.rb_tree_per_node[node] = rtpn; > - > - for (zone = 0; zone < MAX_NR_ZONES; zone++) { > - rtpz = &rtpn->rb_tree_per_zone[zone]; > - rtpz->rb_root = RB_ROOT; > - spin_lock_init(&rtpz->lock); > - } > - } > - return 0; > - > -err_cleanup: > - for_each_node(node) { > - if (!soft_limit_tree.rb_tree_per_node[node]) > - break; > - kfree(soft_limit_tree.rb_tree_per_node[node]); > - soft_limit_tree.rb_tree_per_node[node] = NULL; > - } > - return 1; > - > -} > - > static struct cgroup_subsys_state * __ref > mem_cgroup_create(struct cgroup *cont) > { > @@ -4978,8 +4578,6 @@ mem_cgroup_create(struct cgroup *cont) > int cpu; > enable_swap_cgroup(); > parent = NULL; > - if (mem_cgroup_soft_limit_tree_init()) > - goto free_out; > root_mem_cgroup = memcg; > for_each_possible_cpu(cpu) { > struct memcg_stock_pcp *stock = > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 33dc256..0560783 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -2150,7 +2150,22 @@ static void shrink_zone(int priority, struct zone *zone, > .zone = zone, > }; > > - shrink_mem_cgroup_zone(priority, &mz, sc); > + /* > + * Reclaim from mem_cgroup if any of these conditions are met: > + * - this is a targetted reclaim ( not global reclaim) > + * - reclaim priority is less than DEF_PRIORITY - 2 > + * - mem_cgroup or its ancestor ( not including root cgroup) > + * exceeds its soft limit > + * > + * Note: The priority check is a balance of how hard to > + * preserve the pages under softlimit. If the memcgs of the > + * zone having trouble to reclaim pages above their softlimit, > + * we have to reclaim under softlimit instead of burning more > + * cpu cycles. > + */ > + if (!global_reclaim(sc) || priority < DEF_PRIORITY - 2 || > + should_reclaim_mem_cgroup(memcg)) > + shrink_mem_cgroup_zone(priority, &mz, sc); > /* > * Limit reclaim has historically picked one memcg and > * scanned it with decreasing priority levels until > @@ -2231,8 +2246,6 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, > { > struct zoneref *z; > struct zone *zone; > - unsigned long nr_soft_reclaimed; > - unsigned long nr_soft_scanned; > bool aborted_reclaim = false; > > /* > @@ -2271,18 +2284,6 @@ static bool shrink_zones(int priority, struct zonelist *zonelist, > continue; > } > } > - /* > - * This steals pages from memory cgroups over softlimit > - * and returns the number of reclaimed pages and > - * scanned pages. This works for global memory pressure > - * and balancing, not for a memcg's limit. 
> - */ > - nr_soft_scanned = 0; > - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, > - sc->order, sc->gfp_mask, > - &nr_soft_scanned); > - sc->nr_reclaimed += nr_soft_reclaimed; > - sc->nr_scanned += nr_soft_scanned; > /* need some check for avoid more shrink_zone() */ > } > > @@ -2462,47 +2463,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, > > #ifdef CONFIG_CGROUP_MEM_RES_CTLR > > -unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, > - gfp_t gfp_mask, bool noswap, > - struct zone *zone, > - unsigned long *nr_scanned) > -{ > - struct scan_control sc = { > - .nr_scanned = 0, > - .nr_to_reclaim = SWAP_CLUSTER_MAX, > - .may_writepage = !laptop_mode, > - .may_unmap = 1, > - .may_swap = !noswap, > - .order = 0, > - .target_mem_cgroup = memcg, > - }; > - struct mem_cgroup_zone mz = { > - .mem_cgroup = memcg, > - .zone = zone, > - }; > - > - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | > - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); > - > - trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, > - sc.may_writepage, > - sc.gfp_mask); > - > - /* > - * NOTE: Although we can get the priority field, using it > - * here is not a good idea, since it limits the pages we can scan. > - * if we don't reclaim here, the shrink_zone from balance_pgdat > - * will pick up pages from other mem cgroup's as well. We hack > - * the priority and make it zero. > - */ > - shrink_mem_cgroup_zone(0, &mz, &sc); > - > - trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); > - > - *nr_scanned = sc.nr_scanned; > - return sc.nr_reclaimed; > -} > - > unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, > gfp_t gfp_mask, > bool noswap) > @@ -2677,8 +2637,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, > int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ > unsigned long total_scanned; > struct reclaim_state *reclaim_state = current->reclaim_state; > - unsigned long nr_soft_reclaimed; > - unsigned long nr_soft_scanned; > struct scan_control sc = { > .gfp_mask = GFP_KERNEL, > .may_unmap = 1, > @@ -2781,16 +2739,6 @@ loop_again: > > sc.nr_scanned = 0; > > - nr_soft_scanned = 0; > - /* > - * Call soft limit reclaim before calling shrink_zone. > - */ > - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, > - order, sc.gfp_mask, > - &nr_soft_scanned); > - sc.nr_reclaimed += nr_soft_reclaimed; > - total_scanned += nr_soft_scanned; > - > /* > * We put equal pressure on every zone, unless > * one zone has way too many pages free > -- > 1.7.7.3 >

The patch I have here is based on v3.4. Based on an offline conversation, I am
going to rebase it on mmotm/linux-next and post it again.

--Ying
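For reference, the gate the patch adds in shrink_zone() boils down to three
conditions: targeted (per-memcg limit) reclaim is never filtered, desperate global
reclaim (priority below DEF_PRIORITY - 2) overrides the softlimit protection, and
otherwise only memcgs flagged by should_reclaim_mem_cgroup() are scanned. A
simplified stand-alone sketch of that decision follows (not the kernel code; only
DEF_PRIORITY and the condition itself follow the patch, the context struct and
names are illustrative):

/*
 * Sketch of the gating condition the patch adds in shrink_zone() -- a
 * simplified stand-alone model, not the kernel code.
 */
#include <stdbool.h>

#define DEF_PRIORITY 12		/* as in mm/vmscan.c */

struct reclaim_ctx {
	bool global_reclaim;	/* true for global pressure, false for targeted (limit) reclaim */
	int priority;		/* DEF_PRIORITY..0, lower means more desperate */
	bool over_softlimit;	/* result of the should_reclaim_mem_cgroup() walk */
};

/*
 * A memcg's zone is scanned when any of these holds:
 *   1. this is targeted (per-memcg limit) reclaim, where softlimit never applies;
 *   2. reclaim has become desperate (priority dropped below DEF_PRIORITY - 2),
 *      so softlimit protection is overridden to make forward progress;
 *   3. the memcg or a non-root ancestor is above its softlimit.
 */
static bool should_scan_memcg(const struct reclaim_ctx *ctx)
{
	return !ctx->global_reclaim ||
	       ctx->priority < DEF_PRIORITY - 2 ||
	       ctx->over_softlimit;
}

So at priorities DEF_PRIORITY down to DEF_PRIORITY - 2, memcgs under their softlimit
are skipped; once priority drops further, everything is scanned again, which is the
performance-versus-isolation trade-off mentioned in point 1 of the changelog.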