On Tue 21-06-16 15:15:52, Mel Gorman wrote: > Memcg was broken by the move of all LRUs to nodes because it is tracking > limits on a per-zone basis while receiving reclaim requests on a per-node > basis. This might be a bit misleading/confusing. First of all, the limit is tracked per memcg, not per zone/node. What you are talking about is the tracking of soft limit excess, and that really was tracked per zone. Also, I do not think the zones->nodes LRU move should lead to any noticeable difference: all we care about is reclaiming some memory to get down to the soft limit. > This patch moves limit enforcement to the nodes. Technically, all > the variable names should also change but people are already familiar by > the meaning of "mz" even if "mn" would be a more appropriate name now. I wouldn't bother with those. > Signed-off-by: Mel Gorman <mgorman@xxxxxxxxxxxxxxxxxxx> This simplifies the code, so it is definitely welcome! I would appreciate a more precise changelog, though. Acked-by: Michal Hocko <mhocko@xxxxxxxx> > --- > include/linux/memcontrol.h | 21 ++--- > include/linux/swap.h | 2 +- > mm/memcontrol.c | 210 ++++++++++++++++++++------------------------- > mm/vmscan.c | 22 ++--- > mm/workingset.c | 6 +- > 5 files changed, 114 insertions(+), 147 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index a13328851fea..ea7e249cde6d 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -60,7 +60,7 @@ enum mem_cgroup_stat_index { > }; > > struct mem_cgroup_reclaim_cookie { > - struct zone *zone; > + pg_data_t *pgdat; > int priority; > unsigned int generation; > }; > @@ -113,7 +113,7 @@ struct mem_cgroup_reclaim_iter { > /* > * per-zone information in memory controller. 
> */ > -struct mem_cgroup_per_zone { > +struct mem_cgroup_per_node { > struct lruvec lruvec; > unsigned long lru_size[NR_LRU_LISTS]; > > @@ -127,10 +127,6 @@ struct mem_cgroup_per_zone { > /* use container_of */ > }; > > -struct mem_cgroup_per_node { > - struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; > -}; > - > struct mem_cgroup_threshold { > struct eventfd_ctx *eventfd; > unsigned long threshold; > @@ -306,8 +302,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list); > > void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); > > -struct lruvec *mem_cgroup_lruvec(struct pglist_data *, struct zone *zone, > - struct mem_cgroup *); > +struct lruvec *mem_cgroup_lruvec(struct pglist_data *, struct mem_cgroup *); > struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); > > bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg); > @@ -410,9 +405,9 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, > static inline > unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) > { > - struct mem_cgroup_per_zone *mz; > + struct mem_cgroup_per_node *mz; > > - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); > + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); > return mz->lru_size[lru]; > } > > @@ -483,7 +478,7 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, > mem_cgroup_update_page_stat(page, idx, -1); > } > > -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, > gfp_t gfp_mask, > unsigned long *total_scanned); > > @@ -575,7 +570,7 @@ static inline void mem_cgroup_migrate(struct page *old, struct page *new) > } > > static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, > - struct zone *zone, struct mem_cgroup *memcg) > + struct mem_cgroup *memcg) > { > return node_lruvec(pgdat); > } > @@ -687,7 +682,7 @@ 
static inline void mem_cgroup_dec_page_stat(struct page *page, > } > > static inline > -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, > gfp_t gfp_mask, > unsigned long *total_scanned) > { > diff --git a/include/linux/swap.h b/include/linux/swap.h > index 0ad616d7c381..2a23ddc96edd 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -318,7 +318,7 @@ extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, > bool may_swap); > extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, > gfp_t gfp_mask, bool noswap, > - struct zone *zone, > + pg_data_t *pgdat, > unsigned long *nr_scanned); > extern unsigned long shrink_all_memory(unsigned long nr_pages); > extern int vm_swappiness; > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index aac5fae56ea4..b09a17e4f2ff 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -132,15 +132,11 @@ static const char * const mem_cgroup_lru_names[] = { > * their hierarchy representation > */ > > -struct mem_cgroup_tree_per_zone { > +struct mem_cgroup_tree_per_node { > struct rb_root rb_root; > spinlock_t lock; > }; > > -struct mem_cgroup_tree_per_node { > - struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; > -}; > - > struct mem_cgroup_tree { > struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; > }; > @@ -323,13 +319,10 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); > > #endif /* !CONFIG_SLOB */ > > -static struct mem_cgroup_per_zone * > -mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) > +static struct mem_cgroup_per_node * > +mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) > { > - int nid = zone_to_nid(zone); > - int zid = zone_idx(zone); > - > - return &memcg->nodeinfo[nid]->zoneinfo[zid]; > + return memcg->nodeinfo[nid]; > } > > /** > @@ -383,37 +376,35 @@ ino_t page_cgroup_ino(struct page *page) > return ino; > } > > -static 
struct mem_cgroup_per_zone * > -mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page) > +static struct mem_cgroup_per_node * > +mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page) > { > int nid = page_to_nid(page); > - int zid = page_zonenum(page); > > - return &memcg->nodeinfo[nid]->zoneinfo[zid]; > + return memcg->nodeinfo[nid]; > } > > -static struct mem_cgroup_tree_per_zone * > -soft_limit_tree_node_zone(int nid, int zid) > +static struct mem_cgroup_tree_per_node * > +soft_limit_tree_node(int nid) > { > - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; > + return soft_limit_tree.rb_tree_per_node[nid]; > } > > -static struct mem_cgroup_tree_per_zone * > +static struct mem_cgroup_tree_per_node * > soft_limit_tree_from_page(struct page *page) > { > int nid = page_to_nid(page); > - int zid = page_zonenum(page); > > - return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; > + return soft_limit_tree.rb_tree_per_node[nid]; > } > > -static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, > - struct mem_cgroup_tree_per_zone *mctz, > +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, > + struct mem_cgroup_tree_per_node *mctz, > unsigned long new_usage_in_excess) > { > struct rb_node **p = &mctz->rb_root.rb_node; > struct rb_node *parent = NULL; > - struct mem_cgroup_per_zone *mz_node; > + struct mem_cgroup_per_node *mz_node; > > if (mz->on_tree) > return; > @@ -423,7 +414,7 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, > return; > while (*p) { > parent = *p; > - mz_node = rb_entry(parent, struct mem_cgroup_per_zone, > + mz_node = rb_entry(parent, struct mem_cgroup_per_node, > tree_node); > if (mz->usage_in_excess < mz_node->usage_in_excess) > p = &(*p)->rb_left; > @@ -439,8 +430,8 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, > mz->on_tree = true; > } > > -static void __mem_cgroup_remove_exceeded(struct 
mem_cgroup_per_zone *mz, > - struct mem_cgroup_tree_per_zone *mctz) > +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, > + struct mem_cgroup_tree_per_node *mctz) > { > if (!mz->on_tree) > return; > @@ -448,8 +439,8 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, > mz->on_tree = false; > } > > -static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, > - struct mem_cgroup_tree_per_zone *mctz) > +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, > + struct mem_cgroup_tree_per_node *mctz) > { > unsigned long flags; > > @@ -473,8 +464,8 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg) > static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) > { > unsigned long excess; > - struct mem_cgroup_per_zone *mz; > - struct mem_cgroup_tree_per_zone *mctz; > + struct mem_cgroup_per_node *mz; > + struct mem_cgroup_tree_per_node *mctz; > > mctz = soft_limit_tree_from_page(page); > /* > @@ -482,7 +473,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) > * because their event counter is not touched. 
> */ > for (; memcg; memcg = parent_mem_cgroup(memcg)) { > - mz = mem_cgroup_page_zoneinfo(memcg, page); > + mz = mem_cgroup_page_nodeinfo(memcg, page); > excess = soft_limit_excess(memcg); > /* > * We have to update the tree if mz is on RB-tree or > @@ -507,24 +498,22 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) > > static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) > { > - struct mem_cgroup_tree_per_zone *mctz; > - struct mem_cgroup_per_zone *mz; > - int nid, zid; > + struct mem_cgroup_tree_per_node *mctz; > + struct mem_cgroup_per_node *mz; > + int nid; > > for_each_node(nid) { > - for (zid = 0; zid < MAX_NR_ZONES; zid++) { > - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; > - mctz = soft_limit_tree_node_zone(nid, zid); > - mem_cgroup_remove_exceeded(mz, mctz); > - } > + mz = mem_cgroup_nodeinfo(memcg, nid); > + mctz = soft_limit_tree_node(nid); > + mem_cgroup_remove_exceeded(mz, mctz); > } > } > > -static struct mem_cgroup_per_zone * > -__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) > +static struct mem_cgroup_per_node * > +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) > { > struct rb_node *rightmost = NULL; > - struct mem_cgroup_per_zone *mz; > + struct mem_cgroup_per_node *mz; > > retry: > mz = NULL; > @@ -532,7 +521,7 @@ __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) > if (!rightmost) > goto done; /* Nothing to reclaim from */ > > - mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); > + mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node); > /* > * Remove the node now but someone else can add it back, > * we will to add it back at the end of reclaim to its correct > @@ -546,10 +535,10 @@ __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) > return mz; > } > > -static struct mem_cgroup_per_zone * > -mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) > 
+static struct mem_cgroup_per_node * > +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) > { > - struct mem_cgroup_per_zone *mz; > + struct mem_cgroup_per_node *mz; > > spin_lock_irq(&mctz->lock); > mz = __mem_cgroup_largest_soft_limit_node(mctz); > @@ -643,20 +632,16 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, > int nid, unsigned int lru_mask) > { > unsigned long nr = 0; > - int zid; > + struct mem_cgroup_per_node *mz; > + enum lru_list lru; > > VM_BUG_ON((unsigned)nid >= nr_node_ids); > > - for (zid = 0; zid < MAX_NR_ZONES; zid++) { > - struct mem_cgroup_per_zone *mz; > - enum lru_list lru; > - > - for_each_lru(lru) { > - if (!(BIT(lru) & lru_mask)) > - continue; > - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; > - nr += mz->lru_size[lru]; > - } > + for_each_lru(lru) { > + if (!(BIT(lru) & lru_mask)) > + continue; > + mz = mem_cgroup_nodeinfo(memcg, nid); > + nr += mz->lru_size[lru]; > } > return nr; > } > @@ -809,9 +794,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, > rcu_read_lock(); > > if (reclaim) { > - struct mem_cgroup_per_zone *mz; > + struct mem_cgroup_per_node *mz; > > - mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone); > + mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id); > iter = &mz->iter[reclaim->priority]; > > if (prev && reclaim->generation != iter->generation) > @@ -910,19 +895,17 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) > { > struct mem_cgroup *memcg = dead_memcg; > struct mem_cgroup_reclaim_iter *iter; > - struct mem_cgroup_per_zone *mz; > - int nid, zid; > + struct mem_cgroup_per_node *mz; > + int nid; > int i; > > while ((memcg = parent_mem_cgroup(memcg))) { > for_each_node(nid) { > - for (zid = 0; zid < MAX_NR_ZONES; zid++) { > - mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; > - for (i = 0; i <= DEF_PRIORITY; i++) { > - iter = &mz->iter[i]; > - cmpxchg(&iter->position, > - dead_memcg, NULL); > - } > + mz = mem_cgroup_nodeinfo(memcg, 
nid); > + for (i = 0; i <= DEF_PRIORITY; i++) { > + iter = &mz->iter[i]; > + cmpxchg(&iter->position, > + dead_memcg, NULL); > } > } > } > @@ -946,7 +929,6 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) > /** > * mem_cgroup_lruvec - get the lru list vector for a node or a memcg zone > * @node: node of the wanted lruvec > - * @zone: zone of the wanted lruvec > * @memcg: memcg of the wanted lruvec > * > * Returns the lru list vector holding pages for a given @node or a given > @@ -954,9 +936,9 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) > * is disabled. > */ > struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, > - struct zone *zone, struct mem_cgroup *memcg) > + struct mem_cgroup *memcg) > { > - struct mem_cgroup_per_zone *mz; > + struct mem_cgroup_per_node *mz; > struct lruvec *lruvec; > > if (mem_cgroup_disabled()) { > @@ -964,7 +946,7 @@ struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, > goto out; > } > > - mz = mem_cgroup_zone_zoneinfo(memcg, zone); > + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); > lruvec = &mz->lruvec; > out: > /* > @@ -972,8 +954,8 @@ struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, > * we have to be prepared to initialize lruvec->zone here; > * and if offlined then reonlined, we need to reinitialize it. 
> */ > - if (unlikely(lruvec->pgdat != zone->zone_pgdat)) > - lruvec->pgdat = zone->zone_pgdat; > + if (unlikely(lruvec->pgdat != pgdat)) > + lruvec->pgdat = pgdat; > return lruvec; > } > > @@ -988,7 +970,7 @@ struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat, > */ > struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) > { > - struct mem_cgroup_per_zone *mz; > + struct mem_cgroup_per_node *mz; > struct mem_cgroup *memcg; > struct lruvec *lruvec; > > @@ -1005,7 +987,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd > if (!memcg) > memcg = root_mem_cgroup; > > - mz = mem_cgroup_page_zoneinfo(memcg, page); > + mz = mem_cgroup_page_nodeinfo(memcg, page); > lruvec = &mz->lruvec; > out: > /* > @@ -1032,7 +1014,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd > void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, > enum zone_type zid, int nr_pages) > { > - struct mem_cgroup_per_zone *mz; > + struct mem_cgroup_per_node *mz; > unsigned long *lru_size; > long size; > bool empty; > @@ -1042,7 +1024,7 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, > if (mem_cgroup_disabled()) > return; > > - mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); > + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); > lru_size = mz->lru_size + lru; > empty = list_empty(lruvec->lists + lru); > > @@ -1435,7 +1417,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) > #endif > > static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, > - struct zone *zone, > + pg_data_t *pgdat, > gfp_t gfp_mask, > unsigned long *total_scanned) > { > @@ -1445,7 +1427,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, > unsigned long excess; > unsigned long nr_scanned; > struct mem_cgroup_reclaim_cookie reclaim = { > - .zone = zone, > + .pgdat = pgdat, > .priority = 0, > }; > > @@ -1476,7 
+1458,7 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, > continue; > } > total += mem_cgroup_shrink_node(victim, gfp_mask, false, > - zone, &nr_scanned); > + pgdat, &nr_scanned); > *total_scanned += nr_scanned; > if (!soft_limit_excess(root_memcg)) > break; > @@ -2603,22 +2585,22 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, > return ret; > } > > -unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > +unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, > gfp_t gfp_mask, > unsigned long *total_scanned) > { > unsigned long nr_reclaimed = 0; > - struct mem_cgroup_per_zone *mz, *next_mz = NULL; > + struct mem_cgroup_per_node *mz, *next_mz = NULL; > unsigned long reclaimed; > int loop = 0; > - struct mem_cgroup_tree_per_zone *mctz; > + struct mem_cgroup_tree_per_node *mctz; > unsigned long excess; > unsigned long nr_scanned; > > if (order > 0) > return 0; > > - mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); > + mctz = soft_limit_tree_node(pgdat->node_id); > /* > * This loop can run a while, specially if mem_cgroup's continuously > * keep exceeding their soft limit and putting the system under > @@ -2633,7 +2615,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > break; > > nr_scanned = 0; > - reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, > + reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat, > gfp_mask, &nr_scanned); > nr_reclaimed += reclaimed; > *total_scanned += nr_scanned; > @@ -3254,22 +3236,21 @@ static int memcg_stat_show(struct seq_file *m, void *v) > > #ifdef CONFIG_DEBUG_VM > { > - int nid, zid; > - struct mem_cgroup_per_zone *mz; > + pg_data_t *pgdat; > + struct mem_cgroup_per_node *mz; > struct zone_reclaim_stat *rstat; > unsigned long recent_rotated[2] = {0, 0}; > unsigned long recent_scanned[2] = {0, 0}; > > - for_each_online_node(nid) > - for (zid = 0; zid < MAX_NR_ZONES; zid++) { > - mz = 
&memcg->nodeinfo[nid]->zoneinfo[zid]; > - rstat = &mz->lruvec.reclaim_stat; > + for_each_online_pgdat(pgdat) { > + mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); > + rstat = &mz->lruvec.reclaim_stat; > > - recent_rotated[0] += rstat->recent_rotated[0]; > - recent_rotated[1] += rstat->recent_rotated[1]; > - recent_scanned[0] += rstat->recent_scanned[0]; > - recent_scanned[1] += rstat->recent_scanned[1]; > - } > + recent_rotated[0] += rstat->recent_rotated[0]; > + recent_rotated[1] += rstat->recent_rotated[1]; > + recent_scanned[0] += rstat->recent_scanned[0]; > + recent_scanned[1] += rstat->recent_scanned[1]; > + } > seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); > seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); > seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); > @@ -4095,11 +4076,10 @@ static struct cftype mem_cgroup_legacy_files[] = { > { }, /* terminate */ > }; > > -static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) > +static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) > { > struct mem_cgroup_per_node *pn; > - struct mem_cgroup_per_zone *mz; > - int zone, tmp = node; > + int tmp = node; > /* > * This routine is called against possible nodes. > * But it's BUG to call kmalloc() against offline node. 
> @@ -4114,18 +4094,16 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) > if (!pn) > return 1; > > - for (zone = 0; zone < MAX_NR_ZONES; zone++) { > - mz = &pn->zoneinfo[zone]; > - lruvec_init(&mz->lruvec); > - mz->usage_in_excess = 0; > - mz->on_tree = false; > - mz->memcg = memcg; > - } > + lruvec_init(&pn->lruvec); > + pn->usage_in_excess = 0; > + pn->on_tree = false; > + pn->memcg = memcg; > + > memcg->nodeinfo[node] = pn; > return 0; > } > > -static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) > +static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) > { > kfree(memcg->nodeinfo[node]); > } > @@ -4136,7 +4114,7 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) > > memcg_wb_domain_exit(memcg); > for_each_node(node) > - free_mem_cgroup_per_zone_info(memcg, node); > + free_mem_cgroup_per_node_info(memcg, node); > free_percpu(memcg->stat); > kfree(memcg); > } > @@ -4159,7 +4137,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) > goto fail; > > for_each_node(node) > - if (alloc_mem_cgroup_per_zone_info(memcg, node)) > + if (alloc_mem_cgroup_per_node_info(memcg, node)) > goto fail; > > if (memcg_wb_domain_init(memcg, GFP_KERNEL)) > @@ -5757,18 +5735,12 @@ static int __init mem_cgroup_init(void) > > for_each_node(node) { > struct mem_cgroup_tree_per_node *rtpn; > - int zone; > > rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, > node_online(node) ? 
node : NUMA_NO_NODE); > > - for (zone = 0; zone < MAX_NR_ZONES; zone++) { > - struct mem_cgroup_tree_per_zone *rtpz; > - > - rtpz = &rtpn->rb_tree_per_zone[zone]; > - rtpz->rb_root = RB_ROOT; > - spin_lock_init(&rtpz->lock); > - } > + rtpn->rb_root = RB_ROOT; > + spin_lock_init(&rtpn->lock); > soft_limit_tree.rb_tree_per_node[node] = rtpn; > } > > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 3774ebf19f63..cf73bf4ebd06 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -2223,8 +2223,7 @@ static inline void init_tlb_ubc(void) > static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg, > struct scan_control *sc, unsigned long *lru_pages) > { > - struct zone *zone = &pgdat->node_zones[sc->reclaim_idx]; > - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg); > + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); > unsigned long nr[NR_LRU_LISTS]; > unsigned long targets[NR_LRU_LISTS]; > unsigned long nr_to_scan; > @@ -2431,7 +2430,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc, > do { > struct mem_cgroup *root = sc->target_mem_cgroup; > struct mem_cgroup_reclaim_cookie reclaim = { > - .zone = &pgdat->node_zones[classzone_idx], > + .pgdat = pgdat, > .priority = sc->priority, > }; > unsigned long node_lru_pages = 0; > @@ -2638,7 +2637,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc, > * and balancing, not for a memcg's limit. 
> */ > nr_soft_scanned = 0; > - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, > + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat, > sc->order, sc->gfp_mask, > &nr_soft_scanned); > sc->nr_reclaimed += nr_soft_reclaimed; > @@ -2905,7 +2904,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, > > unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, > gfp_t gfp_mask, bool noswap, > - struct zone *zone, > + pg_data_t *pgdat, > unsigned long *nr_scanned) > { > struct scan_control sc = { > @@ -2913,7 +2912,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, > .target_mem_cgroup = memcg, > .may_writepage = !laptop_mode, > .may_unmap = 1, > - .reclaim_idx = zone_idx(zone), > + .reclaim_idx = MAX_NR_ZONES - 1, > .may_swap = !noswap, > }; > unsigned long lru_pages; > @@ -2932,7 +2931,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, > * will pick up pages from other mem cgroup's as well. We hack > * the priority and make it zero. 
> */ > - shrink_node_memcg(zone->zone_pgdat, memcg, &sc, &lru_pages); > + shrink_node_memcg(pgdat, memcg, &sc, &lru_pages); > > trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); > > @@ -2952,6 +2951,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, > .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), > .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | > (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), > + .reclaim_idx = MAX_NR_ZONES - 1, > .target_mem_cgroup = memcg, > .priority = DEF_PRIORITY, > .may_writepage = !laptop_mode, > @@ -2981,7 +2981,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, > #endif > > static void age_active_anon(struct pglist_data *pgdat, > - struct zone *zone, struct scan_control *sc) > + struct scan_control *sc) > { > struct mem_cgroup *memcg; > > @@ -2990,7 +2990,7 @@ static void age_active_anon(struct pglist_data *pgdat, > > memcg = mem_cgroup_iter(NULL, NULL, NULL); > do { > - struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, zone, memcg); > + struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); > > if (inactive_list_is_low(lruvec, false)) > shrink_active_list(SWAP_CLUSTER_MAX, lruvec, > @@ -3178,7 +3178,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) > * pages are rotated regardless of classzone as this is > * about consistent aging. > */ > - age_active_anon(pgdat, &pgdat->node_zones[MAX_NR_ZONES - 1], &sc); > + age_active_anon(pgdat, &sc); > > /* > * If we're getting trouble reclaiming, start doing writepage > @@ -3190,7 +3190,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) > /* Call soft limit reclaim before calling shrink_node. 
*/ > sc.nr_scanned = 0; > nr_soft_scanned = 0; > - nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, sc.order, > + nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order, > sc.gfp_mask, &nr_soft_scanned); > sc.nr_reclaimed += nr_soft_reclaimed; > > diff --git a/mm/workingset.c b/mm/workingset.c > index 2d81ca11317d..5b479890cc18 100644 > --- a/mm/workingset.c > +++ b/mm/workingset.c > @@ -218,7 +218,7 @@ void *workingset_eviction(struct address_space *mapping, struct page *page) > VM_BUG_ON_PAGE(page_count(page), page); > VM_BUG_ON_PAGE(!PageLocked(page), page); > > - lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg); > + lruvec = mem_cgroup_lruvec(zone->zone_pgdat, memcg); > eviction = atomic_long_inc_return(&lruvec->inactive_age); > return pack_shadow(memcgid, zone, eviction); > } > @@ -267,7 +267,7 @@ bool workingset_refault(void *shadow) > rcu_read_unlock(); > return false; > } > - lruvec = mem_cgroup_lruvec(zone->zone_pgdat, zone, memcg); > + lruvec = mem_cgroup_lruvec(zone->zone_pgdat, memcg); > refault = atomic_long_read(&lruvec->inactive_age); > active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); > rcu_read_unlock(); > @@ -317,7 +317,7 @@ void workingset_activation(struct page *page) > */ > if (!mem_cgroup_disabled() && !page_memcg(page)) > goto out; > - lruvec = mem_cgroup_lruvec(page_pgdat(page), page_zone(page), page_memcg(page)); > + lruvec = mem_cgroup_lruvec(page_pgdat(page), page_memcg(page)); > atomic_long_inc(&lruvec->inactive_age); > out: > unlock_page_memcg(page); > -- > 2.6.4 > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxx. For more info on Linux MM, > see: http://www.linux-mm.org/ . > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a> -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>