On Thu, 14 Apr 2011 15:54:26 -0700
Ying Han <yinghan@xxxxxxxxxx> wrote:

> After reclaiming each node per memcg, it checks mem_cgroup_watermark_ok()
> and breaks the priority loop if it returns true. The per-memcg zone will
> be marked as "unreclaimable" if the scanning rate is much greater than the
> reclaiming rate on the per-memcg LRU. The bit is cleared when there is a
> page charged to the memcg being freed. Kswapd breaks the priority loop if
> all the zones are marked as "unreclaimable".
>
> changelog v4..v3:
> 1. split off from the per-memcg background reclaim patch in V3.
>
> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
> ---
>  include/linux/memcontrol.h |   30 ++++++++++++++
>  include/linux/swap.h       |    2 +
>  mm/memcontrol.c            |   96 ++++++++++++++++++++++++++++++++++++++++++++
>  mm/vmscan.c                |   19 +++++++++
>  4 files changed, 147 insertions(+), 0 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index d4ff7f2..a8159f5 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -155,6 +155,12 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>  						gfp_t gfp_mask);
>  u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
> +void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page);
> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid);
> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
> +					unsigned long nr_scanned);
>
>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>  void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
> @@ -345,6 +351,25 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
>  {
>  }
>
> +static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem,
> +						struct zone *zone,
> +						unsigned long nr_scanned)
> +{
> +}
> +
> +static inline void mem_cgroup_clear_unreclaimable(struct page *page,
> +						struct zone *zone)
> +{
> +}
> +static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem,
> +						struct zone *zone)
> +{
> +}
> +static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
> +						struct zone *zone)
> +{
> +}
> +
>  static inline
>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>  					    gfp_t gfp_mask)
> @@ -363,6 +388,11 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head,
>  {
>  }
>
> +static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid,
> +						int zid)
> +{
> +	return false;
> +}
>  #endif /* CONFIG_CGROUP_MEM_CONT */
>
>  #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 17e0511..319b800 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -160,6 +160,8 @@ enum {
>  	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
>  };
>
> +#define ZONE_RECLAIMABLE_RATE 6
> +
>  #define SWAP_CLUSTER_MAX 32
>  #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
>
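For reference, ZONE_RECLAIMABLE_RATE matches the hard-coded "6" used by the
global zone_reclaimable() heuristic in mm/vmscan.c, and mem_cgroup_zone_reclaimable()
below applies the same ratio per memcg per zone. A minimal sketch of the
predicate (illustrative name only, not part of the patch):

==
/*
 * Sketch only: a per-memcg zone counts as "reclaimable" until the pages
 * scanned since the last reclaim exceed ZONE_RECLAIMABLE_RATE (6) times
 * the reclaimable pages left on its LRUs.
 */
static inline bool memcg_zone_scan_ratio_ok(unsigned long pages_scanned,
					    unsigned long reclaimable_pages)
{
	return pages_scanned < reclaimable_pages * ZONE_RECLAIMABLE_RATE;
}
==
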
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index e22351a..da6a130 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -133,7 +133,10 @@ struct mem_cgroup_per_zone {
>  	bool			on_tree;
>  	struct mem_cgroup	*mem;	/* Back pointer, we cannot */
>  					/* use container_of        */
> +	unsigned long		pages_scanned;		/* since last reclaim */
> +	bool			all_unreclaimable;	/* All pages pinned */
>  };
> +
>  /* Macro for accessing counter */
>  #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
>
> @@ -1135,6 +1138,96 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>  	return &mz->reclaim_stat;
>  }
>
> +static unsigned long mem_cgroup_zone_reclaimable_pages(
> +					struct mem_cgroup_per_zone *mz)
> +{
> +	int nr;
> +	nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE) +
> +		MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
> +
> +	if (nr_swap_pages > 0)
> +		nr += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) +
> +			MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
> +
> +	return nr;
> +}
> +
> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
> +						unsigned long nr_scanned)
> +{
> +	struct mem_cgroup_per_zone *mz = NULL;
> +	int nid = zone_to_nid(zone);
> +	int zid = zone_idx(zone);
> +
> +	if (!mem)
> +		return;
> +
> +	mz = mem_cgroup_zoneinfo(mem, nid, zid);
> +	if (mz)
> +		mz->pages_scanned += nr_scanned;
> +}
> +
> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid)
> +{
> +	struct mem_cgroup_per_zone *mz = NULL;
> +
> +	if (!mem)
> +		return 0;
> +
> +	mz = mem_cgroup_zoneinfo(mem, nid, zid);
> +	if (mz)
> +		return mz->pages_scanned <
> +				mem_cgroup_zone_reclaimable_pages(mz) *
> +				ZONE_RECLAIMABLE_RATE;
> +	return 0;
> +}
> +
> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
> +{
> +	struct mem_cgroup_per_zone *mz = NULL;
> +	int nid = zone_to_nid(zone);
> +	int zid = zone_idx(zone);
> +
> +	if (!mem)
> +		return false;
> +
> +	mz = mem_cgroup_zoneinfo(mem, nid, zid);
> +	if (mz)
> +		return mz->all_unreclaimable;
> +
> +	return false;
> +}
> +
> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
> +{
> +	struct mem_cgroup_per_zone *mz = NULL;
> +	int nid = zone_to_nid(zone);
> +	int zid = zone_idx(zone);
> +
> +	if (!mem)
> +		return;
> +
> +	mz = mem_cgroup_zoneinfo(mem, nid, zid);
> +	if (mz)
> +		mz->all_unreclaimable = true;
> +}
> +
> +void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page)
> +{
> +	struct mem_cgroup_per_zone *mz = NULL;
> +
> +	if (!mem)
> +		return;
> +
> +	mz = page_cgroup_zoneinfo(mem, page);
> +	if (mz) {
> +		mz->pages_scanned = 0;
> +		mz->all_unreclaimable = false;
> +	}
> +
> +	return;
> +}
> +
>  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>  					struct list_head *dst,
>  					unsigned long *scanned, int order,
> @@ -2801,6 +2894,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>  	 * special functions.
>  	 */
>
> +	mem_cgroup_clear_unreclaimable(mem, page);

Hmm, this will easily cause cache ping-pong.
(free_page() clears it after taking zone->lock ... in a batched manner.)

Could you consider a way to make this low-cost?

One way is using memcg_check_event() with some low event trigger.

A second way is using memcg_batch. In many cases, we can expect a chunk of
freed pages to come from the same zone. Then, add a new member to
memcg_batch_info as

struct memcg_batch_info {
	.....
	struct zone *zone;	/* zone a page was last uncharged from */
	...
}

Then,
==
static void
mem_cgroup_do_uncharge(struct mem_cgroup *mem, unsigned int nr_pages,
+		       struct page *page,
		       const enum charge_type ctype)
{
	struct memcg_batch_info *batch = NULL;
	.....

	if (batch->zone != page_zone(page)) {
		mem_cgroup_clear_unreclaimable(mem, page);
	}
direct_uncharge:
	mem_cgroup_clear_unreclaimable(mem, page);
	....
}
==
This will reduce overhead dramatically.
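A slightly fuller, untested sketch of that batching idea (the helper name,
the new "zone" member, and the reset in mem_cgroup_uncharge_start() are
illustrative, not from the posted patch): remember the zone of the last
uncharged page in memcg_batch_info and only touch the per-memcg flag when
the zone changes, falling back to an unconditional clear for direct
(unbatched) uncharges.

==
/*
 * Illustrative only: assumes a new "zone" member in memcg_batch_info,
 * reset to NULL in mem_cgroup_uncharge_start(), so the first free of
 * each batch still clears the bit.
 */
static void memcg_batch_clear_unreclaimable(struct mem_cgroup *mem,
					    struct page *page)
{
	struct memcg_batch_info *batch = &current->memcg_batch;

	if (!batch->do_batch || in_interrupt()) {
		/* direct uncharge: clear every time, as the patch does now */
		mem_cgroup_clear_unreclaimable(mem, page);
		return;
	}

	/* batched uncharge: clear at most once per run of same-zone pages */
	if (batch->zone != page_zone(page)) {
		mem_cgroup_clear_unreclaimable(mem, page);
		batch->zone = page_zone(page);
	}
}
==

__mem_cgroup_uncharge_common() would then call this helper instead of
calling mem_cgroup_clear_unreclaimable() unconditionally.
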
>  	unlock_page_cgroup(pc);
>  	/*
>  	 * even after unlock, we have mem->res.usage here and this memcg
> @@ -4569,6 +4663,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
>  		mz->usage_in_excess = 0;
>  		mz->on_tree = false;
>  		mz->mem = mem;
> +		mz->pages_scanned = 0;
> +		mz->all_unreclaimable = false;
>  	}
>  	return 0;
>  }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b8345d2..c081112 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1414,6 +1414,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
>  			ISOLATE_BOTH : ISOLATE_INACTIVE,
>  			zone, sc->mem_cgroup,
>  			0, file);
> +
> +		mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, nr_scanned);
> +
>  		/*
>  		 * mem_cgroup_isolate_pages() keeps track of
>  		 * scanned pages on its own.
> @@ -1533,6 +1536,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
>  		 * mem_cgroup_isolate_pages() keeps track of
>  		 * scanned pages on its own.
>  		 */
> +		mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, pgscanned);
>  	}
>
>  	reclaim_stat->recent_scanned[file] += nr_taken;
> @@ -2648,6 +2652,7 @@ static void balance_pgdat_node(pg_data_t *pgdat, int order,
>  	unsigned long total_scanned = 0;
>  	struct mem_cgroup *mem_cont = sc->mem_cgroup;
>  	int priority = sc->priority;
> +	int nid = pgdat->node_id;
>
>  	/*
>  	 * Now scan the zone in the dma->highmem direction, and we scan
> @@ -2664,10 +2669,20 @@ static void balance_pgdat_node(pg_data_t *pgdat, int order,
>  		if (!populated_zone(zone))
>  			continue;
>
> +		if (mem_cgroup_mz_unreclaimable(mem_cont, zone) &&
> +			priority != DEF_PRIORITY)
> +			continue;
> +
>  		sc->nr_scanned = 0;
>  		shrink_zone(priority, zone, sc);
>  		total_scanned += sc->nr_scanned;
>
> +		if (mem_cgroup_mz_unreclaimable(mem_cont, zone))
> +			continue;
> +
> +		if (!mem_cgroup_zone_reclaimable(mem_cont, nid, i))
> +			mem_cgroup_mz_set_unreclaimable(mem_cont, zone);
> +
>  		/*
>  		 * If we've done a decent amount of scanning and
>  		 * the reclaim ratio is low, start doing writepage
> @@ -2752,6 +2767,10 @@ loop_again:
>
>  			if (!populated_zone(zone))
>  				continue;
> +
> +			if (!mem_cgroup_mz_unreclaimable(mem_cont,
> +								zone))
> +

Ah, okay. This will work.

Thanks,
-Kame