2011/4/15 KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>:
> On Thu, 14 Apr 2011 15:54:26 -0700
> Ying Han <yinghan@xxxxxxxxxx> wrote:
>
>> After reclaiming each node per memcg, kswapd checks mem_cgroup_watermark_ok()
>> and breaks the priority loop if it returns true. The per-memcg zone will
>> be marked as "unreclaimable" if the scanning rate is much greater than the
>> reclaiming rate on the per-memcg LRU. The bit is cleared when a page charged
>> to the memcg is freed. Kswapd breaks the priority loop if all the zones are
>> marked as "unreclaimable".
>>
>> changelog v4..v3:
>> 1. split off from the per-memcg background reclaim patch in V3.
>>
>> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
>> ---
>>  include/linux/memcontrol.h |   30 ++++++++++++++
>>  include/linux/swap.h       |    2 +
>>  mm/memcontrol.c            |   96 ++++++++++++++++++++++++++++++++++++++++++++
>>  mm/vmscan.c                |   19 +++++++++
>>  4 files changed, 147 insertions(+), 0 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index d4ff7f2..a8159f5 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -155,6 +155,12 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
>>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>                                              gfp_t gfp_mask);
>>  u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
>> +void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page);
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid);
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> +                                unsigned long nr_scanned);
>>
>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>  void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
>> @@ -345,6 +351,25 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
>>  {
>>  }
>>
>> +static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem,
>> +                                               struct zone *zone,
>> +                                               unsigned long nr_scanned)
>> +{
>> +}
>> +
>> +static inline void mem_cgroup_clear_unreclaimable(struct page *page,
>> +                                                  struct zone *zone)
>> +{
>> +}
>> +static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem,
>> +                                                   struct zone *zone)
>> +{
>> +}
>> +static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
>> +                                               struct zone *zone)
>> +{
>> +}
>> +
>>  static inline
>>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>                                              gfp_t gfp_mask)
>> @@ -363,6 +388,11 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head,
>>  {
>>  }
>>
>> +static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid,
>> +                                               int zid)
>> +{
>> +    return false;
>> +}
>>  #endif /* CONFIG_CGROUP_MEM_CONT */
>>
>>  #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 17e0511..319b800 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -160,6 +160,8 @@ enum {
>>      SWP_SCANNING    = (1 << 8),     /* refcount in scan_swap_map */
>>  };
>>
>> +#define ZONE_RECLAIMABLE_RATE 6
>> +
>>  #define SWAP_CLUSTER_MAX 32
>>  #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index e22351a..da6a130 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -133,7 +133,10 @@ struct mem_cgroup_per_zone {
>>      bool                    on_tree;
>>      struct mem_cgroup       *mem;           /* Back pointer, we cannot */
>>                                              /* use container_of        */
>> +    unsigned long           pages_scanned;  /* since last reclaim */
>> +    bool                    all_unreclaimable;      /* All pages pinned */
>>  };
>> +
>>  /* Macro for accessing counter */
>>  #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
>>
>> @@ -1135,6 +1138,96 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>>      return &mz->reclaim_stat;
>>  }
>>
>> +static unsigned long mem_cgroup_zone_reclaimable_pages(
>> +                                    struct mem_cgroup_per_zone *mz)
>> +{
>> +    int nr;
>> +    nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE) +
>> +            MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
>> +
>> +    if (nr_swap_pages > 0)
>> +            nr += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) +
>> +                    MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
>> +
>> +    return nr;
>> +}
>> +
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> +                                    unsigned long nr_scanned)
>> +{
>> +    struct mem_cgroup_per_zone *mz = NULL;
>> +    int nid = zone_to_nid(zone);
>> +    int zid = zone_idx(zone);
>> +
>> +    if (!mem)
>> +            return;
>> +
>> +    mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +    if (mz)
>> +            mz->pages_scanned += nr_scanned;
>> +}
>> +
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid)
>> +{
>> +    struct mem_cgroup_per_zone *mz = NULL;
>> +
>> +    if (!mem)
>> +            return 0;
>> +
>> +    mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +    if (mz)
>> +            return mz->pages_scanned <
>> +                            mem_cgroup_zone_reclaimable_pages(mz) *
>> +                            ZONE_RECLAIMABLE_RATE;
>> +    return 0;
>> +}
>> +
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> +    struct mem_cgroup_per_zone *mz = NULL;
>> +    int nid = zone_to_nid(zone);
>> +    int zid = zone_idx(zone);
>> +
>> +    if (!mem)
>> +            return false;
>> +
>> +    mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +    if (mz)
>> +            return mz->all_unreclaimable;
>> +
>> +    return false;
>> +}
>> +
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> +    struct mem_cgroup_per_zone *mz = NULL;
>> +    int nid = zone_to_nid(zone);
>> +    int zid = zone_idx(zone);
>> +
>> +    if (!mem)
>> +            return;
>> +
>> +    mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +    if (mz)
>> +            mz->all_unreclaimable = true;
>> +}
>> +
>> +void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page)
>> +{
>> +    struct mem_cgroup_per_zone *mz = NULL;
>> +
>> +    if (!mem)
>> +            return;
>> +
>> +    mz = page_cgroup_zoneinfo(mem, page);
>> +    if (mz) {
>> +            mz->pages_scanned = 0;
>> +            mz->all_unreclaimable = false;
>> +    }
>> +
>> +    return;
>> +}
>> +
>>  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>>                                      struct list_head *dst,
>>                                      unsigned long *scanned, int order,
>> @@ -2801,6 +2894,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>>       * special functions.
>>       */
>>
>> +    mem_cgroup_clear_unreclaimable(mem, page);
>
> Hmm, this will easily cause cache ping-pong. (free_page() clears it after
> taking zone->lock... in a batched manner.)
>
> Could you consider a way to make this low cost?
>
> One way is using memcg_check_event() with some low event trigger.
> A second way is using memcg_batch.
> In many cases, we can expect a chunk of free pages to come from the same zone.
> Then, add a new member to memcg_batch_info as
>
> struct memcg_batch_info {
>       .....
>       struct zone *zone;      /* the zone a page was last uncharged from */
>       ...
> }
>
> Then,
> ==
> static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
>                                    unsigned int nr_pages,
> +                                  struct page *page,
>                                    const enum charge_type ctype)
> {
>       struct memcg_batch_info *batch = NULL;
>       .....
>
>       if (batch->zone != page_zone(page)) {
>               mem_cgroup_clear_unreclaimable(mem, page);
>       }
> direct_uncharge:
>       mem_cgroup_clear_unreclaimable(mem, page);
>       ....
> }
> ==
>
> This will reduce overhead dramatically.
>

Excuse me, but I don't quite understand this part. IMHO this is to avoid
calling mem_cgroup_clear_unreclaimable() against each single page during a
munmap()/free_pages() that frees many pages, which is unnecessary because the
zone becomes 'reclaimable' again as soon as the first page is uncharged.
Then why can't we just say:

if (mem_cgroup_zoneinfo(mem, page_to_nid(page),
                page_zonenum(page))->all_unreclaimable) {
        mem_cgroup_clear_unreclaimable(mem, page);
}
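
In other words, something like the rough sketch below (the helper name is made
up just for illustration; it would simply replace the unconditional call in
__mem_cgroup_uncharge_common()):

/*
 * Illustrative sketch only, not a tested patch: clear the per-zone state
 * only when the zone is actually flagged unreclaimable, so the common
 * uncharge path reads the flag instead of writing it for every page.
 */
static void mem_cgroup_maybe_clear_unreclaimable(struct mem_cgroup *mem,
                                                 struct page *page)
{
        struct mem_cgroup_per_zone *mz;

        if (!mem)
                return;

        mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
        if (mz && mz->all_unreclaimable)
                mem_cgroup_clear_unreclaimable(mem, page);
}

That way the hot free path only dirties the cacheline when the zone really was
marked unreclaimable.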
--
Thanks,
Zhu Yanhai

>
>
>>       unlock_page_cgroup(pc);
>>       /*
>>        * even after unlock, we have mem->res.usage here and this memcg
>> @@ -4569,6 +4663,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
>>              mz->usage_in_excess = 0;
>>              mz->on_tree = false;
>>              mz->mem = mem;
>> +            mz->pages_scanned = 0;
>> +            mz->all_unreclaimable = false;
>>      }
>>      return 0;
>>  }
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index b8345d2..c081112 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -1414,6 +1414,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
>>                                      ISOLATE_BOTH : ISOLATE_INACTIVE,
>>                      zone, sc->mem_cgroup,
>>                      0, file);
>> +
>> +            mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, nr_scanned);
>> +
>>              /*
>>               * mem_cgroup_isolate_pages() keeps track of
>>               * scanned pages on its own.
>> @@ -1533,6 +1536,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
>>               * mem_cgroup_isolate_pages() keeps track of
>>               * scanned pages on its own.
>>               */
>> +            mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, pgscanned);
>>      }
>>
>>      reclaim_stat->recent_scanned[file] += nr_taken;
>> @@ -2648,6 +2652,7 @@ static void balance_pgdat_node(pg_data_t *pgdat, int order,
>>      unsigned long total_scanned = 0;
>>      struct mem_cgroup *mem_cont = sc->mem_cgroup;
>>      int priority = sc->priority;
>> +    int nid = pgdat->node_id;
>>
>>      /*
>>       * Now scan the zone in the dma->highmem direction, and we scan
>> @@ -2664,10 +2669,20 @@ static void balance_pgdat_node(pg_data_t *pgdat, int order,
>>              if (!populated_zone(zone))
>>                      continue;
>>
>> +            if (mem_cgroup_mz_unreclaimable(mem_cont, zone) &&
>> +                            priority != DEF_PRIORITY)
>> +                    continue;
>> +
>>              sc->nr_scanned = 0;
>>              shrink_zone(priority, zone, sc);
>>              total_scanned += sc->nr_scanned;
>>
>> +            if (mem_cgroup_mz_unreclaimable(mem_cont, zone))
>> +                    continue;
>> +
>> +            if (!mem_cgroup_zone_reclaimable(mem_cont, nid, i))
>> +                    mem_cgroup_mz_set_unreclaimable(mem_cont, zone);
>> +
>>              /*
>>               * If we've done a decent amount of scanning and
>>               * the reclaim ratio is low, start doing writepage
>> @@ -2752,6 +2767,10 @@ loop_again:
>>
>>                      if (!populated_zone(zone))
>>                              continue;
>> +
>> +                    if (!mem_cgroup_mz_unreclaimable(mem_cont,
>> +                                                     zone))
>> +
>
> Ah, okay. This will work.
>
> Thanks,
> -Kame
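
P.S. Just to check my understanding of the batched variant sketched above,
this is how I read it -- the 'zone' member and the extra 'page' argument are
from that sketch, not existing code, and everything else in the function is
elided:

static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
                                   unsigned int nr_pages,
                                   struct page *page,   /* proposed new argument */
                                   const enum charge_type ctype)
{
        struct memcg_batch_info *batch = &current->memcg_batch;

        /* ... existing direct/batched uncharge logic stays unchanged ... */

        /*
         * While a chunk of pages is uncharged in one batch, touch the
         * per-zone state only when we move on to a different zone,
         * instead of once per page.
         */
        if (batch->zone != page_zone(page)) {
                batch->zone = page_zone(page);
                mem_cgroup_clear_unreclaimable(mem, page);
        }

        /* ... */
}

Is that what you meant?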