On Mon, Mar 19, 2012 at 1:27 AM, Zhu Yanhai <zhu.yanhai@xxxxxxxxx> wrote:
> 2011/4/15 KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>:
>> On Thu, 14 Apr 2011 15:54:26 -0700
>> Ying Han <yinghan@xxxxxxxxxx> wrote:
>>
>>> After reclaiming each node per memcg, it checks mem_cgroup_watermark_ok()
>>> and breaks the priority loop if it returns true. The per-memcg zone will
>>> be marked as "unreclaimable" if the scanning rate is much greater than the
>>> reclaiming rate on the per-memcg LRU. The bit is cleared when a page
>>> charged to the memcg is freed. Kswapd breaks the priority loop if all the
>>> zones are marked as "unreclaimable".
>>>
>>> changelog v4..v3:
>>> 1. split off from the per-memcg background reclaim patch in V3.
>>>
>>> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
>>> ---
>>>  include/linux/memcontrol.h |   30 ++++++++++++++
>>>  include/linux/swap.h       |    2 +
>>>  mm/memcontrol.c            |   96 ++++++++++++++++++++++++++++++++++++++++++++
>>>  mm/vmscan.c                |   19 +++++++++
>>>  4 files changed, 147 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>>> index d4ff7f2..a8159f5 100644
>>> --- a/include/linux/memcontrol.h
>>> +++ b/include/linux/memcontrol.h
>>> @@ -155,6 +155,12 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
>>>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>>  						gfp_t gfp_mask);
>>>  u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
>>> +void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page);
>>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid);
>>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>>> +					unsigned long nr_scanned);
>>>
>>>  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
>>>  void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
>>> @@ -345,6 +351,25 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
>>>  {
>>>  }
>>>
>>> +static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem,
>>> +						struct zone *zone,
>>> +						unsigned long nr_scanned)
>>> +{
>>> +}
>>> +
>>> +static inline void mem_cgroup_clear_unreclaimable(struct page *page,
>>> +						struct zone *zone)
>>> +{
>>> +}
>>> +static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem,
>>> +						struct zone *zone)
>>> +{
>>> +}
>>> +static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
>>> +							struct zone *zone)
>>> +{
>>> +}
>>> +
>>>  static inline
>>>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>>  					    gfp_t gfp_mask)
>>> @@ -363,6 +388,11 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head,
>>>  {
>>>  }
>>>
>>> +static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid,
>>> +								int zid)
>>> +{
>>> +	return false;
>>> +}
>>>  #endif /* CONFIG_CGROUP_MEM_CONT */
>>>
>>>  #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
>>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>>> index 17e0511..319b800 100644
>>> --- a/include/linux/swap.h
>>> +++ b/include/linux/swap.h
>>> @@ -160,6 +160,8 @@ enum {
>>>  	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
>>>  };
>>>
>>> +#define ZONE_RECLAIMABLE_RATE 6
>>> +
>>>  #define SWAP_CLUSTER_MAX 32
>>>  #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
>>>
>>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>>> index e22351a..da6a130 100644
>>> --- a/mm/memcontrol.c
>>> +++ b/mm/memcontrol.c
>>> @@ -133,7 +133,10 @@ struct mem_cgroup_per_zone {
>>>  	bool			on_tree;
>>>  	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
>>>  						/* use container_of        */
>>> +	unsigned long		pages_scanned;	/* since last reclaim */
>>> +	bool			all_unreclaimable;	/* All pages pinned */
>>>  };
>>> +
>>>  /* Macro for accessing counter */
>>>  #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
>>>
>>> @@ -1135,6 +1138,96 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>>>  	return &mz->reclaim_stat;
>>>  }
>>>
>>> +static unsigned long mem_cgroup_zone_reclaimable_pages(
>>> +					struct mem_cgroup_per_zone *mz)
>>> +{
>>> +	int nr;
>>> +	nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE) +
>>> +		MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
>>> +
>>> +	if (nr_swap_pages > 0)
>>> +		nr += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) +
>>> +			MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
>>> +
>>> +	return nr;
>>> +}
>>> +
>>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>>> +						unsigned long nr_scanned)
>>> +{
>>> +	struct mem_cgroup_per_zone *mz = NULL;
>>> +	int nid = zone_to_nid(zone);
>>> +	int zid = zone_idx(zone);
>>> +
>>> +	if (!mem)
>>> +		return;
>>> +
>>> +	mz = mem_cgroup_zoneinfo(mem, nid, zid);
>>> +	if (mz)
>>> +		mz->pages_scanned += nr_scanned;
>>> +}
>>> +
>>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid)
>>> +{
>>> +	struct mem_cgroup_per_zone *mz = NULL;
>>> +
>>> +	if (!mem)
>>> +		return 0;
>>> +
>>> +	mz = mem_cgroup_zoneinfo(mem, nid, zid);
>>> +	if (mz)
>>> +		return mz->pages_scanned <
>>> +				mem_cgroup_zone_reclaimable_pages(mz) *
>>> +				ZONE_RECLAIMABLE_RATE;
>>> +	return 0;
>>> +}
>>> +
>>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>>> +{
>>> +	struct mem_cgroup_per_zone *mz = NULL;
>>> +	int nid = zone_to_nid(zone);
>>> +	int zid = zone_idx(zone);
>>> +
>>> +	if (!mem)
>>> +		return false;
>>> +
>>> +	mz = mem_cgroup_zoneinfo(mem, nid, zid);
>>> +	if (mz)
>>> +		return mz->all_unreclaimable;
>>> +
>>> +	return false;
>>> +}
>>> +
>>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>>> +{
>>> +	struct mem_cgroup_per_zone *mz = NULL;
>>> +	int nid = zone_to_nid(zone);
>>> +	int zid = zone_idx(zone);
>>> +
>>> +	if (!mem)
>>> +		return;
>>> +
>>> +	mz = mem_cgroup_zoneinfo(mem, nid, zid);
>>> +	if (mz)
>>> +		mz->all_unreclaimable = true;
>>> +}
>>> +
>>> +void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page)
>>> +{
>>> +	struct mem_cgroup_per_zone *mz = NULL;
>>> +
>>> +	if (!mem)
>>> +		return;
>>> +
>>> +	mz = page_cgroup_zoneinfo(mem, page);
>>> +	if (mz) {
>>> +		mz->pages_scanned = 0;
>>> +		mz->all_unreclaimable = false;
>>> +	}
>>> +
>>> +	return;
>>> +}
>>> +
>>>  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>>>  					struct list_head *dst,
>>>  					unsigned long *scanned, int order,
>>> @@ -2801,6 +2894,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>>>  	 * special functions.
>>>  	 */
>>>
>>> +	mem_cgroup_clear_unreclaimable(mem, page);
>>
>> Hmm, this will easily cause cache ping-pong. (free_page() clears it after taking
>> zone->lock.... in a batched manner.)
>>
>> Could you consider a way to make this low-cost?
>>
>> One way is using memcg_check_event() with some low event trigger.
>> A second way is using memcg_batch.
>> In many cases, we can expect a chunk of free pages to come from the same zone.
>> Then, add a new member to batch_memcg as
>>
>> struct memcg_batch_info {
>> 	.....
>> 	struct zone *zone;	# the zone a page was last uncharged from
>> 	...
>> }
>>
>> Then,
>> ==
>> static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
>> 				   unsigned int nr_pages,
>> +				   struct page *page,
>> 				   const enum charge_type ctype)
>> {
>> 	struct memcg_batch_info *batch = NULL;
>> 	.....
>>
>> 	if (batch->zone != page_zone(page)) {
>> 		mem_cgroup_clear_unreclaimable(mem, page);
>> 	}
>> direct_uncharge:
>> 	mem_cgroup_clear_unreclaimable(mem, page);
>> 	....
>> }
>> ==
>>
>> This will reduce overhead dramatically.
>>
>
> Excuse me, but I don't quite understand this part. IMHO this is to
> avoid calling mem_cgroup_clear_unreclaimable() for each single page
> during a munmap()/free_pages() that frees many pages, which is
> unnecessary because the zone will turn 'reclaimable' again at the
> first page uncharged.
> Then why can't we just say,
> if (mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page))->all_unreclaimable) {
> 	mem_cgroup_clear_unreclaimable(mem, page);
> }

Are you suggesting to replace the batching w/ the code above?

--Ying

> --
> Thanks,
> Zhu Yanhai
>
>
>>
>>
>>>  	unlock_page_cgroup(pc);
>>>  	/*
>>>  	 * even after unlock, we have mem->res.usage here and this memcg
>>> @@ -4569,6 +4663,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
>>>  		mz->usage_in_excess = 0;
>>>  		mz->on_tree = false;
>>>  		mz->mem = mem;
>>> +		mz->pages_scanned = 0;
>>> +		mz->all_unreclaimable = false;
>>>  	}
>>>  	return 0;
>>>  }
>>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>>> index b8345d2..c081112 100644
>>> --- a/mm/vmscan.c
>>> +++ b/mm/vmscan.c
>>> @@ -1414,6 +1414,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
>>>  			ISOLATE_BOTH : ISOLATE_INACTIVE,
>>>  			zone, sc->mem_cgroup,
>>>  			0, file);
>>> +
>>> +		mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, nr_scanned);
>>> +
>>>  		/*
>>>  		 * mem_cgroup_isolate_pages() keeps track of
>>>  		 * scanned pages on its own.
>>> @@ -1533,6 +1536,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
>>>  		 * mem_cgroup_isolate_pages() keeps track of
>>>  		 * scanned pages on its own.
>>>  		 */
>>> +		mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, pgscanned);
>>>  	}
>>>
>>>  	reclaim_stat->recent_scanned[file] += nr_taken;
>>> @@ -2648,6 +2652,7 @@ static void balance_pgdat_node(pg_data_t *pgdat, int order,
>>>  	unsigned long total_scanned = 0;
>>>  	struct mem_cgroup *mem_cont = sc->mem_cgroup;
>>>  	int priority = sc->priority;
>>> +	int nid = pgdat->node_id;
>>>
>>>  	/*
>>>  	 * Now scan the zone in the dma->highmem direction, and we scan
>>> @@ -2664,10 +2669,20 @@ static void balance_pgdat_node(pg_data_t *pgdat, int order,
>>>  		if (!populated_zone(zone))
>>>  			continue;
>>>
>>> +		if (mem_cgroup_mz_unreclaimable(mem_cont, zone) &&
>>> +				priority != DEF_PRIORITY)
>>> +			continue;
>>> +
>>>  		sc->nr_scanned = 0;
>>>  		shrink_zone(priority, zone, sc);
>>>  		total_scanned += sc->nr_scanned;
>>> +
>>> +		if (mem_cgroup_mz_unreclaimable(mem_cont, zone))
>>> +			continue;
>>> +
>>> +		if (!mem_cgroup_zone_reclaimable(mem_cont, nid, i))
>>> +			mem_cgroup_mz_set_unreclaimable(mem_cont, zone);
>>> +
>>>  		/*
>>>  		 * If we've done a decent amount of scanning and
>>>  		 * the reclaim ratio is low, start doing writepage
>>> @@ -2752,6 +2767,10 @@ loop_again:
>>>
>>>  			if (!populated_zone(zone))
>>>  				continue;
>>> +
>>> +			if (!mem_cgroup_mz_unreclaimable(mem_cont,
>>> +								zone))
>>> +
>>
>> Ah, okay. This will work.
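(For readers of the thread: the sketch below only restates how the checks in
balance_pgdat_node() are meant to compose. It is an illustration, not the
literal hunks quoted above, and the helper name memcg_kswapd_shrink_zone() is
made up for the example.)

==
/*
 * Per-memcg analogue of the global zone_reclaimable() logic used by
 * kswapd: skip a zone already marked unreclaimable for this memcg
 * (except at DEF_PRIORITY, so the state gets re-checked), and after
 * shrinking mark it unreclaimable once the pages scanned since the
 * last uncharge reach ZONE_RECLAIMABLE_RATE (6) times the reclaimable
 * pages left on the per-memcg LRUs.
 */
static void memcg_kswapd_shrink_zone(pg_data_t *pgdat, int i, int priority,
				     struct scan_control *sc)
{
	struct zone *zone = pgdat->node_zones + i;
	struct mem_cgroup *mem_cont = sc->mem_cgroup;

	if (mem_cgroup_mz_unreclaimable(mem_cont, zone) &&
	    priority != DEF_PRIORITY)
		return;

	sc->nr_scanned = 0;
	shrink_zone(priority, zone, sc);

	/* already marked: no need to re-evaluate */
	if (mem_cgroup_mz_unreclaimable(mem_cont, zone))
		return;

	/* scanned >= reclaimable * ZONE_RECLAIMABLE_RATE -> give up on it */
	if (!mem_cgroup_zone_reclaimable(mem_cont, pgdat->node_id, i))
		mem_cgroup_mz_set_unreclaimable(mem_cont, zone);
}
==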
>>
>> Thanks,
>> -Kame
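To make my question above concrete, here is how I read the two directions
being discussed. This is a rough sketch only -- none of it is code from the
patch, and memcg_batch_clear_unreclaimable() is a name I made up for the
example. Option A follows Kame's batch->zone idea; Option B folds Zhu's
all_unreclaimable test into the clear path so the common case stays a read
rather than a write:

==
/*
 * Option A (per Kame's suggestion): remember the zone of the last
 * uncharged page in the per-task uncharge batch and only touch the
 * per-memcg state when the zone changes, so a big munmap()/free_pages()
 * run clears the bit once per zone instead of once per page.
 */
struct memcg_batch_info {
	/* ... existing members ... */
	struct zone *zone;		/* zone a page was last uncharged from */
};

static void memcg_batch_clear_unreclaimable(struct memcg_batch_info *batch,
					    struct mem_cgroup *mem,
					    struct page *page)
{
	if (batch->zone == page_zone(page))
		return;			/* same zone as last time: skip the write */
	batch->zone = page_zone(page);
	mem_cgroup_clear_unreclaimable(mem, page);
}

/*
 * Option B (per Zhu's suggestion, folded into the helper itself): only
 * write when there is actually something to clear, so the uncharge fast
 * path reads the shared cacheline instead of dirtying it every time.
 */
void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page)
{
	struct mem_cgroup_per_zone *mz;

	if (!mem)
		return;

	mz = page_cgroup_zoneinfo(mem, page);
	if (mz && (mz->all_unreclaimable || mz->pages_scanned)) {
		mz->pages_scanned = 0;
		mz->all_unreclaimable = false;
	}
}
==

Either way the intent is the same: avoid dirtying mem_cgroup_per_zone on
every single uncharge.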