After reclaiming each node per memcg, it checks mem_cgroup_watermark_ok() and breaks the priority loop if it returns true. The per-memcg zone will be marked as "unreclaimable" if the scanning rate is much greater than the reclaiming rate on the per-memcg LRU. The bit is cleared when a page charged to the memcg is freed. Kswapd breaks the priority loop if all the zones are marked as "unreclaimable". changelog v8a..v7: removed the use of priority. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx> --- include/linux/memcontrol.h | 40 ++++++++++++++ include/linux/sched.h | 1 include/linux/swap.h | 2 mm/memcontrol.c | 126 +++++++++++++++++++++++++++++++++++++++++++-- mm/vmscan.c | 13 ++++ 5 files changed, 177 insertions(+), 5 deletions(-) Index: memcg/include/linux/memcontrol.h =================================================================== --- memcg.orig/include/linux/memcontrol.h +++ memcg/include/linux/memcontrol.h @@ -158,6 +158,14 @@ unsigned long mem_cgroup_soft_limit_recl unsigned long *total_scanned); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); u64 mem_cgroup_get_usage(struct mem_cgroup *mem); +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, struct zone *zone); +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone); +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone); +void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page); +void mem_cgroup_mz_clear_unreclaimable(struct mem_cgroup *mem, + struct zone *zone); +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone, + unsigned long nr_scanned); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -355,6 +363,38 @@ static inline void mem_cgroup_dec_page_s { } +static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, + struct zone *zone) +{ + return false; +} + 
+static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, + struct zone *zone) +{ + return false; +} + +static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, + struct zone *zone) +{ +} + +static inline void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, + struct page *page) +{ +} + +static inline void mem_cgroup_mz_clear_unreclaimable(struct mem_cgroup *mem, + struct zone *zone) +{ +} +static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, + struct zone *zone, + unsigned long nr_scanned) +{ +} + static inline unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, Index: memcg/include/linux/sched.h =================================================================== --- memcg.orig/include/linux/sched.h +++ memcg/include/linux/sched.h @@ -1540,6 +1540,7 @@ struct task_struct { struct mem_cgroup *memcg; /* target memcg of uncharge */ unsigned long nr_pages; /* uncharged usage */ unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ + struct zone *zone; /* a zone page is last uncharged */ } memcg_batch; #endif }; Index: memcg/include/linux/swap.h =================================================================== --- memcg.orig/include/linux/swap.h +++ memcg/include/linux/swap.h @@ -152,6 +152,8 @@ enum { SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; +#define ZONE_RECLAIMABLE_RATE 6 + #define SWAP_CLUSTER_MAX 32 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX Index: memcg/mm/memcontrol.c =================================================================== --- memcg.orig/mm/memcontrol.c +++ memcg/mm/memcontrol.c @@ -139,7 +139,10 @@ struct mem_cgroup_per_zone { bool on_tree; struct mem_cgroup *mem; /* Back pointer, we cannot */ /* use container_of */ + unsigned long pages_scanned; /* since last reclaim */ + bool all_unreclaimable; /* All pages pinned */ }; + /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -1166,12 
+1169,15 @@ int mem_cgroup_inactive_file_is_low(stru return (active > inactive); } -unsigned long mem_cgroup_zone_reclaimable_pages(struct mem_cgroup *memcg, +unsigned long mem_cgroup_zone_reclaimable_pages(struct mem_cgroup *mem, int nid, int zone_idx) { int nr; - struct mem_cgroup_per_zone *mz = - mem_cgroup_zoneinfo(memcg, nid, zone_idx); + struct mem_cgroup_per_zone *mz; + + if (!mem) + return 0; + mz = mem_cgroup_zoneinfo(mem, nid, zone_idx); nr = MEM_CGROUP_ZSTAT(mz, NR_ACTIVE_FILE) + MEM_CGROUP_ZSTAT(mz, NR_INACTIVE_FILE); @@ -1222,6 +1228,102 @@ mem_cgroup_get_reclaim_stat_from_page(st return &mz->reclaim_stat; } +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone *zone, + unsigned long nr_scanned) +{ + struct mem_cgroup_per_zone *mz = NULL; + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + + if (!mem) + return; + + mz = mem_cgroup_zoneinfo(mem, nid, zid); + if (mz) + mz->pages_scanned += nr_scanned; +} + +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, struct zone *zone) +{ + struct mem_cgroup_per_zone *mz = NULL; + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + + if (!mem) + return 0; + + mz = mem_cgroup_zoneinfo(mem, nid, zid); + + return mz->pages_scanned < + mem_cgroup_zone_reclaimable_pages(mem, nid, zid) * + ZONE_RECLAIMABLE_RATE; +} + +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone) +{ + struct mem_cgroup_per_zone *mz = NULL; + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + + if (!mem) + return false; + + mz = mem_cgroup_zoneinfo(mem, nid, zid); + if (mz) + return mz->all_unreclaimable; + + return false; +} + +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone) +{ + struct mem_cgroup_per_zone *mz = NULL; + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + + if (!mem) + return; + + mz = mem_cgroup_zoneinfo(mem, nid, zid); + if (mz) + mz->all_unreclaimable = true; +} + +void mem_cgroup_mz_clear_unreclaimable(struct mem_cgroup 
*mem, + struct zone *zone) +{ + struct mem_cgroup_per_zone *mz = NULL; + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); + + if (!mem) + return; + + mz = mem_cgroup_zoneinfo(mem, nid, zid); + if (mz) { + mz->pages_scanned = 0; + mz->all_unreclaimable = false; + } + + return; +} + +void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page) +{ + struct mem_cgroup_per_zone *mz = NULL; + + if (!mem) + return; + + mz = page_cgroup_zoneinfo(mem, page); + if (mz) { + mz->pages_scanned = 0; + mz->all_unreclaimable = false; + } + + return; +} + unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -2791,6 +2893,7 @@ void mem_cgroup_cancel_charge_swapin(str static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, unsigned int nr_pages, + struct page *page, const enum charge_type ctype) { struct memcg_batch_info *batch = NULL; @@ -2808,6 +2911,10 @@ static void mem_cgroup_do_uncharge(struc */ if (!batch->memcg) batch->memcg = mem; + + if (!batch->zone) + batch->zone = page_zone(page); + /* * do_batch > 0 when unmapping pages or inode invalidate/truncate. 
* In those cases, all pages freed continuously can be expected to be in @@ -2829,12 +2936,17 @@ static void mem_cgroup_do_uncharge(struc */ if (batch->memcg != mem) goto direct_uncharge; + + if (batch->zone != page_zone(page)) + mem_cgroup_mz_clear_unreclaimable(mem, page_zone(page)); + /* remember freed charge and uncharge it later */ batch->nr_pages++; if (uncharge_memsw) batch->memsw_nr_pages++; return; direct_uncharge: + mem_cgroup_mz_clear_unreclaimable(mem, page_zone(page)); res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); if (uncharge_memsw) res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); @@ -2916,7 +3028,7 @@ __mem_cgroup_uncharge_common(struct page mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - mem_cgroup_do_uncharge(mem, nr_pages, ctype); + mem_cgroup_do_uncharge(mem, nr_pages, page, ctype); return mem; @@ -2984,6 +3096,10 @@ void mem_cgroup_uncharge_end(void) if (batch->memsw_nr_pages) res_counter_uncharge(&batch->memcg->memsw, batch->memsw_nr_pages * PAGE_SIZE); + if (batch->zone) + mem_cgroup_mz_clear_unreclaimable(batch->memcg, batch->zone); + batch->zone = NULL; + memcg_oom_recover(batch->memcg); /* forget this pointer (for sanity check) */ batch->memcg = NULL; @@ -4659,6 +4775,8 @@ static int alloc_mem_cgroup_per_zone_inf mz->usage_in_excess = 0; mz->on_tree = false; mz->mem = mem; + mz->pages_scanned = 0; + mz->all_unreclaimable = false; } return 0; } Index: memcg/mm/vmscan.c =================================================================== --- memcg.orig/mm/vmscan.c +++ memcg/mm/vmscan.c @@ -1412,6 +1412,9 @@ shrink_inactive_list(unsigned long nr_to ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); + + mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, nr_scanned); + /* * mem_cgroup_isolate_pages() keeps track of * scanned pages on its own. @@ -1531,6 +1534,7 @@ static void shrink_active_list(unsigned * mem_cgroup_isolate_pages() keeps track of * scanned pages on its own. 
 */ + mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, pgscanned); } reclaim_stat->recent_scanned[file] += nr_taken; @@ -1998,7 +2002,8 @@ static void shrink_zones(int priority, s static bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone->pages_scanned < zone_reclaimable_pages(zone) * + ZONE_RECLAIMABLE_RATE; } /* All zones in zonelist are unreclaimable? */ @@ -2343,6 +2348,10 @@ shrink_memcg_node(int nid, int priority, scan = mem_cgroup_zone_reclaimable_pages(mem_cont, nid, i); if (!scan) continue; + /* we would like to reclaim memory from where reclaim is easy */ + if ((sc->nr_reclaimed >= total_scanned/4) && + mem_cgroup_mz_unreclaimable(mem_cont, zone)) + continue; /* If recent memory reclaim on this zone doesn't get good */ zrs = get_reclaim_stat(zone, sc); scan = zrs->recent_scanned[0] + zrs->recent_scanned[1]; @@ -2355,6 +2364,8 @@ shrink_memcg_node(int nid, int priority, shrink_zone(priority, zone, sc); total_scanned += sc->nr_scanned; sc->may_writepage = 0; + if (!mem_cgroup_zone_reclaimable(mem_cont, zone)) + mem_cgroup_mz_set_unreclaimable(mem_cont, zone); } sc->nr_scanned = total_scanned; } -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>