This is the main loop of per-memcg background reclaim, which is implemented
in balance_mem_cgroup_pgdat(). The function performs a priority loop similar
to global reclaim. During each iteration it invokes balance_pgdat_node() for
all nodes on the system, which is another new function that performs
background reclaim per node. A fairness mechanism is implemented to remember
the last node it was reclaiming from and to always start at the next one (a
simplified sketch of this rotation follows the diffstat). After reclaiming
each node, it checks mem_cgroup_watermark_ok() and breaks the priority loop
if it returns true.

The per-memcg zone will be marked as "unreclaimable" if the scanning rate is
much greater than the reclaiming rate on the per-memcg LRU. The bit is
cleared when a page charged to the memcg is freed. Kswapd breaks the priority
loop if all the zones are marked as "unreclaimable".

changelog v3..v2:
1. change mz->all_unreclaimable to be boolean.
2. define ZONE_RECLAIMABLE_RATE macro shared by zone and per-memcg reclaim.
3. some more clean-up.

changelog v2..v1:
1. move the per-memcg per-zone clear_unreclaimable into the uncharge stage.
2. share kswapd_run/kswapd_stop between per-memcg and global background
   reclaim.
3. name the per-memcg kswapd as "memcg-id" (css->id), while the global kswapd
   keeps the same name.
4. fix a race on kswapd_stop where the per-memcg per-zone info could be
   accessed after freeing.
5. add fairness in the zonelist, where the memcg remembers the last zone
   reclaimed from.

Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
---
 include/linux/memcontrol.h |   33 +++++++
 include/linux/swap.h       |    2 +
 mm/memcontrol.c            |  136 +++++++++++++++++++++++++++++
 mm/vmscan.c                |  208 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 379 insertions(+), 0 deletions(-)
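(Not part of the patch: a minimal userspace sketch of the node rotation
described above. It mirrors the logic of mem_cgroup_select_victim_node()
added below, but uses a plain bitmask and a made-up MAX_NODES in place of
nodemask_t/MAX_NUMNODES so it compiles standalone; select_victim_node() and
the example mask are illustrative names only.)

#include <stdio.h>

#define MAX_NODES 8	/* illustrative; the kernel uses MAX_NUMNODES */

/*
 * Pick the next node after 'last' that is still set in 'online_mask',
 * wrapping around.  last == -1 means "nothing scanned yet", so the search
 * starts at node 0, matching the initial-stage case in the patch.
 */
static int select_victim_node(int last, unsigned int online_mask)
{
	int start = (last < 0) ? 0 : last + 1;
	int i;

	for (i = 0; i < MAX_NODES; i++) {
		int nid = (start + i) % MAX_NODES;

		if (online_mask & (1u << nid))
			return nid;
	}
	return -1;	/* empty mask: nothing left to reclaim from */
}

int main(void)
{
	unsigned int online = 0x0b;	/* nodes 0, 1 and 3 online */
	int last = -1;
	int i;

	/* prints 0, 1, 3, 0, 1, 3: each node gets a fair turn */
	for (i = 0; i < 6; i++) {
		last = select_victim_node(last, online);
		printf("reclaim from node %d\n", last);
	}
	return 0;
}

In the patch itself, balance_mem_cgroup_pgdat() additionally clears a node
from the candidate mask once all of its zones are marked unreclaimable, so
the rotation shrinks as nodes are exhausted.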
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f7ffd1f..a8159f5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -88,6 +88,9 @@ extern int mem_cgroup_init_kswapd(struct mem_cgroup *mem,
 				  struct kswapd *kswapd_p);
 extern void mem_cgroup_clear_kswapd(struct mem_cgroup *mem);
 extern wait_queue_head_t *mem_cgroup_kswapd_wait(struct mem_cgroup *mem);
+extern int mem_cgroup_last_scanned_node(struct mem_cgroup *mem);
+extern int mem_cgroup_select_victim_node(struct mem_cgroup *mem,
+					const nodemask_t *nodes);
 
 static inline int mm_match_cgroup(const struct mm_struct *mm,
 					const struct mem_cgroup *cgroup)
@@ -152,6 +155,12 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask);
 u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
+void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page);
+bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid);
+bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
+void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
+void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
+					unsigned long nr_scanned);
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail);
@@ -342,6 +351,25 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 {
 }
 
+static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem,
+						struct zone *zone,
+						unsigned long nr_scanned)
+{
+}
+
+static inline void mem_cgroup_clear_unreclaimable(struct page *page,
+						struct zone *zone)
+{
+}
+static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem,
+						struct zone *zone)
+{
+}
+static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
+						struct zone *zone)
+{
+}
+
 static inline
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 					    gfp_t gfp_mask)
@@ -360,6 +388,11 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head,
 {
 }
 
+static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid,
+						int zid)
+{
+	return false;
+}
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 17e0511..319b800 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -160,6 +160,8 @@ enum {
 	SWP_SCANNING	= (1 << 8),	/* refcount in scan_swap_map */
 };
 
+#define ZONE_RECLAIMABLE_RATE 6
+
 #define SWAP_CLUSTER_MAX 32
 #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index acd84a8..efeade3 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -133,7 +133,10 @@ struct mem_cgroup_per_zone {
 	bool			on_tree;
 	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
 						/* use container_of	   */
+	unsigned long		pages_scanned;	/* since last reclaim */
+	bool			all_unreclaimable;	/* All pages pinned */
 };
+
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
 
@@ -275,6 +278,11 @@ struct mem_cgroup {
 
 	int wmark_ratio;
 
+	/* While doing per cgroup background reclaim, we cache the
+	 * last node we reclaimed from
+	 */
+	int last_scanned_node;
+
 	wait_queue_head_t *kswapd_wait;
 };
 
@@ -1129,6 +1137,96 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
 	return &mz->reclaim_stat;
 }
 
+static unsigned long mem_cgroup_zone_reclaimable_pages(
+					struct mem_cgroup_per_zone *mz)
+{
+	int nr;
+	nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE) +
+		MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
+
+	if (nr_swap_pages > 0)
+		nr += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) +
+			MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
+
+	return nr;
+}
+
+void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
+						unsigned long nr_scanned)
+{
+	struct mem_cgroup_per_zone *mz = NULL;
+	int nid = zone_to_nid(zone);
+	int zid = zone_idx(zone);
+
+	if (!mem)
+		return;
+
+	mz = mem_cgroup_zoneinfo(mem, nid, zid);
+	if (mz)
+		mz->pages_scanned += nr_scanned;
+}
+
+bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid)
+{
+	struct mem_cgroup_per_zone *mz = NULL;
+
+	if (!mem)
+		return 0;
+
+	mz = mem_cgroup_zoneinfo(mem, nid, zid);
+	if (mz)
+		return mz->pages_scanned <
+				mem_cgroup_zone_reclaimable_pages(mz) *
+				ZONE_RECLAIMABLE_RATE;
+	return 0;
+}
+
+bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
+{
+	struct mem_cgroup_per_zone *mz = NULL;
+	int nid = zone_to_nid(zone);
+	int zid = zone_idx(zone);
+
+	if (!mem)
+		return false;
+
+	mz = mem_cgroup_zoneinfo(mem, nid, zid);
+	if (mz)
+		return mz->all_unreclaimable;
+
+	return false;
+}
+
+void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
+{
+	struct mem_cgroup_per_zone *mz = NULL;
+	int nid = zone_to_nid(zone);
+	int zid = zone_idx(zone);
+
+	if (!mem)
+		return;
+
+	mz = mem_cgroup_zoneinfo(mem, nid, zid);
+	if (mz)
+		mz->all_unreclaimable = true;
+}
+
+void mem_cgroup_clear_unreclaimable(struct mem_cgroup *mem, struct page *page)
+{
+	struct mem_cgroup_per_zone *mz = NULL;
+
+	if (!mem)
+		return;
+
+	mz = page_cgroup_zoneinfo(mem, page);
+	if (mz) {
+		mz->pages_scanned = 0;
+		mz->all_unreclaimable = false;
+	}
+
+	return;
+}
+
 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
@@ -1545,6 +1643,32 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 }
 
 /*
+ * Visit the first node after the last_scanned_node of @mem and use that to
+ * reclaim free pages from.
+ */
+int
+mem_cgroup_select_victim_node(struct mem_cgroup *mem, const nodemask_t *nodes)
+{
+	int next_nid;
+	int last_scanned;
+
+	last_scanned = mem->last_scanned_node;
+
+	/* Initial stage and start from node0 */
+	if (last_scanned == -1)
+		next_nid = 0;
+	else
+		next_nid = next_node(last_scanned, *nodes);
+
+	if (next_nid == MAX_NUMNODES)
+		next_nid = first_node(*nodes);
+
+	mem->last_scanned_node = next_nid;
+
+	return next_nid;
+}
+
+/*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
  */
@@ -2779,6 +2903,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 * special functions.
 	 */
 
+	mem_cgroup_clear_unreclaimable(mem, page);
 	unlock_page_cgroup(pc);
 	/*
 	 * even after unlock, we have mem->res.usage here and this memcg
@@ -4501,6 +4626,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 		mz->usage_in_excess = 0;
 		mz->on_tree = false;
 		mz->mem = mem;
+		mz->pages_scanned = 0;
+		mz->all_unreclaimable = false;
 	}
 	return 0;
 }
@@ -4651,6 +4778,14 @@ wait_queue_head_t *mem_cgroup_kswapd_wait(struct mem_cgroup *mem)
 	return mem->kswapd_wait;
 }
 
+int mem_cgroup_last_scanned_node(struct mem_cgroup *mem)
+{
+	if (!mem)
+		return -1;
+
+	return mem->last_scanned_node;
+}
+
 static int mem_cgroup_soft_limit_tree_init(void)
 {
 	struct mem_cgroup_tree_per_node *rtpn;
@@ -4726,6 +4861,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = -1;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a1a1211..6571eb8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -47,6 +47,8 @@
 
 #include <linux/swapops.h>
 
+#include <linux/res_counter.h>
+
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -111,6 +113,8 @@ struct scan_control {
 	 * are scanned.
 	 */
 	nodemask_t	*nodemask;
+
+	int priority;
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -1410,6 +1414,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 			ISOLATE_BOTH : ISOLATE_INACTIVE,
 			zone, sc->mem_cgroup,
 			0, file);
+
+		mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, nr_scanned);
+
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
@@ -1529,6 +1536,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
 		 */
+		mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, pgscanned);
 	}
 
 	reclaim_stat->recent_scanned[file] += nr_taken;
@@ -2632,11 +2640,211 @@ static void kswapd_try_to_sleep(struct kswapd *kswapd_p, int order,
 	finish_wait(wait_h, &wait);
 }
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+/*
+ * The function is used for per-memcg LRU. It scans all the zones of the
+ * node and returns the nr_scanned and nr_reclaimed.
+ */
+static void balance_pgdat_node(pg_data_t *pgdat, int order,
+					struct scan_control *sc)
+{
+	int i, end_zone;
+	unsigned long total_scanned;
+	struct mem_cgroup *mem_cont = sc->mem_cgroup;
+	int priority = sc->priority;
+	int nid = pgdat->node_id;
+
+	/*
+	 * Scan in the highmem->dma direction for the highest
+	 * zone which needs scanning
+	 */
+	for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (mem_cgroup_mz_unreclaimable(mem_cont, zone) &&
+				priority != DEF_PRIORITY)
+			continue;
+		/*
+		 * Do some background aging of the anon list, to give
+		 * pages a chance to be referenced before reclaiming.
+		 */
+		if (inactive_anon_is_low(zone, sc))
+			shrink_active_list(SWAP_CLUSTER_MAX, zone,
+						sc, priority, 0);
+
+		end_zone = i;
+		goto scan;
+	}
+	return;
+
+scan:
+	total_scanned = 0;
+	/*
+	 * Now scan the zone in the dma->highmem direction, stopping
+	 * at the last zone which needs scanning.
+	 *
+	 * We do this because the page allocator works in the opposite
+	 * direction.  This prevents the page allocator from allocating
+	 * pages behind kswapd's direction of progress, which would
+	 * cause too much scanning of the lower zones.
+	 */
+	for (i = 0; i <= end_zone; i++) {
+		struct zone *zone = pgdat->node_zones + i;
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (mem_cgroup_mz_unreclaimable(mem_cont, zone) &&
+				priority != DEF_PRIORITY)
+			continue;
+
+		sc->nr_scanned = 0;
+		shrink_zone(priority, zone, sc);
+		total_scanned += sc->nr_scanned;
+
+		if (mem_cgroup_mz_unreclaimable(mem_cont, zone))
+			continue;
+
+		if (!mem_cgroup_zone_reclaimable(mem_cont, nid, i))
+			mem_cgroup_mz_set_unreclaimable(mem_cont, zone);
+
+		/*
+		 * If we've done a decent amount of scanning and
+		 * the reclaim ratio is low, start doing writepage
+		 * even in laptop mode
+		 */
+		if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+		    total_scanned > sc->nr_reclaimed + sc->nr_reclaimed / 2) {
+			sc->may_writepage = 1;
+		}
+	}
+
+	sc->nr_scanned = total_scanned;
+	return;
+}
+
+/*
+ * Per cgroup background reclaim.
+ * TODO: Take off the order since memcg always do order 0
+ */
+static unsigned long balance_mem_cgroup_pgdat(struct mem_cgroup *mem_cont,
+						int order)
+{
+	int i, nid;
+	int start_node;
+	int priority;
+	bool wmark_ok;
+	int loop;
+	pg_data_t *pgdat;
+	nodemask_t do_nodes;
+	unsigned long total_scanned;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_unmap = 1,
+		.may_swap = 1,
+		.nr_to_reclaim = ULONG_MAX,
+		.swappiness = vm_swappiness,
+		.order = order,
+		.mem_cgroup = mem_cont,
+	};
+
+loop_again:
+	do_nodes = NODE_MASK_NONE;
+	sc.may_writepage = !laptop_mode;
+	sc.nr_reclaimed = 0;
+	total_scanned = 0;
+
+	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
+		sc.priority = priority;
+		wmark_ok = false;
+		loop = 0;
+
+		/* The swap token gets in the way of swapout... */
+		if (!priority)
+			disable_swap_token();
+
+		if (priority == DEF_PRIORITY)
+			do_nodes = node_states[N_ONLINE];
+
+		while (1) {
+			nid = mem_cgroup_select_victim_node(mem_cont,
+							&do_nodes);
+
+			/* Indicate we have cycled the nodelist once
+			 * TODO: we might add MAX_RECLAIM_LOOP for preventing
+			 * kswapd burning cpu cycles.
+			 */
+			if (loop == 0) {
+				start_node = nid;
+				loop++;
+			} else if (nid == start_node)
+				break;
+
+			pgdat = NODE_DATA(nid);
+			balance_pgdat_node(pgdat, order, &sc);
+			total_scanned += sc.nr_scanned;
+
+			/* Set the node which has at least
+			 * one reclaimable zone
+			 */
+			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+				struct zone *zone = pgdat->node_zones + i;
+
+				if (!populated_zone(zone))
+					continue;
+
+				if (!mem_cgroup_mz_unreclaimable(mem_cont,
+								zone))
+					break;
+			}
+			if (i < 0)
+				node_clear(nid, do_nodes);
+
+			if (mem_cgroup_watermark_ok(mem_cont,
+						CHARGE_WMARK_HIGH)) {
+				wmark_ok = true;
+				goto out;
+			}
+
+			if (nodes_empty(do_nodes)) {
+				wmark_ok = true;
+				goto out;
+			}
+		}
+
+		/* All the nodes are unreclaimable, kswapd is done */
+		if (nodes_empty(do_nodes)) {
+			wmark_ok = true;
+			goto out;
+		}
+
+		if (total_scanned && priority < DEF_PRIORITY - 2)
+			congestion_wait(WRITE, HZ/10);
+
+		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
+			break;
+	}
+out:
+	if (!wmark_ok) {
+		cond_resched();
+
+		try_to_freeze();
+
+		goto loop_again;
+	}
+
+	return sc.nr_reclaimed;
+}
+#else
 static unsigned long balance_mem_cgroup_pgdat(struct mem_cgroup *mem_cont,
 							int order)
 {
 	return 0;
 }
+#endif
 
 /*
  * The background pageout daemon, started as a kernel thread
-- 
1.7.3.1