This patch re-implements the soft_limit reclaim function so that it picks
the next memcg to reclaim from in a round-robin fashion. For each memcg, we
do hierarchical reclaim and check zone_wmark_ok() after each iteration.
There is a per-memcg rate limit on how many pages to scan, based on how far
the memcg exceeds its soft_limit.

This patch is a first-step approach to switching from RB-tree based reclaim
to linked-list based reclaim; further improvement of the per-memcg
soft_limit reclaim algorithm is needed next.

Some test results:

Test 1:
Here I have three memcgs, each doing a read of a 20g file on a 32g system
(no swap). Meanwhile, I have a program with 18g of anon pages pinned under
root. The hard_limit and soft_limit are listed as
container (hard_limit, soft_limit):

root: 18g anon pages w/o swap

A (20g, 2g):
soft_kswapd_steal 4265600
soft_kswapd_scan 4265600

B (20g, 2g):
soft_kswapd_steal 4265600
soft_kswapd_scan 4265600

C (20g, 2g):
soft_kswapd_steal 4083904
soft_kswapd_scan 4083904

vmstat:
kswapd_steal 12617255 (99.9% of kswapd_steal came from soft_limit reclaim)

These two stats show how often zone_wmark_ok is fulfilled after soft_limit
reclaim vs. per-zone reclaim:
kswapd_zone_wmark_ok 1974
kswapd_soft_limit_zone_wmark_ok 1969

Test 2:
Here the same memcgs, but each is doing a 20g file write.

root: 18g anon pages w/o swap

A (20g, 2g):
soft_kswapd_steal 4718336
soft_kswapd_scan 4718336

B (20g, 2g):
soft_kswapd_steal 4710304
soft_kswapd_scan 4710304

C (20g, 3g):
soft_kswapd_steal 2933406
soft_kswapd_scan 5797460

kswapd_steal 15958486 (77% of kswapd_steal came from soft_limit reclaim)
kswapd_zone_wmark_ok 2517
kswapd_soft_limit_zone_wmark_ok 2405

TODO:
1. We would like to do better targeted reclaim by calculating the target
nr_to_scan per-memcg, especially by combining the current heuristics with
the soft_limit excess. How much weight should the soft_limit excess get,
and do we want to make it configurable?

2. As decided at LSF, we also need a second per-zone list of memcgs that
are under their soft_limit. This is needed to do zone balancing w/o the
global LRU. We shouldn't scan the second list unless the first list is
exhausted.
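To make the rate limit concrete, here is a minimal user-space sketch. It
mirrors the "total >= (excess >> 2)" check added to
mem_cgroup_hierarchical_reclaim() below; reclaim_budget() is an
illustrative helper (assuming 4k pages and that every scanned page is
reclaimed), not part of the patch:

/*
 * Sketch of the per-memcg scan rate limit: each round-robin visit
 * scans at most a quarter of the pages by which the memcg exceeds
 * its soft_limit.
 */
#include <stdio.h>

static unsigned long reclaim_budget(unsigned long excess_pages)
{
	/*
	 * excess >> 2: large enough to make progress on this memcg,
	 * small enough that one memcg cannot monopolize a kswapd pass.
	 */
	return excess_pages >> 2;
}

int main(void)
{
	/* A memcg 1g over its soft_limit: 262144 4k pages of excess. */
	unsigned long excess = 262144;
	int visit;

	for (visit = 1; visit <= 4 && excess; visit++) {
		unsigned long budget = reclaim_budget(excess);

		printf("visit %d: scan up to %lu pages\n", visit, budget);
		excess -= budget; /* assume the whole budget is reclaimed */
	}
	return 0;
}

Successive visits get a shrinking budget (65536, 49152, 36864, ... pages
here), so a memcg far above its soft_limit is throttled back over several
round-robin passes instead of being reclaimed all at once; the real loop
below additionally stops once the zone watermark is met or *total_scanned
exceeds 1.5x nr_reclaimed.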
Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
---
 include/linux/memcontrol.h |    3 +-
 mm/memcontrol.c            |  119 ++++++++++++++++++++++++++++++++++++++++++-
 mm/vmscan.c                |   25 +++++-----
 3 files changed, 131 insertions(+), 16 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6a0cffd..c7fcb26 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -145,7 +145,8 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-						gfp_t gfp_mask,
+						gfp_t gfp_mask, int end_zone,
+						unsigned long balance_gap,
 						unsigned long *total_scanned);
 
 u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1360de6..b87ccc8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1093,6 +1093,19 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 	return MEM_CGROUP_ZSTAT(mz, lru);
 }
 
+unsigned long mem_cgroup_zone_reclaimable_pages(struct mem_cgroup_per_zone *mz)
+{
+	unsigned long total = 0;
+
+	if (nr_swap_pages) {
+		total += MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
+		total += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON);
+	}
+	total += MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
+	total += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE);
+	return total;
+}
+
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
 						      struct zone *zone)
 {
@@ -1528,7 +1541,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 			return ret;
 		total += ret;
 		if (check_soft) {
-			if (!res_counter_soft_limit_excess(&root_mem->res))
+			/*
+			 * We want to be fair to each memcg in soft_limit
+			 * reclaim, based on its excess. excess >> 2 is not so
+			 * large that we reclaim too much, nor so small that
+			 * we keep coming back to reclaim from this cgroup.
+			 */
+			if (!res_counter_soft_limit_excess(&root_mem->res) ||
+			    total >= (excess >> 2))
 				return total;
 		} else if (mem_cgroup_margin(root_mem))
 			return 1 + total;
@@ -3314,11 +3334,104 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+static struct mem_cgroup_per_zone *
+__mem_cgroup_next_soft_limit_node(struct mem_cgroup_list_per_zone *mclz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	if (list_empty(&mclz->list))
+		goto done;
+
+	mz = list_entry(mclz->list.prev, struct mem_cgroup_per_zone,
+			soft_limit_list);
+
+	__mem_cgroup_remove_exceeded(mz->mem, mz, mclz);
+	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+	    !mem_cgroup_zone_reclaimable_pages(mz) ||
+	    !css_tryget(&mz->mem->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_next_soft_limit_node(struct mem_cgroup_list_per_zone *mclz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mclz->lock);
+	mz = __mem_cgroup_next_soft_limit_node(mclz);
+	spin_unlock(&mclz->lock);
+	return mz;
+}
+
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask,
+					    gfp_t gfp_mask, int end_zone,
+					    unsigned long balance_gap,
 					    unsigned long *total_scanned)
 {
-	return 0;
+	unsigned long nr_reclaimed = 0;
+	unsigned long reclaimed;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_list_per_zone *mclz;
+	unsigned long long excess;
+	unsigned long nr_scanned;
+	int loop = 0;
+
+	/*
+	 * memcg reclaim doesn't support lumpy.
+	 */
+	if (order > 0)
+		return 0;
+
+	mclz = soft_limit_list_node_zone(zone_to_nid(zone), zone_idx(zone));
+	/*
+	 * Start from the head of the list.
+	 */
+	while (!list_empty(&mclz->list)) {
+		mz = mem_cgroup_next_soft_limit_node(mclz);
+		if (!mz)
+			break;
+
+		nr_scanned = 0;
+		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+						gfp_mask,
+						MEM_CGROUP_RECLAIM_SOFT,
+						&nr_scanned);
+		nr_reclaimed += reclaimed;
+		*total_scanned += nr_scanned;
+
+		spin_lock(&mclz->lock);
+
+		__mem_cgroup_remove_exceeded(mz->mem, mz, mclz);
+		/*
+		 * Add it back to the list even if the reclaimed count is
+		 * zero, as long as the memcg is still above its soft_limit.
+		 * It is possible that lots of pages suddenly become
+		 * reclaimable.
+		 */
+		excess = res_counter_soft_limit_excess(&mz->mem->res);
+		__mem_cgroup_insert_exceeded(mz->mem, mz, mclz, excess);
+
+		spin_unlock(&mclz->lock);
+		css_put(&mz->mem->css);
+		loop++;
+
+		if (zone_watermark_ok_safe(zone, order,
+					high_wmark_pages(zone) + balance_gap,
+					end_zone, 0)) {
+			break;
+		}
+
+		if (loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS ||
+		    *total_scanned > nr_reclaimed + nr_reclaimed / 2)
+			break;
+
+	}
+
+	return nr_reclaimed;
 }
 
 /*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 96789e0..9d79070 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2421,18 +2421,6 @@ loop_again:
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			sc.nr_scanned = 0;
-
-			nr_soft_scanned = 0;
-			/*
-			 * Call soft limit reclaim before calling shrink_zone.
-			 */
-			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
-							order, sc.gfp_mask,
-							&nr_soft_scanned);
-			sc.nr_reclaimed += nr_soft_reclaimed;
-			total_scanned += nr_soft_scanned;
-
 			/*
 			 * We put equal pressure on every zone, unless
 			 * one zone has way too many pages free
@@ -2445,6 +2433,19 @@ loop_again:
 				(zone->present_pages +
 					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
 				KSWAPD_ZONE_BALANCE_GAP_RATIO);
+			sc.nr_scanned = 0;
+
+			nr_soft_scanned = 0;
+			/*
+			 * Call soft limit reclaim before calling shrink_zone.
+			 */
+			nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
+						order, sc.gfp_mask,
+						end_zone, balance_gap,
+						&nr_soft_scanned);
+			sc.nr_reclaimed += nr_soft_reclaimed;
+			total_scanned += nr_soft_scanned;
+
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone) + balance_gap,
 					end_zone, 0))
-- 
1.7.3.1