By default, per-memcg background reclaim is disabled while limit_in_bytes is
set to the maximum. kswapd_run() is called when the memcg is resized, and
kswapd_stop() is called when the memcg is deleted.

The per-memcg kswapd is woken up based on usage and low_wmark, which is
checked once per 1024 page events per cpu. The memcg's kswapd is woken up if
the usage is above the low_wmark. At each run, the work frees at most 2048
pages and then returns, so the workqueue can service the other memcgs in
round-robin fashion. If the memcg appears congested, a delay is added before
the work is scheduled again. (An illustrative sketch of this delayed-work
pattern is appended after the patch.)

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
 include/linux/memcontrol.h |    2 -
 mm/memcontrol.c            |   86 +++++++++++++++++++++++++++++++++++++++++++++
 mm/vmscan.c                |   23 +++++++----
 3 files changed, 102 insertions(+), 9 deletions(-)

Index: memcg/mm/memcontrol.c
===================================================================
--- memcg.orig/mm/memcontrol.c
+++ memcg/mm/memcontrol.c
@@ -111,10 +111,12 @@ enum mem_cgroup_events_index {
 enum mem_cgroup_events_target {
         MEM_CGROUP_TARGET_THRESH,
         MEM_CGROUP_TARGET_SOFTLIMIT,
+        MEM_CGROUP_WMARK_EVENTS_THRESH,
         MEM_CGROUP_NTARGETS,
 };
 #define THRESHOLDS_EVENTS_TARGET (128)
 #define SOFTLIMIT_EVENTS_TARGET (1024)
+#define WMARK_EVENTS_TARGET (1024)
 
 struct mem_cgroup_stat_cpu {
         long count[MEM_CGROUP_STAT_NSTATS];
@@ -267,6 +269,11 @@ struct mem_cgroup {
         struct list_head oom_notify;
 
         /*
+         * For high/low watermark.
+         */
+        bool bgreclaim_resched;
+        struct delayed_work bgreclaim_work;
+        /*
          * Should we move charges of a task when a task is moved into this
          * mem_cgroup ? And what type of charges should we move ?
          */
@@ -374,6 +381,8 @@ static void mem_cgroup_put(struct mem_cg
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 static void drain_all_stock_async(void);
 
+static void wake_memcg_kswapd(struct mem_cgroup *mem);
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
 {
@@ -552,6 +561,12 @@ mem_cgroup_largest_soft_limit_node(struc
         return mz;
 }
 
+static void mem_cgroup_check_wmark(struct mem_cgroup *mem)
+{
+        if (!mem_cgroup_watermark_ok(mem, CHARGE_WMARK_LOW))
+                wake_memcg_kswapd(mem);
+}
+
 /*
  * Implementation Note: reading percpu statistics for memcg.
  *
@@ -702,6 +717,9 @@ static void __mem_cgroup_target_update(s
         case MEM_CGROUP_TARGET_SOFTLIMIT:
                 next = val + SOFTLIMIT_EVENTS_TARGET;
                 break;
+        case MEM_CGROUP_WMARK_EVENTS_THRESH:
+                next = val + WMARK_EVENTS_TARGET;
+                break;
         default:
                 return;
         }
@@ -725,6 +743,10 @@ static void memcg_check_events(struct me
                         __mem_cgroup_target_update(mem,
                                 MEM_CGROUP_TARGET_SOFTLIMIT);
                 }
+                if (unlikely(__memcg_event_check(mem,
+                        MEM_CGROUP_WMARK_EVENTS_THRESH))){
+                        mem_cgroup_check_wmark(mem);
+                }
         }
 }
@@ -3661,6 +3683,67 @@ unsigned long mem_cgroup_soft_limit_recl
         return nr_reclaimed;
 }
 
+struct workqueue_struct *memcg_bgreclaimq;
+
+static int memcg_bgreclaim_init(void)
+{
+        /*
+         * use UNBOUND workqueue because we traverse nodes (no locality) and
+         * the work is cpu-intensive.
+         */
+        memcg_bgreclaimq = alloc_workqueue("memcg",
+                        WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_FREEZABLE, 0);
+        return 0;
+}
+module_init(memcg_bgreclaim_init);
+
+static void memcg_bgreclaim(struct work_struct *work)
+{
+        struct delayed_work *dw = to_delayed_work(work);
+        struct mem_cgroup *mem =
+                container_of(dw, struct mem_cgroup, bgreclaim_work);
+        int delay = 0;
+        unsigned long long required, usage, hiwat;
+
+        hiwat = res_counter_read_u64(&mem->res, RES_HIGH_WMARK_LIMIT);
+        usage = res_counter_read_u64(&mem->res, RES_USAGE);
+        required = usage - hiwat;
+        if (required >= 0) {
+                required = ((usage - hiwat) >> PAGE_SHIFT) + 1;
+                delay = shrink_mem_cgroup(mem, (long)required);
+        }
+        if (!mem->bgreclaim_resched ||
+                mem_cgroup_watermark_ok(mem, CHARGE_WMARK_HIGH)) {
+                cgroup_release_and_wakeup_rmdir(&mem->css);
+                return;
+        }
+        /* need reschedule */
+        if (!queue_delayed_work(memcg_bgreclaimq, &mem->bgreclaim_work, delay))
+                cgroup_release_and_wakeup_rmdir(&mem->css);
+}
+
+static void wake_memcg_kswapd(struct mem_cgroup *mem)
+{
+        if (delayed_work_pending(&mem->bgreclaim_work))
+                return;
+        cgroup_exclude_rmdir(&mem->css);
+        if (!queue_delayed_work(memcg_bgreclaimq, &mem->bgreclaim_work, 0))
+                cgroup_release_and_wakeup_rmdir(&mem->css);
+        return;
+}
+
+static void stop_memcg_kswapd(struct mem_cgroup *mem)
+{
+        /*
+         * at destroy(), there is no task and we don't need to take care of
+         * new bgreclaim work queued. But we need to prevent it from reschedule
+         * use bgreclaim_resched to tell no more reschedule.
+         */
+        mem->bgreclaim_resched = false;
+        flush_delayed_work(&mem->bgreclaim_work);
+        mem->bgreclaim_resched = true;
+}
+
 /*
  * This routine traverse page_cgroup in given list and drop them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -3742,6 +3825,7 @@ move_account:
         ret = -EBUSY;
         if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
                 goto out;
+        stop_memcg_kswapd(mem);
         ret = -EINTR;
         if (signal_pending(current))
                 goto out;
@@ -4804,6 +4888,8 @@ static struct mem_cgroup *mem_cgroup_all
         if (!mem->stat)
                 goto out_free;
         spin_lock_init(&mem->pcp_counter_lock);
+        INIT_DELAYED_WORK(&mem->bgreclaim_work, memcg_bgreclaim);
+        mem->bgreclaim_resched = true;
         return mem;
 
 out_free:
Index: memcg/include/linux/memcontrol.h
===================================================================
--- memcg.orig/include/linux/memcontrol.h
+++ memcg/include/linux/memcontrol.h
@@ -89,7 +89,7 @@ extern int mem_cgroup_last_scanned_node(
 extern int mem_cgroup_select_victim_node(struct mem_cgroup *mem,
                                         const nodemask_t *nodes);
 
-unsigned long shrink_mem_cgroup(struct mem_cgroup *mem);
+int shrink_mem_cgroup(struct mem_cgroup *mem, long required);
 
 static inline int mm_match_cgroup(const struct mm_struct *mm,
                                         const struct mem_cgroup *cgroup)
Index: memcg/mm/vmscan.c
===================================================================
--- memcg.orig/mm/vmscan.c
+++ memcg/mm/vmscan.c
@@ -2373,20 +2373,19 @@ shrink_memcg_node(int nid, int priority,
 /*
  * Per cgroup background reclaim.
  */
-unsigned long shrink_mem_cgroup(struct mem_cgroup *mem)
+int shrink_mem_cgroup(struct mem_cgroup *mem, long required)
 {
-        int nid, priority, next_prio;
+        int nid, priority, next_prio, delay;
         nodemask_t nodes;
         unsigned long total_scanned;
         struct scan_control sc = {
                 .gfp_mask = GFP_HIGHUSER_MOVABLE,
                 .may_unmap = 1,
                 .may_swap = 1,
-                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                 .order = 0,
                 .mem_cgroup = mem,
         };
-
+        /* writepage will be set later per zone */
         sc.may_writepage = 0;
         sc.nr_reclaimed = 0;
         total_scanned = 0;
@@ -2400,9 +2399,12 @@ unsigned long shrink_mem_cgroup(struct m
          * Now, we scan MEMCG_BGRECLAIM_SCAN_LIMIT pages per scan.
          * We use static priority 0.
          */
+        sc.nr_to_reclaim = min(required, (long)MEMCG_BGSCAN_LIMIT/2);
         next_prio = min(SWAP_CLUSTER_MAX * num_node_state(N_HIGH_MEMORY),
                         MEMCG_BGSCAN_LIMIT/8);
         priority = DEF_PRIORITY;
+        /* delay for next work at congestion */
+        delay = HZ/10;
         while ((total_scanned < MEMCG_BGSCAN_LIMIT) &&
                 !nodes_empty(nodes) &&
                 (sc.nr_to_reclaim > sc.nr_reclaimed)) {
@@ -2423,12 +2425,17 @@ unsigned long shrink_mem_cgroup(struct m
                         priority--;
                         next_prio <<= 1;
                 }
-                if (sc.nr_scanned &&
-                        total_scanned > sc.nr_reclaimed * 2)
-                        congestion_wait(WRITE, HZ/10);
+                /* give up early ? */
+                if (total_scanned > MEMCG_BGSCAN_LIMIT/8 &&
+                        total_scanned > sc.nr_reclaimed * 4)
+                        goto out;
         }
+        /* We scanned enough...If we reclaimed half of requested, no delay */
+        if (sc.nr_reclaimed > sc.nr_to_reclaim/2)
+                delay = 0;
+out:
         current->flags &= ~PF_SWAPWRITE;
-        return sc.nr_reclaimed;
+        return delay;
 }
 #endif
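
For readers not familiar with the workqueue machinery the patch builds on, below
is a minimal, self-contained sketch of the same scheduling pattern: an unbound,
freezable workqueue plus a self-rescheduling delayed work with a stop flag. It
is an illustration only, not part of the patch; the example_* names,
do_reclaim_chunk() and watermark_ok() are made-up stand-ins for the
memcg-specific pieces above.

/* Illustration only; assumed names, not part of the patch. */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/errno.h>

static struct workqueue_struct *example_bgreclaimq;
static struct delayed_work example_work;
static bool example_resched = true;

/* Stand-in for shrink_mem_cgroup(): return a delay in jiffies (0 = not congested). */
static int do_reclaim_chunk(void)
{
        return 0;
}

/* Stand-in for mem_cgroup_watermark_ok(mem, CHARGE_WMARK_HIGH). */
static bool watermark_ok(void)
{
        return true;
}

static void example_bgreclaim(struct work_struct *work)
{
        struct delayed_work *dw = to_delayed_work(work);
        int delay = do_reclaim_chunk();

        /* stop when the high watermark is reached or a stop was requested */
        if (!example_resched || watermark_ok())
                return;
        /* otherwise requeue ourselves (round robin), delayed if congested */
        queue_delayed_work(example_bgreclaimq, dw, delay);
}

static void example_wake(void)
{
        /* mirrors wake_memcg_kswapd(): cheap no-op if already pending */
        if (!delayed_work_pending(&example_work))
                queue_delayed_work(example_bgreclaimq, &example_work, 0);
}

static void example_stop(void)
{
        /* mirrors stop_memcg_kswapd(): forbid requeueing, then drain */
        example_resched = false;
        flush_delayed_work(&example_work);
}

static int __init example_init(void)
{
        example_bgreclaimq = alloc_workqueue("example_bgreclaim",
                        WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_FREEZABLE, 0);
        if (!example_bgreclaimq)
                return -ENOMEM;
        INIT_DELAYED_WORK(&example_work, example_bgreclaim);
        example_wake();
        return 0;
}

static void __exit example_exit(void)
{
        example_stop();
        destroy_workqueue(example_bgreclaimq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

The details worth noting are the WQ_UNBOUND | WQ_FREEZABLE queue (no locality,
cpu-intensive work, frozen during suspend), checking delayed_work_pending()
before queueing so the wakeup path stays cheap, and clearing the resched flag
before flush_delayed_work() so a queued run cannot re-arm itself during destroy.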