== This patch adds a function to calculate the reclaim/scan ratio, based on the recent scans. This will be shown by the memory.reclaim_stat interface in a later patch. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> --- include/linux/swap.h | 8 +- mm/memcontrol.c | 137 +++++++++++++++++++++++++++++++++++++++++++++++---- mm/vmscan.c | 9 ++- 3 files changed, 138 insertions(+), 16 deletions(-) Index: memcg_async/mm/memcontrol.c =================================================================== --- memcg_async.orig/mm/memcontrol.c +++ memcg_async/mm/memcontrol.c @@ -73,7 +73,6 @@ static int really_do_swap_account __init #define do_swap_account (0) #endif - /* * Statistics for memory cgroup. */ @@ -215,6 +214,7 @@ static void mem_cgroup_oom_notify(struct static void mem_cgroup_reset_margin_to_limit(struct mem_cgroup *mem); static void mem_cgroup_update_margin_to_limit(struct mem_cgroup *mem); static void mem_cgroup_may_async_reclaim(struct mem_cgroup *mem); +static void mem_cgroup_reflesh_scan_ratio(struct mem_cgroup *mem); /* * The memory controller data structure. The memory controller controls both @@ -294,6 +294,12 @@ struct mem_cgroup { #define FAILED_TO_KEEP_MARGIN (1) /* someone hit limit */ #define ASYNC_WORKER_RUNNING (2) /* a worker runs */ #define ASYNC_WORKER_SHOULD_STOP (3) /* worker thread should stop */ + + /* For calculating scan success ratio */ + spinlock_t scan_stat_lock; + unsigned long scanned; + unsigned long reclaimed; + unsigned long next_scanratio_update; /* * percpu counter. 
*/ @@ -758,6 +764,7 @@ static void memcg_check_events(struct me } /* update margin-to-limit and run async reclaim if necessary */ if (__memcg_event_check(mem, MEM_CGROUP_TARGET_KEEP_MARGIN)) { + mem_cgroup_reflesh_scan_ratio(mem); mem_cgroup_may_async_reclaim(mem); __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_KEEP_MARGIN); @@ -1417,6 +1424,96 @@ unsigned int mem_cgroup_swappiness(struc return memcg->swappiness; } +static void __mem_cgroup_update_scan_ratio(struct mem_cgroup *mem, + unsigned long scanned, + unsigned long reclaimed) +{ + unsigned long limit; + + limit = res_counter_read_u64(&mem->res, RES_LIMIT) >> PAGE_SHIFT; + spin_lock(&mem->scan_stat_lock); + mem->scanned += scanned; + mem->reclaimed += reclaimed; + /* avoid overflow */ + if (mem->scanned > limit) { + mem->scanned /= 2; + mem->reclaimed /= 2; + } + spin_unlock(&mem->scan_stat_lock); +} + +/** + * mem_cgroup_update_scan_ratio + * @memcg: the memcg + * @root : root memcg of hierarchy walk. + * @scanned : scanned pages + * @reclaimed: reclaimed pages. + * + * record scan/reclaim ratio to the memcg both to a child and it's root + * mem cgroup, which is a reclaim target. This value is used for + * detect congestion and for determining sleep time at memory reclaim. + */ + +static void mem_cgroup_update_scan_ratio(struct mem_cgroup *mem, + struct mem_cgroup *root, + unsigned long scanned, + unsigned long reclaimed) +{ + __mem_cgroup_update_scan_ratio(mem, scanned, reclaimed); + if (mem != root) + __mem_cgroup_update_scan_ratio(root, scanned, reclaimed); + +} + +/* + * Workload can be changed over time. This routine is for forgetting old + * information to some extent. This is triggered by event counter i.e. + * some amounts of pagein/pageout events and rate limited once per 1 min. + * + * By this, recent 1min information will be twice informative than old + * information. 
+ */ +static void mem_cgroup_reflesh_scan_ratio(struct mem_cgroup *mem) +{ + struct cgroup *parent; + /* Update all parent's information if they are old */ + while (1) { + if (time_after(mem->next_scanratio_update, jiffies)) + break; + mem->next_scanratio_update = jiffies + HZ*60; + spin_lock(&mem->scan_stat_lock); + mem->scanned /= 2; + mem->reclaimed /= 2; + spin_unlock(&mem->scan_stat_lock); + if (!mem->use_hierarchy) + break; + parent = mem->css.cgroup->parent; + if (!parent) + break; + mem = mem_cgroup_from_cont(parent); + } +} + +/** + * mem_cgroup_scan_ratio: + * @mem: the mem cgroup + * + * Returns recent reclaim/scan ratio. If this is low, memory is filled by + * active pages(or dirty pages). If high, memory includes inactive, unneccesary + * files. This can be a hint for admins to show the limit is correct or not. + */ +static int mem_cgroup_scan_ratio(struct mem_cgroup *mem) +{ + int scan_success_ratio; + + spin_lock(&mem->scan_stat_lock); + scan_success_ratio = mem->reclaimed * 100 / (mem->scanned + 1); + spin_unlock(&mem->scan_stat_lock); + + return scan_success_ratio; +} + + static void mem_cgroup_start_move(struct mem_cgroup *mem) { int cpu; @@ -1855,9 +1952,14 @@ static int mem_cgroup_hierarchical_recla *total_scanned += nr_scanned; mem_cgroup_soft_steal(victim, is_kswapd, ret); mem_cgroup_soft_scan(victim, is_kswapd, nr_scanned); - } else + mem_cgroup_update_scan_ratio(victim, + root_mem, nr_scanned, ret); + } else { ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap); + noswap, &nr_scanned); + mem_cgroup_update_scan_ratio(victim, + root_mem, nr_scanned, ret); + } css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -3895,12 +3997,14 @@ static void mem_cgroup_stop_async_worker * someone tries to delete cgroup, stop reclaim. * If margin is big even after shrink memory, reschedule itself again. 
*/ + static void mem_cgroup_async_shrink_worker(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work); - struct mem_cgroup *mem; - int delay = 0; + struct mem_cgroup *mem, *victim; long nr_to_reclaim; + unsigned long nr_scanned, nr_reclaimed; + int delay = 0; mem = container_of(dw, struct mem_cgroup, async_work); @@ -3910,12 +4014,22 @@ static void mem_cgroup_async_shrink_work nr_to_reclaim = mem->margin_to_limit_pages - mem_cgroup_margin(mem); - if (nr_to_reclaim > 0) - mem_cgroup_shrink_rate_limited(mem, nr_to_reclaim); - else + if (nr_to_reclaim <= 0) + goto finish_scan; + + /* select a memcg under hierarchy */ + victim = mem_cgroup_select_get_victim(mem); + if (!victim) goto finish_scan; + + nr_reclaimed = mem_cgroup_shrink_rate_limited(victim, nr_to_reclaim, + &nr_scanned); + mem_cgroup_update_scan_ratio(victim, mem, nr_scanned, nr_reclaimed); + css_put(&victim->css); + /* If margin is enough big, stop */ - if (mem_cgroup_margin(mem) >= mem->margin_to_limit_pages) + nr_to_reclaim = mem->margin_to_limit_pages - mem_cgroup_margin(mem); + if (nr_to_reclaim <= 0) goto finish_scan; /* If someone tries to rmdir(), we should stop */ if (test_bit(ASYNC_WORKER_SHOULD_STOP, &mem->async_flags)) @@ -4083,12 +4197,14 @@ try_to_free: shrink = 1; while (nr_retries && mem->res.usage > 0) { int progress; + unsigned long nr_scanned; if (signal_pending(current)) { ret = -EINTR; goto out; } - progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, false); + progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, + false, &nr_scanned); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -5315,6 +5431,7 @@ mem_cgroup_create(struct cgroup_subsys * atomic_set(&mem->refcnt, 1); mem->move_charge_at_immigrate = 0; spin_lock_init(&mem->update_margin_lock); + spin_lock_init(&mem->scan_stat_lock); INIT_DELAYED_WORK(&mem->async_work, mem_cgroup_async_shrink_worker); mutex_init(&mem->thresholds_lock); return &mem->css; Index: 
memcg_async/include/linux/swap.h =================================================================== --- memcg_async.orig/include/linux/swap.h +++ memcg_async/include/linux/swap.h @@ -252,13 +252,15 @@ static inline void lru_cache_add_file(st extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap); + gfp_t gfp_mask, bool noswap, + unsigned long *nr_scanned); extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, gfp_t gfp_mask, bool noswap, struct zone *zone, unsigned long *nr_scanned); -extern void mem_cgroup_shrink_rate_limited(struct mem_cgroup *mem, - unsigned long nr_to_reclaim); +extern unsigned long mem_cgroup_shrink_rate_limited(struct mem_cgroup *mem, + unsigned long nr_to_reclaim, + unsigned long *nr_scanned); extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); Index: memcg_async/mm/vmscan.c =================================================================== --- memcg_async.orig/mm/vmscan.c +++ memcg_async/mm/vmscan.c @@ -2221,7 +2221,8 @@ unsigned long mem_cgroup_shrink_node_zon unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, - bool noswap) + bool noswap, + unsigned long *nr_scanned) { struct zonelist *zonelist; unsigned long nr_reclaimed; @@ -2258,12 +2259,14 @@ unsigned long try_to_free_mem_cgroup_pag nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + *nr_scanned = sc.nr_scanned; return nr_reclaimed; } -void mem_cgroup_shrink_rate_limited(struct mem_cgroup *mem, - unsigned long nr_to_reclaim) +unsigned long mem_cgroup_shrink_rate_limited(struct mem_cgroup *mem, + unsigned long nr_to_reclaim, + unsigned long *nr_scanned) { } -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body 
to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>