On Mon, Nov 29, 2010 at 11:51 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> On Mon, 29 Nov 2010 22:49:44 -0800
> Ying Han <yinghan@xxxxxxxxxx> wrote:
>
>> The current implementation of memcg only supports direct reclaim; this
>> patch adds support for background reclaim. Per-cgroup background reclaim
>> is needed because it spreads the memory pressure out over a longer period
>> of time and smooths out system performance.
>>
>> There is currently one kswapd kernel thread for each memory node. We add a
>> separate kswapd for each cgroup. Each kswapd sleeps on the wait queue headed
>> at the kswapd_wait field of its kswapd descriptor.
>>
>> The kswapd() function is now shared between the global and per-cgroup kswapd
>> threads. It is passed the kswapd descriptor, which identifies either a node
>> or a cgroup. For a per-cgroup kswapd thread, the new function
>> balance_mem_cgroup_pgdat() is invoked. It performs a priority loop similar
>> to global reclaim: in each iteration it invokes balance_pgdat_node(), a new
>> function that performs background reclaim per node, for every node on the
>> system. After reclaiming each node it checks mem_cgroup_watermark_ok() and
>> breaks out of the priority loop if that returns true. A per-memcg zone is
>> marked "unreclaimable" if the scanning rate is much greater than the reclaim
>> rate on the per-cgroup LRU. The bit is cleared when a page charged to the
>> cgroup is freed. Kswapd also breaks out of the priority loop if all the
>> zones are marked "unreclaimable".
>>
>> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
>> ---
>>  include/linux/memcontrol.h |   30 +++++++
>>  mm/memcontrol.c            |  182 ++++++++++++++++++++++++++++++++++++++-
>>  mm/page_alloc.c            |    2 +
>>  mm/vmscan.c                |  205 +++++++++++++++++++++++++++++++++++++++++++-
>>  4 files changed, 416 insertions(+), 3 deletions(-)
>>
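To make the described flow concrete, here is a rough sketch of the
balance_mem_cgroup_pgdat() loop outlined above. It is illustrative only:
the helper names come from this patch, but balance_pgdat_node()'s argument
list, the watermark flag passed to mem_cgroup_watermark_ok(), and the
memcg_all_unreclaimable() helper are placeholders, not the patch's code.

/*
 * Sketch of the changelog's control flow -- not the code in this patch.
 * balance_pgdat_node()'s arguments, the CHARGE_WMARK_LOW flag and
 * memcg_all_unreclaimable() are guesses/placeholders.
 */
static void balance_mem_cgroup_pgdat(struct mem_cgroup *mem, int order)
{
        int priority;
        int nid;

        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                /* One background-reclaim pass over every online node. */
                for_each_online_node(nid)
                        balance_pgdat_node(NODE_DATA(nid), priority, mem);

                /* Stop once the memcg is back under its watermark... */
                if (mem_cgroup_watermark_ok(mem, CHARGE_WMARK_LOW))
                        break;

                /* ...or when every per-memcg zone is flagged unreclaimable. */
                if (memcg_all_unreclaimable(mem))
                        break;
        }
}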
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 90fe7fe..dbed45d 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -127,6 +127,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>                                                 gfp_t gfp_mask);
>>  u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
>>
>> +void mem_cgroup_clear_unreclaimable(struct page *page, struct zone *zone);
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid);
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> +                                               unsigned long nr_scanned);
>>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>>  struct mem_cgroup;
>>
>> @@ -299,6 +305,25 @@ static inline void mem_cgroup_update_file_mapped(struct page *page,
>>  {
>>  }
>>
>> +static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem,
>> +                                               struct zone *zone,
>> +                                               unsigned long nr_scanned)
>> +{
>> +}
>> +
>> +static inline void mem_cgroup_clear_unreclaimable(struct page *page,
>> +                                               struct zone *zone)
>> +{
>> +}
>> +static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem,
>> +                                               struct zone *zone)
>> +{
>> +}
>> +static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
>> +                                               struct zone *zone)
>> +{
>> +}
>> +
>>  static inline
>>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>                                             gfp_t gfp_mask)
>> @@ -312,6 +337,11 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *mem)
>>         return 0;
>>  }
>>
>> +static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid,
>> +                                               int zid)
>> +{
>> +       return false;
>> +}
>>  #endif /* CONFIG_CGROUP_MEM_CONT */
>>
>>  #endif /* _LINUX_MEMCONTROL_H */
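One small thing in the !CONFIG_CGROUP_MEM_RES_CTLR stubs above: the
mem_cgroup_mz_unreclaimable() stub is declared bool but has an empty body,
so it falls off the end without returning a value. Presumably it should
mirror the mem_cgroup_zone_reclaimable() stub, i.e. something like:

static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
                                               struct zone *zone)
{
        /* memcg disabled: never report a per-memcg zone as unreclaimable */
        return false;
}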
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index a0c6ed9..1d39b65 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -48,6 +48,8 @@
>>  #include <linux/page_cgroup.h>
>>  #include <linux/cpu.h>
>>  #include <linux/oom.h>
>> +#include <linux/kthread.h>
>> +
>>  #include "internal.h"
>>
>>  #include <asm/uaccess.h>
>> @@ -118,7 +120,10 @@ struct mem_cgroup_per_zone {
>>         bool on_tree;
>>         struct mem_cgroup *mem;         /* Back pointer, we cannot */
>>                                         /* use container_of        */
>> +       unsigned long pages_scanned;    /* since last reclaim */
>> +       int all_unreclaimable;          /* All pages pinned */
>>  };
>> +
>>  /* Macro for accessing counter */
>>  #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
>>
>> @@ -372,6 +377,7 @@ static void mem_cgroup_put(struct mem_cgroup *mem);
>>  static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
>>  static void drain_all_stock_async(void);
>>  static unsigned long get_min_free_kbytes(struct mem_cgroup *mem);
>> +static inline void wake_memcg_kswapd(struct mem_cgroup *mem);
>>
>>  static struct mem_cgroup_per_zone *
>>  mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
>> @@ -1086,6 +1092,106 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>>         return &mz->reclaim_stat;
>>  }
>>
>> +unsigned long mem_cgroup_zone_reclaimable_pages(
>> +                                       struct mem_cgroup_per_zone *mz)
>> +{
>> +       int nr;
>> +       nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE) +
>> +               MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
>> +
>> +       if (nr_swap_pages > 0)
>> +               nr += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) +
>> +                       MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
>> +
>> +       return nr;
>> +}
>> +
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> +                                               unsigned long nr_scanned)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +       int nid = zone_to_nid(zone);
>> +       int zid = zone_idx(zone);
>> +
>> +       if (!mem)
>> +               return;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz)
>> +               mz->pages_scanned += nr_scanned;
>> +}
>> +
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +
>> +       if (!mem)
>> +               return 0;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz)
>> +               return mz->pages_scanned <
>> +                               mem_cgroup_zone_reclaimable_pages(mz) * 6;
>> +       return 0;
>> +}
>> +
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +       int nid = zone_to_nid(zone);
>> +       int zid = zone_idx(zone);
>> +
>> +       if (!mem)
>> +               return 0;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz)
>> +               return mz->all_unreclaimable;
>> +
>> +       return 0;
>> +}
>> +
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +       int nid = zone_to_nid(zone);
>> +       int zid = zone_idx(zone);
>> +
>> +       if (!mem)
>> +               return;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz)
>> +               mz->all_unreclaimable = 1;
>> +}
>> +
>> +void mem_cgroup_clear_unreclaimable(struct page *page, struct zone *zone)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +       struct mem_cgroup *mem = NULL;
>> +       int nid = zone_to_nid(zone);
>> +       int zid = zone_idx(zone);
>> +       struct page_cgroup *pc = lookup_page_cgroup(page);
>> +
>> +       if (unlikely(!pc))
>> +               return;
>> +
>> +       rcu_read_lock();
>> +       mem = pc->mem_cgroup;
>
> This is incorrect. you have to do css_tryget(&mem->css) before rcu_read_unlock.
>
>> +       rcu_read_unlock();
>> +
>> +       if (!mem)
>> +               return;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz) {
>> +               mz->pages_scanned = 0;
>> +               mz->all_unreclaimable = 0;
>> +       }
>> +
>> +       return;
>> +}
>> +
>>  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>>                                         struct list_head *dst,
>>                                         unsigned long *scanned, int order,
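For what it's worth, the fix being asked for above would presumably look
roughly like this inside mem_cgroup_clear_unreclaimable() (a sketch of the
css_tryget() pattern, not a tested replacement):

        rcu_read_lock();
        mem = pc->mem_cgroup;
        /* Pin the memcg while still inside the RCU read section. */
        if (!mem || !css_tryget(&mem->css)) {
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();

        mz = mem_cgroup_zoneinfo(mem, nid, zid);
        if (mz) {
                mz->pages_scanned = 0;
                mz->all_unreclaimable = 0;
        }
        /* Drop the reference taken by css_tryget(). */
        css_put(&mem->css);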
>> @@ -1887,6 +1993,20 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
>>         struct res_counter *fail_res;
>>         unsigned long flags = 0;
>>         int ret;
>> +       unsigned long min_free_kbytes = 0;
>> +
>> +       min_free_kbytes = get_min_free_kbytes(mem);
>> +       if (min_free_kbytes) {
>> +               ret = res_counter_charge(&mem->res, csize, CHARGE_WMARK_LOW,
>> +                                       &fail_res);
>> +               if (likely(!ret)) {
>> +                       return CHARGE_OK;
>> +               } else {
>> +                       mem_over_limit = mem_cgroup_from_res_counter(fail_res,
>> +                                                                       res);
>> +                       wake_memcg_kswapd(mem_over_limit);
>> +               }
>> +       }
>
> I think this check can be moved out to a periodic check, as with the
> threshold notifiers.

Yes. This will be changed in V2.

>
>>
>>         ret = res_counter_charge(&mem->res, csize, CHARGE_WMARK_MIN, &fail_res);
>>
>> @@ -3037,6 +3157,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>                 else
>>                         memcg->memsw_is_minimum = false;
>>         }
>> +       setup_per_memcg_wmarks(memcg);
>>         mutex_unlock(&set_limit_mutex);
>>
>>         if (!ret)
>> @@ -3046,7 +3167,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>                                                 MEM_CGROUP_RECLAIM_SHRINK);
>>                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
>>                 /* Usage is reduced ? */
>> -               if (curusage >= oldusage)
>> +               if (curusage >= oldusage)
>>                         retry_count--;
>>                 else
>>                         oldusage = curusage;
>
> What's changed here ?
>
>> @@ -3096,6 +3217,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
>>                 else
>>                         memcg->memsw_is_minimum = false;
>>         }
>> +       setup_per_memcg_wmarks(memcg);
>>         mutex_unlock(&set_limit_mutex);
>>
>>         if (!ret)
>> @@ -4352,6 +4474,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>>  static void __mem_cgroup_free(struct mem_cgroup *mem)
>>  {
>>         int node;
>> +       struct kswapd *kswapd_p;
>> +       wait_queue_head_t *wait;
>>
>>         mem_cgroup_remove_from_trees(mem);
>>         free_css_id(&mem_cgroup_subsys, &mem->css);
>> @@ -4360,6 +4484,15 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
>>                 free_mem_cgroup_per_zone_info(mem, node);
>>
>>         free_percpu(mem->stat);
>> +
>> +       wait = mem->kswapd_wait;
>> +       kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> +       if (kswapd_p) {
>> +               if (kswapd_p->kswapd_task)
>> +                       kthread_stop(kswapd_p->kswapd_task);
>> +               kfree(kswapd_p);
>> +       }
>> +
>>         if (sizeof(struct mem_cgroup) < PAGE_SIZE)
>>                 kfree(mem);
>>         else
>> @@ -4421,6 +4554,39 @@ int mem_cgroup_watermark_ok(struct mem_cgroup *mem,
>>         return ret;
>>  }
>>
>> +static inline
>> +void wake_memcg_kswapd(struct mem_cgroup *mem)
>> +{
>> +       wait_queue_head_t *wait;
>> +       struct kswapd *kswapd_p;
>> +       struct task_struct *thr;
>> +       static char memcg_name[PATH_MAX];
>> +
>> +       if (!mem)
>> +               return;
>> +
>> +       wait = mem->kswapd_wait;
>> +       kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> +       if (!kswapd_p->kswapd_task) {
>> +               if (mem->css.cgroup)
>> +                       cgroup_path(mem->css.cgroup, memcg_name, PATH_MAX);
>> +               else
>> +                       sprintf(memcg_name, "no_name");
>> +
>> +               thr = kthread_run(kswapd, kswapd_p, "kswapd%s", memcg_name);
>
> I don't think reusing the name "kswapd" is good, and this name cannot
> be as long as PATH_MAX... IIUC, this name goes into the comm[] field, which
> is 16 bytes long.
>
> So, how about naming this as
>
> "memcg%d", mem->css.id ?
>
> Exporting css.id will be okay if necessary.

I am not sure that works, since mem->css hasn't been initialized yet during
mem_cgroup_create(). That is one of the reasons I create the kswapd when the
wmark is first triggered instead of at cgroup creation: by that time all of
the needed information is ready. However, I agree that doing it at cgroup
creation is better from a performance perspective, since we won't add
overhead to the page allocation path (even though only the first wmark
triggering pays it). Any suggestion?

--Ying

>
>> +               if (IS_ERR(thr))
>> +                       printk(KERN_INFO "Failed to start kswapd on memcg %d\n",
>> +                               0);
>> +               else
>> +                       kswapd_p->kswapd_task = thr;
>> +       }
>
> Hmm, ok, then kswapd-for-memcg is created when someone goes over the
> watermark
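On the naming question, a sketch of the suggested "memcg%d" form inside
wake_memcg_kswapd() might look like the following. This is illustrative
only, and it assumes css_id(&mem->css) is usable at the point where the
thread is created -- which is exactly the open question above about creating
the thread at wmark time versus at cgroup-creation time.

                /*
                 * Sketch: name the thread after the css id instead of the
                 * cgroup path, so it always fits the 16-byte comm[] and
                 * needs no PATH_MAX buffer.
                 */
                thr = kthread_run(kswapd, kswapd_p, "memcg%d",
                                  css_id(&mem->css));
                if (IS_ERR(thr))
                        printk(KERN_INFO "Failed to start kswapd on memcg %d\n",
                                css_id(&mem->css));
                else
                        kswapd_p->kswapd_task = thr;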