On Mon, Nov 29, 2010 at 11:51 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> On Mon, 29 Nov 2010 22:49:44 -0800
> Ying Han <yinghan@xxxxxxxxxx> wrote:
>
>> The current implementation of memcg only supports direct reclaim; this
>> patch adds support for background reclaim. Per-cgroup background reclaim
>> is needed because it spreads the memory pressure out over a longer period
>> of time and smooths out system performance.
>>
>> There is currently one kswapd kernel thread for each memory node. We add a
>> separate kswapd for each cgroup. Each kswapd sleeps on the wait queue headed
>> at the kswapd_wait field of its kswapd descriptor.
>>
>> The kswapd() function is now shared between the global and per-cgroup kswapd
>> threads. It is passed the kswapd descriptor, which identifies either a node
>> or a cgroup. For a per-cgroup kswapd thread, the new function
>> balance_mem_cgroup_pgdat() is invoked. It performs a priority loop similar
>> to global reclaim: in each iteration it invokes balance_pgdat_node(), a new
>> function that performs background reclaim per node, for every node on the
>> system. After reclaiming each node it checks mem_cgroup_watermark_ok() and
>> breaks out of the priority loop if that returns true. A per-memcg zone is
>> marked "unreclaimable" if the scanning rate is much greater than the reclaim
>> rate on the per-cgroup LRU. The bit is cleared when a page charged to the
>> cgroup is freed. Kswapd also breaks out of the priority loop if all the
>> zones are marked "unreclaimable".
>>
>> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
>> ---
>>  include/linux/memcontrol.h |   30 +++++++
>>  mm/memcontrol.c            |  182 ++++++++++++++++++++++++++++++++++++++-
>>  mm/page_alloc.c            |    2 +
>>  mm/vmscan.c                |  205 +++++++++++++++++++++++++++++++++++++++++++-
>>  4 files changed, 416 insertions(+), 3 deletions(-)
>>
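To make the described flow concrete, here is a rough sketch of the
balance_mem_cgroup_pgdat() loop outlined above. It is illustrative only:
the helper names come from this patch, but balance_pgdat_node()'s argument
list, the watermark flag passed to mem_cgroup_watermark_ok(), and the
memcg_all_unreclaimable() helper are placeholders, not the patch's code.

/*
 * Sketch of the changelog's control flow -- not the code in this patch.
 * balance_pgdat_node()'s arguments, the CHARGE_WMARK_LOW flag and
 * memcg_all_unreclaimable() are guesses/placeholders.
 */
static void balance_mem_cgroup_pgdat(struct mem_cgroup *mem, int order)
{
        int priority;
        int nid;

        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                /* One background-reclaim pass over every online node. */
                for_each_online_node(nid)
                        balance_pgdat_node(NODE_DATA(nid), priority, mem);

                /* Stop once the memcg is back under its watermark... */
                if (mem_cgroup_watermark_ok(mem, CHARGE_WMARK_LOW))
                        break;

                /* ...or when every per-memcg zone is flagged unreclaimable. */
                if (memcg_all_unreclaimable(mem))
                        break;
        }
}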
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 90fe7fe..dbed45d 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -127,6 +127,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>                                                 gfp_t gfp_mask);
>>  u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
>>
>> +void mem_cgroup_clear_unreclaimable(struct page *page, struct zone *zone);
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid);
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> +                                               unsigned long nr_scanned);
>>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>>  struct mem_cgroup;
>>
>> @@ -299,6 +305,25 @@ static inline void mem_cgroup_update_file_mapped(struct page *page,
>>  {
>>  }
>>
>> +static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem,
>> +                                               struct zone *zone,
>> +                                               unsigned long nr_scanned)
>> +{
>> +}
>> +
>> +static inline void mem_cgroup_clear_unreclaimable(struct page *page,
>> +                                               struct zone *zone)
>> +{
>> +}
>> +static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem,
>> +                                               struct zone *zone)
>> +{
>> +}
>> +static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
>> +                                               struct zone *zone)
>> +{
>> +}
>> +
>>  static inline
>>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>                                             gfp_t gfp_mask)
>> @@ -312,6 +337,11 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *mem)
>>         return 0;
>>  }
>>
>> +static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid,
>> +                                               int zid)
>> +{
>> +       return false;
>> +}
>>  #endif /* CONFIG_CGROUP_MEM_CONT */
>>
>>  #endif /* _LINUX_MEMCONTROL_H */
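One small thing in the !CONFIG_CGROUP_MEM_RES_CTLR stubs above: the
mem_cgroup_mz_unreclaimable() stub is declared bool but has an empty body,
so it falls off the end without returning a value. Presumably it should
mirror the mem_cgroup_zone_reclaimable() stub, i.e. something like:

static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
                                               struct zone *zone)
{
        /* memcg disabled: never report a per-memcg zone as unreclaimable */
        return false;
}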
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index a0c6ed9..1d39b65 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -48,6 +48,8 @@
>>  #include <linux/page_cgroup.h>
>>  #include <linux/cpu.h>
>>  #include <linux/oom.h>
>> +#include <linux/kthread.h>
>> +
>>  #include "internal.h"
>>
>>  #include <asm/uaccess.h>
>> @@ -118,7 +120,10 @@ struct mem_cgroup_per_zone {
>>         bool on_tree;
>>         struct mem_cgroup *mem;         /* Back pointer, we cannot */
>>                                         /* use container_of        */
>> +       unsigned long pages_scanned;    /* since last reclaim */
>> +       int all_unreclaimable;          /* All pages pinned */
>>  };
>> +
>>  /* Macro for accessing counter */
>>  #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
>>
>> @@ -372,6 +377,7 @@ static void mem_cgroup_put(struct mem_cgroup *mem);
>>  static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
>>  static void drain_all_stock_async(void);
>>  static unsigned long get_min_free_kbytes(struct mem_cgroup *mem);
>> +static inline void wake_memcg_kswapd(struct mem_cgroup *mem);
>>
>>  static struct mem_cgroup_per_zone *
>>  mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
>> @@ -1086,6 +1092,106 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>>         return &mz->reclaim_stat;
>>  }
>>
>> +unsigned long mem_cgroup_zone_reclaimable_pages(
>> +                                       struct mem_cgroup_per_zone *mz)
>> +{
>> +       int nr;
>> +       nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE) +
>> +               MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
>> +
>> +       if (nr_swap_pages > 0)
>> +               nr += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) +
>> +                       MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
>> +
>> +       return nr;
>> +}
>> +
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> +                                               unsigned long nr_scanned)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +       int nid = zone_to_nid(zone);
>> +       int zid = zone_idx(zone);
>> +
>> +       if (!mem)
>> +               return;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz)
>> +               mz->pages_scanned += nr_scanned;
>> +}
>> +
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +
>> +       if (!mem)
>> +               return 0;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz)
>> +               return mz->pages_scanned <
>> +                               mem_cgroup_zone_reclaimable_pages(mz) * 6;
>> +       return 0;
>> +}
>> +
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +       int nid = zone_to_nid(zone);
>> +       int zid = zone_idx(zone);
>> +
>> +       if (!mem)
>> +               return 0;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz)
>> +               return mz->all_unreclaimable;
>> +
>> +       return 0;
>> +}
>> +
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +       int nid = zone_to_nid(zone);
>> +       int zid = zone_idx(zone);
>> +
>> +       if (!mem)
>> +               return;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz)
>> +               mz->all_unreclaimable = 1;
>> +}
>> +
>> +void mem_cgroup_clear_unreclaimable(struct page *page, struct zone *zone)
>> +{
>> +       struct mem_cgroup_per_zone *mz = NULL;
>> +       struct mem_cgroup *mem = NULL;
>> +       int nid = zone_to_nid(zone);
>> +       int zid = zone_idx(zone);
>> +       struct page_cgroup *pc = lookup_page_cgroup(page);
>> +
>> +       if (unlikely(!pc))
>> +               return;
>> +
>> +       rcu_read_lock();
>> +       mem = pc->mem_cgroup;
>
> This is incorrect. you have to do css_tryget(&mem->css) before rcu_read_unlock.
>
>> +       rcu_read_unlock();
>> +
>> +       if (!mem)
>> +               return;
>> +
>> +       mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +       if (mz) {
>> +               mz->pages_scanned = 0;
>> +               mz->all_unreclaimable = 0;
>> +       }
>> +
>> +       return;
>> +}
>> +
>>  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>>                                         struct list_head *dst,
>>                                         unsigned long *scanned, int order,
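For what it's worth, the fix being asked for above would presumably look
roughly like this inside mem_cgroup_clear_unreclaimable() (a sketch of the
css_tryget() pattern, not a tested replacement):

        rcu_read_lock();
        mem = pc->mem_cgroup;
        /* Pin the memcg while still inside the RCU read section. */
        if (!mem || !css_tryget(&mem->css)) {
                rcu_read_unlock();
                return;
        }
        rcu_read_unlock();

        mz = mem_cgroup_zoneinfo(mem, nid, zid);
        if (mz) {
                mz->pages_scanned = 0;
                mz->all_unreclaimable = 0;
        }
        /* Drop the reference taken by css_tryget(). */
        css_put(&mem->css);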
>> @@ -1887,6 +1993,20 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
>>         struct res_counter *fail_res;
>>         unsigned long flags = 0;
>>         int ret;
>> +       unsigned long min_free_kbytes = 0;
>> +
>> +       min_free_kbytes = get_min_free_kbytes(mem);
>> +       if (min_free_kbytes) {
>> +               ret = res_counter_charge(&mem->res, csize, CHARGE_WMARK_LOW,
>> +                                       &fail_res);
>> +               if (likely(!ret)) {
>> +                       return CHARGE_OK;
>> +               } else {
>> +                       mem_over_limit = mem_cgroup_from_res_counter(fail_res,
>> +                                                                       res);
>> +                       wake_memcg_kswapd(mem_over_limit);
>> +               }
>> +       }
>
> I think this check can be moved out to a periodic check, as with the
> threshold notifiers.

Yes. This will be changed in V2.

>
>>
>>         ret = res_counter_charge(&mem->res, csize, CHARGE_WMARK_MIN, &fail_res);
>>
>> @@ -3037,6 +3157,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>                 else
>>                         memcg->memsw_is_minimum = false;
>>         }
>> +       setup_per_memcg_wmarks(memcg);
>>         mutex_unlock(&set_limit_mutex);
>>
>>         if (!ret)
>> @@ -3046,7 +3167,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>                                                 MEM_CGROUP_RECLAIM_SHRINK);
>>                 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
>>                 /* Usage is reduced ? */
>> -               if (curusage >= oldusage)
>> +               if (curusage >= oldusage)
>>                         retry_count--;
>>                 else
>>                         oldusage = curusage;
>
> What's changed here ?
>
>> @@ -3096,6 +3217,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
>>                 else
>>                         memcg->memsw_is_minimum = false;
>>         }
>> +       setup_per_memcg_wmarks(memcg);
>>         mutex_unlock(&set_limit_mutex);
>>
>>         if (!ret)
>> @@ -4352,6 +4474,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>>  static void __mem_cgroup_free(struct mem_cgroup *mem)
>>  {
>>         int node;
>> +       struct kswapd *kswapd_p;
>> +       wait_queue_head_t *wait;
>>
>>         mem_cgroup_remove_from_trees(mem);
>>         free_css_id(&mem_cgroup_subsys, &mem->css);
>> @@ -4360,6 +4484,15 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
>>                 free_mem_cgroup_per_zone_info(mem, node);
>>
>>         free_percpu(mem->stat);
>> +
>> +       wait = mem->kswapd_wait;
>> +       kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> +       if (kswapd_p) {
>> +               if (kswapd_p->kswapd_task)
>> +                       kthread_stop(kswapd_p->kswapd_task);
>> +               kfree(kswapd_p);
>> +       }
>> +
>>         if (sizeof(struct mem_cgroup) < PAGE_SIZE)
>>                 kfree(mem);
>>         else
>> @@ -4421,6 +4554,39 @@ int mem_cgroup_watermark_ok(struct mem_cgroup *mem,
>>         return ret;
>>  }
>>
>> +static inline
>> +void wake_memcg_kswapd(struct mem_cgroup *mem)
>> +{
>> +       wait_queue_head_t *wait;
>> +       struct kswapd *kswapd_p;
>> +       struct task_struct *thr;
>> +       static char memcg_name[PATH_MAX];
>> +
>> +       if (!mem)
>> +               return;
>> +
>> +       wait = mem->kswapd_wait;
>> +       kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> +       if (!kswapd_p->kswapd_task) {
>> +               if (mem->css.cgroup)
>> +                       cgroup_path(mem->css.cgroup, memcg_name, PATH_MAX);
>> +               else
>> +                       sprintf(memcg_name, "no_name");
>> +
>> +               thr = kthread_run(kswapd, kswapd_p, "kswapd%s", memcg_name);
>
> I don't think reusing the name "kswapd" is good, and this name cannot
> be as long as PATH_MAX... IIUC, this name goes into the comm[] field, which
> is 16 bytes long.
>
> So, how about naming this as
>
> "memcg%d", mem->css.id ?
>
> Exporting css.id will be okay if necessary.

I am not sure that works, since mem->css hasn't been initialized yet during
mem_cgroup_create(). That is one of the reasons I create the kswapd when the
wmark is first triggered instead of at cgroup creation: by that time all of
the needed information is ready. However, I agree that doing it at cgroup
creation is better from a performance perspective, since we won't add
overhead to the page allocation path (even though only the first wmark
triggering pays it). Any suggestion?

--Ying

>
>> +               if (IS_ERR(thr))
>> +                       printk(KERN_INFO "Failed to start kswapd on memcg %d\n",
>> +                               0);
>> +               else
>> +                       kswapd_p->kswapd_task = thr;
>> +       }
>
> Hmm, ok, then kswapd-for-memcg is created when someone goes over the
> watermark
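On the naming question, a sketch of the suggested "memcg%d" form inside
wake_memcg_kswapd() might look like the following. This is illustrative
only, and it assumes css_id(&mem->css) is usable at the point where the
thread is created -- which is exactly the open question above about creating
the thread at wmark time versus at cgroup-creation time.

                /*
                 * Sketch: name the thread after the css id instead of the
                 * cgroup path, so it always fits the 16-byte comm[] and
                 * needs no PATH_MAX buffer.
                 */
                thr = kthread_run(kswapd, kswapd_p, "memcg%d",
                                  css_id(&mem->css));
                if (IS_ERR(thr))
                        printk(KERN_INFO "Failed to start kswapd on memcg %d\n",
                                css_id(&mem->css));
                else
                        kswapd_p->kswapd_task = thr;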