Re: [PATCH 4/5] Per cgroup background reclaim.

Ying Han <yinghan@xxxxxxxxxx> · Tue, 18 Jan 2011 18:12:42 -0800

On Thu, Jan 13, 2011 at 4:52 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
> On Thu, 13 Jan 2011 14:00:34 -0800
> Ying Han <yinghan@xxxxxxxxxx> wrote:
>
>> The current implementation of memcg only supports direct reclaim and this
>> patch adds the support for background reclaim. Per cgroup background reclaim
>> is needed which spreads out the memory pressure over longer period of time
>> and smoothes out the system performance.
>>
>> There is a kswapd kernel thread for each memory node. We add a different kswapd
>> for each cgroup. The kswapd is sleeping in the wait queue headed at kswapd_wait
>> field of a kswapd descriptor.
>>
>> The kswapd() function now is shared between global and per cgroup kswapd thread.
>> It is passed in with the kswapd descriptor which contains the information of
>> either node or cgroup. Then the new function balance_mem_cgroup_pgdat is invoked
>> if it is per cgroup kswapd thread. The balance_mem_cgroup_pgdat performs a
>> priority loop similar to global reclaim. In each iteration it invokes
>> balance_pgdat_node for all nodes on the system, which is a new function performs
>> background reclaim per node. A fairness mechanism is implemented to remember the
>> last node it was reclaiming from and always start at the next one. After reclaiming
>> each node, it checks mem_cgroup_watermark_ok() and breaks the priority loop if
>> returns true. A per memcg zone will be marked as "unreclaimable" if the scanning
>> rate is much greater than the reclaiming rate on the per cgroup LRU. The bit is
>> cleared when there is a page charged to the cgroup being freed. Kswapd breaks the
>> priority loop if all the zones are marked as "unreclaimable".
>>
>> Change log v2...v1:
>> 1. start/stop the per-cgroup kswapd at create/delete cgroup stage.
>> 2. remove checking the wmark from per-page charging. now it checks the wmark
>> periodically based on the event counter.
>> 3. move the per-cgroup per-zone clear_unreclaimable into uncharge stage.
>> 4. shared the kswapd_run/kswapd_stop for per-cgroup and global background
>> reclaim.
>> 5. name the per-cgroup memcg as "memcg-id" (css->id). And the global kswapd
>> keeps the same name.
>> 6. fix a race on kswapd_stop while the per-memcg-per-zone info could be accessed
>> after freeing.
>> 7. add the fairness in zonelist where memcg remember the last zone reclaimed
>> from.
>>
>> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>

Thank you for your comments ~

> Hmm...at first, I like using workqueue rather than using a thread per memcg.

I plan this as part of performance optimization effort which mainly
focus on reducing
the lock contention between the threads.

>
>
>
>
>> ---
>>  include/linux/memcontrol.h |   37 ++++++
>>  include/linux/swap.h       |    4 +-
>>  mm/memcontrol.c            |  192 ++++++++++++++++++++++++++++-
>>  mm/vmscan.c                |  298 ++++++++++++++++++++++++++++++++++++++++----
>>  4 files changed, 504 insertions(+), 27 deletions(-)
>>
>> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
>> index 80a605f..69c6e41 100644
>> --- a/include/linux/memcontrol.h
>> +++ b/include/linux/memcontrol.h
>> @@ -25,6 +25,7 @@ struct mem_cgroup;
>>  struct page_cgroup;
>>  struct page;
>>  struct mm_struct;
>> +struct kswapd;
>>
>>  /* Stats that can be updated by kernel. */
>>  enum mem_cgroup_page_stat_item {
>> @@ -94,6 +95,12 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
>>  extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
>>  extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
>>  extern int mem_cgroup_watermark_ok(struct mem_cgroup *mem, int charge_flags);
>> +extern int mem_cgroup_init_kswapd(struct mem_cgroup *mem,
>> +                               struct kswapd *kswapd_p);
>> +extern wait_queue_head_t *mem_cgroup_kswapd_wait(struct mem_cgroup *mem);
>> +extern int mem_cgroup_last_scanned_node(struct mem_cgroup *mem);
>> +extern int mem_cgroup_select_victim_node(struct mem_cgroup *mem,
>> +                                     nodemask_t *nodes);
>>
>>  static inline
>>  int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
>> @@ -166,6 +173,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>                                               gfp_t gfp_mask);
>>  u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
>>
>> +void mem_cgroup_clear_unreclaimable(struct page_cgroup *pc);
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid);
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone);
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> +                                     unsigned long nr_scanned);
>>  #else /* CONFIG_CGROUP_MEM_RES_CTLR */
>>  struct mem_cgroup;
>>
>> @@ -361,6 +374,25 @@ static inline unsigned long mem_cgroup_page_stat(struct mem_cgroup *mem,
>>       return -ENOSYS;
>>  }
>>
>> +static inline void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem,
>> +                                             struct zone *zone,
>> +                                             unsigned long nr_scanned)
>> +{
>> +}
>> +
>> +static inline void mem_cgroup_clear_unreclaimable(struct page *page,
>> +                                                     struct zone *zone)
>> +{
>> +}
>> +static inline void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem,
>> +             struct zone *zone)
>> +{
>> +}
>> +static inline bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem,
>> +                                             struct zone *zone)
>> +{
>> +}
>> +
>>  static inline
>>  unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
>>                                           gfp_t gfp_mask)
>> @@ -374,6 +406,11 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *mem)
>>       return 0;
>>  }
>>
>> +static inline bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid,
>> +                                                             int zid)
>> +{
>> +     return false;
>> +}
>>  #endif /* CONFIG_CGROUP_MEM_CONT */
>>
>>  #endif /* _LINUX_MEMCONTROL_H */
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index 52122fa..b6b5cbb 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -292,8 +292,8 @@ static inline void scan_unevictable_unregister_node(struct node *node)
>>  }
>>  #endif
>>
>> -extern int kswapd_run(int nid);
>> -extern void kswapd_stop(int nid);
>> +extern int kswapd_run(int nid, struct mem_cgroup *mem);
>> +extern void kswapd_stop(int nid, struct mem_cgroup *mem);
>>
>>  #ifdef CONFIG_MMU
>>  /* linux/mm/shmem.c */
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 6ef26a7..e716ece 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -48,6 +48,8 @@
>>  #include <linux/page_cgroup.h>
>>  #include <linux/cpu.h>
>>  #include <linux/oom.h>
>> +#include <linux/kthread.h>
>> +
>>  #include "internal.h"
>>
>>  #include <asm/uaccess.h>
>> @@ -75,6 +77,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
>>   */
>>  #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
>>  #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
>> +#define WMARK_EVENTS_THRESH (10) /* once in 1024 */
>>
>>  /*
>>   * Statistics for memory cgroup.
>> @@ -131,7 +134,10 @@ struct mem_cgroup_per_zone {
>>       bool                    on_tree;
>>       struct mem_cgroup       *mem;           /* Back pointer, we cannot */
>>                                               /* use container_of        */
>> +     unsigned long           pages_scanned;  /* since last reclaim */
>> +     int                     all_unreclaimable;      /* All pages pinned */
>>  };
>> +
>>  /* Macro for accessing counter */
>>  #define MEM_CGROUP_ZSTAT(mz, idx)    ((mz)->count[(idx)])
>>
>> @@ -289,8 +295,16 @@ struct mem_cgroup {
>>       struct mem_cgroup_stat_cpu nocpu_base;
>>       spinlock_t pcp_counter_lock;
>>
>> +     /*
>> +      * per cgroup background reclaim.
>> +      */
>>       wait_queue_head_t *kswapd_wait;
>>       unsigned long min_free_kbytes;
>> +
>> +     /* While doing per cgroup background reclaim, we cache the
>> +      * last node we reclaimed from
>> +      */
>> +     int last_scanned_node;
>>  };
>>
>>  /* Stuffs for move charges at task migration. */
>> @@ -380,6 +394,7 @@ static void mem_cgroup_put(struct mem_cgroup *mem);
>>  static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
>>  static void drain_all_stock_async(void);
>>  static unsigned long get_min_free_kbytes(struct mem_cgroup *mem);
>> +static void wake_memcg_kswapd(struct mem_cgroup *mem);
>>
>>  static struct mem_cgroup_per_zone *
>>  mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
>> @@ -568,6 +583,12 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
>>       return mz;
>>  }
>>
>> +static void mem_cgroup_check_wmark(struct mem_cgroup *mem)
>> +{
>> +     if (!mem_cgroup_watermark_ok(mem, CHARGE_WMARK_LOW))
>> +             wake_memcg_kswapd(mem);
>> +}
>> +
>
> Low for trigger, High for stop ?
>
>
>>  /*
>>   * Implementation Note: reading percpu statistics for memcg.
>>   *
>> @@ -692,6 +713,8 @@ static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
>>               mem_cgroup_threshold(mem);
>>               if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
>>                       mem_cgroup_update_tree(mem, page);
>> +             if (unlikely(__memcg_event_check(mem, WMARK_EVENTS_THRESH)))
>> +                     mem_cgroup_check_wmark(mem);
>>       }
>
> This is nice.

>
>
>>  }
>>
>> @@ -1121,6 +1144,95 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
>>       return &mz->reclaim_stat;
>>  }
>>
>> +static unsigned long mem_cgroup_zone_reclaimable_pages(
>> +                                             struct mem_cgroup_per_zone *mz)
>> +{
>> +     int nr;
>> +     nr = MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_FILE) +
>> +             MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_FILE);
>> +
>> +     if (nr_swap_pages > 0)
>> +             nr += MEM_CGROUP_ZSTAT(mz, LRU_ACTIVE_ANON) +
>> +                     MEM_CGROUP_ZSTAT(mz, LRU_INACTIVE_ANON);
>> +
>> +     return nr;
>> +}
>> +
>> +void mem_cgroup_mz_pages_scanned(struct mem_cgroup *mem, struct zone* zone,
>> +                                             unsigned long nr_scanned)
>> +{
>> +     struct mem_cgroup_per_zone *mz = NULL;
>> +     int nid = zone_to_nid(zone);
>> +     int zid = zone_idx(zone);
>> +
>> +     if (!mem)
>> +             return;
>> +
>> +     mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +     if (mz)
>> +             mz->pages_scanned += nr_scanned;
>> +}
>> +
>> +bool mem_cgroup_zone_reclaimable(struct mem_cgroup *mem, int nid, int zid)
>> +{
>> +     struct mem_cgroup_per_zone *mz = NULL;
>> +
>> +     if (!mem)
>> +             return 0;
>> +
>> +     mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +     if (mz)
>> +             return mz->pages_scanned <
>> +                             mem_cgroup_zone_reclaimable_pages(mz) * 6;
>> +     return 0;
>> +}
>
> Where does this "*6" come from ? please add comment. Or add macro in header
> file and share the value with original.

This will be changed on next post, and I plan to define a macro of the
magic number
shared between per-zone & per-memcg reclaim.

>
>
>
>> +
>> +bool mem_cgroup_mz_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> +     struct mem_cgroup_per_zone *mz = NULL;
>> +     int nid = zone_to_nid(zone);
>> +     int zid = zone_idx(zone);
>> +
>> +     if (!mem)
>> +             return false;
>> +
>> +     mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +     if (mz)
>> +             return mz->all_unreclaimable;
>> +
>> +     return false;
>> +}
>
> I think you should check whether this zone has any page.
> If no pages in this zone, you can't reclaim any.

I think that has been covered in the mem_cgroup_zone_reclaimable():
>-------if (mz)
>------->-------return mz->pages_scanned <
>------->------->------->-------mem_cgroup_zone_reclaimable_pages(mz) *
>------->------->------->-------ZONE_RECLAIMABLE_RATE;

In this case, the mem_cgroup_zone_reclaimable_pages(mz) == 0 and the
function returns false.

>
>
>> +
>> +void mem_cgroup_mz_set_unreclaimable(struct mem_cgroup *mem, struct zone *zone)
>> +{
>> +     struct mem_cgroup_per_zone *mz = NULL;
>> +     int nid = zone_to_nid(zone);
>> +     int zid = zone_idx(zone);
>> +
>> +     if (!mem)
>> +             return;
>> +
>> +     mz = mem_cgroup_zoneinfo(mem, nid, zid);
>> +     if (mz)
>> +             mz->all_unreclaimable = 1;
>> +}
>
> I like boolean for this kind ot true/false value.

Changed in the next post.

>
>
>
>> +
>> +void mem_cgroup_clear_unreclaimable(struct page_cgroup *pc)
>> +{
>> +     struct mem_cgroup_per_zone *mz = NULL;
>> +
>> +     if (!pc)
>> +             return;
>> +
>> +     mz = page_cgroup_zoneinfo(pc);
>> +     if (mz) {
>> +             mz->pages_scanned = 0;
>> +             mz->all_unreclaimable = 0;
>> +     }
>> +
>> +     return;
>> +}
>> +
>>  unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
>>                                       struct list_head *dst,
>>                                       unsigned long *scanned, int order,
>> @@ -1773,6 +1885,34 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
>>  }
>>
>>  /*
>> + * Visit the first node after the last_scanned_node of @mem and use that to
>> + * reclaim free pages from.
>> + */
>> +int
>> +mem_cgroup_select_victim_node(struct mem_cgroup *mem, nodemask_t *nodes)
>> +{
>> +     int next_nid;
>> +     int last_scanned;
>> +
>> +     last_scanned = mem->last_scanned_node;
>> +
>> +     /* Initial stage and start from node0 */
>> +     if (last_scanned == -1)
>> +             next_nid = 0;
>> +     else
>> +             next_nid = next_node(last_scanned, *nodes);
>> +
>> +     if (next_nid == MAX_NUMNODES)
>> +             next_nid = first_node(*nodes);
>> +
>> +     spin_lock(&mem->reclaim_param_lock);
>> +     mem->last_scanned_node = next_nid;
>> +     spin_unlock(&mem->reclaim_param_lock);
>> +
>
> Is this 'lock' required ?

Changed in the next post.

>
>> +     return next_nid;
>> +}
>> +
>> +/*
>>   * Check OOM-Killer is already running under our hierarchy.
>>   * If someone is running, return false.
>>   */
>> @@ -2955,6 +3095,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
>>        * special functions.
>>        */
>>
>> +     mem_cgroup_clear_unreclaimable(pc);
>>       unlock_page_cgroup(pc);
>
> This kind of hook is not good....Can't you do this 'clear' by kswapd in
> lazy way ?

I can look into that.

>
>
>
>>       /*
>>        * even after unlock, we have mem->res.usage here and this memcg
>> @@ -3377,7 +3518,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>                                               MEM_CGROUP_RECLAIM_SHRINK);
>>               curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
>>               /* Usage is reduced ? */
>> -             if (curusage >= oldusage)
>> +             if (curusage >= oldusage)
>>                       retry_count--;
>>               else
>>                       oldusage = curusage;
>
> ?
>
>
>> @@ -3385,6 +3526,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
>>       if (!ret && enlarge)
>>               memcg_oom_recover(memcg);
>>
>> +     if (!mem_cgroup_is_root(memcg) && !memcg->kswapd_wait)
>> +             kswapd_run(0, memcg);
>> +
>>       return ret;
>>  }
>
> Hmm, this creates a thread when limit is set....So, tons of threads can be
> created. Can't we do this by work_queue ?
> Then, the number of threads will be scaled automatically.

Some effort I plan to do next for reducing the lock contention.

>
>
>>
>> @@ -4747,6 +4891,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
>>               mz->usage_in_excess = 0;
>>               mz->on_tree = false;
>>               mz->mem = mem;
>> +             mz->pages_scanned = 0;
>> +             mz->all_unreclaimable = 0;
>>       }
>>       return 0;
>>  }
>> @@ -4799,6 +4945,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
>>  {
>>       int node;
>>
>> +     kswapd_stop(0, mem);
>>       mem_cgroup_remove_from_trees(mem);
>>       free_css_id(&mem_cgroup_subsys, &mem->css);
>>
>> @@ -4867,6 +5014,48 @@ int mem_cgroup_watermark_ok(struct mem_cgroup *mem,
>>       return ret;
>>  }
>>
>> +int mem_cgroup_init_kswapd(struct mem_cgroup *mem, struct kswapd *kswapd_p)
>> +{
>> +     if (!mem || !kswapd_p)
>> +             return 0;
>> +
>> +     mem->kswapd_wait = &kswapd_p->kswapd_wait;
>> +     kswapd_p->kswapd_mem = mem;
>> +
>> +     return css_id(&mem->css);
>> +}
>> +
>> +wait_queue_head_t *mem_cgroup_kswapd_wait(struct mem_cgroup *mem)
>> +{
>> +     if (!mem)
>> +             return NULL;
>> +
>> +     return mem->kswapd_wait;
>> +}
>> +
>> +int mem_cgroup_last_scanned_node(struct mem_cgroup *mem)
>> +{
>> +     if (!mem)
>> +             return -1;
>> +
>> +     return mem->last_scanned_node;
>> +}
>> +
>> +static void wake_memcg_kswapd(struct mem_cgroup *mem)
>> +{
>> +     wait_queue_head_t *wait;
>> +
>> +     if (!mem)
>> +             return;
>> +
>> +     wait = mem->kswapd_wait;
>> +
>> +     if (!waitqueue_active(wait))
>> +             return;
>> +
>> +     wake_up_interruptible(wait);
>> +}
>> +
>>  static int mem_cgroup_soft_limit_tree_init(void)
>>  {
>>       struct mem_cgroup_tree_per_node *rtpn;
>> @@ -4942,6 +5131,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
>>               res_counter_init(&mem->memsw, NULL);
>>       }
>>       mem->last_scanned_child = 0;
>> +     mem->last_scanned_node = -1;
>
> If we always start from 0 at the first run, I think this can be 0 at default.
>
>>       spin_lock_init(&mem->reclaim_param_lock);
>>       INIT_LIST_HEAD(&mem->oom_notify);
>>
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index a53d91d..34f6165 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -46,6 +46,8 @@
>>
>>  #include <linux/swapops.h>
>>
>> +#include <linux/res_counter.h>
>> +
>>  #include "internal.h"
>>
>>  #define CREATE_TRACE_POINTS
>> @@ -98,6 +100,8 @@ struct scan_control {
>>        * are scanned.
>>        */
>>       nodemask_t      *nodemask;
>> +
>> +     int priority;
>>  };
>>
>>  #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
>> @@ -1385,6 +1389,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
>>                                       ISOLATE_INACTIVE : ISOLATE_BOTH,
>>                       zone, sc->mem_cgroup,
>>                       0, file);
>> +
>> +             mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, nr_scanned);
>> +
>>               /*
>>                * mem_cgroup_isolate_pages() keeps track of
>>                * scanned pages on its own.
>> @@ -1504,6 +1511,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
>>                * mem_cgroup_isolate_pages() keeps track of
>>                * scanned pages on its own.
>>                */
>> +             mem_cgroup_mz_pages_scanned(sc->mem_cgroup, zone, pgscanned);
>>       }
>>
>>       reclaim_stat->recent_scanned[file] += nr_taken;
>> @@ -2127,11 +2135,19 @@ static int sleeping_prematurely(struct kswapd *kswapd, int order,
>>  {
>>       int i;
>>       pg_data_t *pgdat = kswapd->kswapd_pgdat;
>> +     struct mem_cgroup *mem = kswapd->kswapd_mem;
>>
>>       /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
>>       if (remaining)
>>               return 1;
>>
>> +     /* If after HZ/10, the cgroup is below the high wmark, it's premature */
>> +     if (mem) {
>> +             if (!mem_cgroup_watermark_ok(mem, CHARGE_WMARK_HIGH))
>> +                     return 1;
>> +             return 0;
>> +     }
>> +
>>       /* If after HZ/10, a zone is below the high mark, it's premature */
>>       for (i = 0; i < pgdat->nr_zones; i++) {
>>               struct zone *zone = pgdat->node_zones + i;
>> @@ -2370,6 +2386,212 @@ out:
>>       return sc.nr_reclaimed;
>>  }
>>
>> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
>> +/*
>> + * The function is used for per-memcg LRU. It scanns all the zones of the
>> + * node and returns the nr_scanned and nr_reclaimed.
>> + */
>> +static void balance_pgdat_node(pg_data_t *pgdat, int order,
>> +                                     struct scan_control *sc)
>> +{
>> +     int i, end_zone;
>> +     unsigned long total_scanned;
>> +     struct mem_cgroup *mem_cont = sc->mem_cgroup;
>> +     int priority = sc->priority;
>> +     int nid = pgdat->node_id;
>> +
>> +     /*
>> +      * Scan in the highmem->dma direction for the highest
>> +      * zone which needs scanning
>> +      */
>> +     for (i = pgdat->nr_zones - 1; i >= 0; i--) {
>> +             struct zone *zone = pgdat->node_zones + i;
>> +
>> +             if (!populated_zone(zone))
>> +                     continue;
>> +
>> +             if (mem_cgroup_mz_unreclaimable(mem_cont, zone) &&
>> +                             priority != DEF_PRIORITY)
>> +                     continue;
>> +             /*
>> +              * Do some background aging of the anon list, to give
>> +              * pages a chance to be referenced before reclaiming.
>> +              */
>> +             if (inactive_anon_is_low(zone, sc))
>> +                     shrink_active_list(SWAP_CLUSTER_MAX, zone,
>> +                                                     sc, priority, 0);
>
> I think you can check per-zone memory usage here and compare it with
> the value in previous run which set mz->all_unreclaimable.
>
> If current_zone_usage < mz->usage_in_previous_run, you can clear
> all_unreclaimable without hooks.
>
> But please note that 'uncharge' doesn't mean pages turned to be reclaimable.
> I'm not sure there are better hint or not.
>
>
>
>> +
>> +             end_zone = i;
>> +             goto scan;
>> +     }
>> +     return;
>> +
>> +scan:
>> +     total_scanned = 0;
>> +     /*
>> +      * Now scan the zone in the dma->highmem direction, stopping
>> +      * at the last zone which needs scanning.
>> +      *
>> +      * We do this because the page allocator works in the opposite
>> +      * direction.  This prevents the page allocator from allocating
>> +      * pages behind kswapd's direction of progress, which would
>> +      * cause too much scanning of the lower zones.
>> +      */
>> +     for (i = 0; i <= end_zone; i++) {
>> +             struct zone *zone = pgdat->node_zones + i;
>> +
>> +             if (!populated_zone(zone))
>> +                     continue;
>> +
>> +             if (mem_cgroup_mz_unreclaimable(mem_cont, zone) &&
>> +                     priority != DEF_PRIORITY)
>> +                     continue;
>> +
>> +             sc->nr_scanned = 0;
>> +             shrink_zone(priority, zone, sc);
>> +             total_scanned += sc->nr_scanned;
>> +
>> +             if (mem_cgroup_mz_unreclaimable(mem_cont, zone))
>> +                     continue;
>> +
>> +             if (!mem_cgroup_zone_reclaimable(mem_cont, nid, i))
>> +                     mem_cgroup_mz_set_unreclaimable(mem_cont, zone);
>> +
>> +             /*
>> +              * If we've done a decent amount of scanning and
>> +              * the reclaim ratio is low, start doing writepage
>> +              * even in laptop mode
>> +              */
>> +             if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
>> +                 total_scanned > sc->nr_reclaimed + sc->nr_reclaimed / 2) {
>> +                     sc->may_writepage = 1;
>> +             }
>> +     }
>> +
>> +     sc->nr_scanned = total_scanned;
>> +     return;
>> +}
>> +
>> +/*
>> + * Per cgroup background reclaim.
>> + * TODO: Take off the order since memcg always do order 0
>> + */
>> +static unsigned long balance_mem_cgroup_pgdat(struct mem_cgroup *mem_cont,
>> +                                           int order)
>> +{
>> +     int i, nid;
>> +     int start_node;
>> +     int priority;
>> +     int wmark_ok;
>> +     int loop = 0;
>> +     pg_data_t *pgdat;
>> +     nodemask_t do_nodes;
>> +     unsigned long total_scanned = 0;
>> +     struct scan_control sc = {
>> +             .gfp_mask = GFP_KERNEL,
>> +             .may_unmap = 1,
>> +             .may_swap = 1,
>> +             .nr_to_reclaim = ULONG_MAX,
>> +             .swappiness = vm_swappiness,
>> +             .order = order,
>> +             .mem_cgroup = mem_cont,
>> +     };
>> +
>> +loop_again:
>> +     do_nodes = NODE_MASK_NONE;
>> +     sc.may_writepage = !laptop_mode;
>> +     sc.nr_reclaimed = 0;
>> +     total_scanned = 0;
>> +
>> +     for (priority = DEF_PRIORITY; priority >= 0; priority--) {
>> +             sc.priority = priority;
>> +             wmark_ok = 0;
>> +             loop = 0;
>> +
>> +             /* The swap token gets in the way of swapout... */
>> +             if (!priority)
>> +                     disable_swap_token();
>> +
>> +             if (priority == DEF_PRIORITY)
>> +                     do_nodes = node_states[N_ONLINE];
>> +
>> +             while (1) {
>> +                     nid = mem_cgroup_select_victim_node(mem_cont,
>> +                                                     &do_nodes);
>> +
>> +                     /* Indicate we have cycled the nodelist once
>> +                      * TODO: we might add MAX_RECLAIM_LOOP for preventing
>> +                      * kswapd burning cpu cycles