Re: [PATCH V7 5/9] Infrastructure to support per-memcg reclaim.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On Thu, Apr 21, 2011 at 10:11 PM, KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> wrote:
> Add the kswapd_mem field in kswapd descriptor which links the kswapd
> kernel thread to a memcg. The per-memcg kswapd is sleeping in the wait
> queue headed at kswapd_wait field of the kswapd descriptor.
>
> The kswapd() function is now shared between global and per-memcg kswapd. It
> is passed in with the kswapd descriptor which contains the information of
> either node or memcg. Then the new function balance_mem_cgroup_pgdat is
> invoked if it is per-mem kswapd thread, and the implementation of the function
> is on the following patch.
>
> change v7..v6:
> 1. change the threading model of memcg from per-memcg-per-thread to thread-pool.
> this is based on the patch from KAMAZAWA.
>
> change v6..v5:
> 1. rename is_node_kswapd to is_global_kswapd to match the scanning_global_lru.
> 2. revert the sleeping_prematurely change, but keep the kswapd_try_to_sleep()
> for memcg.
>
> changelog v4..v3:
> 1. fix up the kswapd_run and kswapd_stop for online_pages() and offline_pages.
> 2. drop the PF_MEMALLOC flag for memcg kswapd for now per KAMAZAWA's request.
>
> changelog v3..v2:
> 1. split off from the initial patch which includes all changes of the following
> three patches.
>
> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>

Looks ok. but this one have some ugly coding style.

functioon()
{
       if (is_global_kswapd()) {
               looooooooong lines
               ...
               ..
       } else {
               another looooooong lines
               ...
               ..
       }
}

please pay attention more to keep simpler code.
However, I don't think this patch has major issue. I expect I can ack next version.


Thank you for reviewing. 

> ---
>  include/linux/swap.h |    2 +-
>  mm/memory_hotplug.c  |    2 +-
>  mm/vmscan.c          |  156 +++++++++++++++++++++++++++++++-------------------
>  3 files changed, 100 insertions(+), 60 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 9b91ca4..a062f0b 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -303,7 +303,7 @@ static inline void scan_unevictable_unregister_node(struct node *node)
>  }
>  #endif
>
> -extern int kswapd_run(int nid);
> +extern int kswapd_run(int nid, int id);

"id" is bad name. there is no information. please use memcg-id or so on.

will change . 


>  extern void kswapd_stop(int nid);
>
>  #ifdef CONFIG_MMU
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 321fc74..36b4eed 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -462,7 +462,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
>       setup_per_zone_wmarks();
>       calculate_zone_inactive_ratio(zone);
>       if (onlined_pages) {
> -             kswapd_run(zone_to_nid(zone));
> +             kswapd_run(zone_to_nid(zone), 0);
>               node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
>       }
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 7aba681..63c557e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2241,6 +2241,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
>       return balanced_pages > (present_pages >> 2);
>  }
>
> +#define is_global_kswapd(kswapd_p) ((kswapd_p)->kswapd_pgdat)

please use inline function.
 
Hmm.  see will change next/
 


> +
>  /* is kswapd sleeping prematurely? */
>  static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
>                                       int classzone_idx)
> @@ -2583,40 +2585,46 @@ static void kswapd_try_to_sleep(struct kswapd *kswapd_p, int order,
>
>       prepare_to_wait(wait_h, &wait, TASK_INTERRUPTIBLE);
>
> -     /* Try to sleep for a short interval */
> -     if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
> -             remaining = schedule_timeout(HZ/10);
> -             finish_wait(wait_h, &wait);
> -             prepare_to_wait(wait_h, &wait, TASK_INTERRUPTIBLE);
> -     }
> -
> -     /*
> -      * After a short sleep, check if it was a premature sleep. If not, then
> -      * go fully to sleep until explicitly woken up.
> -      */
> -     if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
> -             trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
> +     if (is_global_kswapd(kswapd_p)) {

bad indentation. :-/
please don't increase coding mess.

       if (!is_global_kswapd(kswapd_p)) {
               kswapd_try_to_sleep_memcg();
               return;
       }

is simpler.

Ok. I will check on next post. 

> +             /* Try to sleep for a short interval */
> +             if (!sleeping_prematurely(pgdat, order,
> +                             remaining, classzone_idx)) {
> +                     remaining = schedule_timeout(HZ/10);
> +                     finish_wait(wait_h, &wait);
> +                     prepare_to_wait(wait_h, &wait, TASK_INTERRUPTIBLE);
> +             }
>
>               /*
> -              * vmstat counters are not perfectly accurate and the estimated
> -              * value for counters such as NR_FREE_PAGES can deviate from the
> -              * true value by nr_online_cpus * threshold. To avoid the zone
> -              * watermarks being breached while under pressure, we reduce the
> -              * per-cpu vmstat threshold while kswapd is awake and restore
> -              * them before going back to sleep.
> +              * After a short sleep, check if it was a premature sleep.
> +              * If not, then go fully to sleep until explicitly woken up.
>                */
> -             set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
> -             schedule();
> -             set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
> +             if (!sleeping_prematurely(pgdat, order,
> +                                     remaining, classzone_idx)) {
> +                     trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
> +                     set_pgdat_percpu_threshold(pgdat,
> +                                     calculate_normal_threshold);
> +                     schedule();
> +                     set_pgdat_percpu_threshold(pgdat,
> +                                     calculate_pressure_threshold);
> +             } else {
> +                     if (remaining)
> +                             count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
> +                     else
> +                             count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
> +             }
>       } else {
> -             if (remaining)
> -                     count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
> -             else
> -                     count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
> +             /* For now, we just check the remaining works.*/
> +             if (mem_cgroup_kswapd_can_sleep())
> +                     schedule();
>       }
>       finish_wait(wait_h, &wait);


>  }
>
> +static unsigned long shrink_mem_cgroup(struct mem_cgroup *mem_cont, int order)
> +{
> +     return 0;
> +}
> +
>  /*
>   * The background pageout daemon, started as a kernel thread
>   * from the init process.
> @@ -2636,6 +2644,7 @@ int kswapd(void *p)
>       int classzone_idx;
>       struct kswapd *kswapd_p = (struct kswapd *)p;
>       pg_data_t *pgdat = kswapd_p->kswapd_pgdat;
> +     struct mem_cgroup *mem;
>       struct task_struct *tsk = current;
>
>       struct reclaim_state reclaim_state = {
> @@ -2645,9 +2654,11 @@ int kswapd(void *p)
>
>       lockdep_set_current_reclaim_state(GFP_KERNEL);
>
> -     cpumask = cpumask_of_node(pgdat->node_id);
> -     if (!cpumask_empty(cpumask))
> -             set_cpus_allowed_ptr(tsk, cpumask);
> +     if (is_global_kswapd(kswapd_p)) {
> +             cpumask = cpumask_of_node(pgdat->node_id);
> +             if (!cpumask_empty(cpumask))
> +                     set_cpus_allowed_ptr(tsk, cpumask);
> +     }
>       current->reclaim_state = &reclaim_state;
>
>       /*
> @@ -2662,7 +2673,10 @@ int kswapd(void *p)
>        * us from recursively trying to free more memory as we're
>        * trying to free the first piece of memory in the first place).
>        */
> -     tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> +     if (is_global_kswapd(kswapd_p))
> +             tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> +     else
> +             tsk->flags |= PF_SWAPWRITE | PF_KSWAPD;
>       set_freezable();
>
>       order = 0;
> @@ -2672,36 +2686,48 @@ int kswapd(void *p)
>               int new_classzone_idx;
>               int ret;
>
> -             new_order = pgdat->kswapd_max_order;
> -             new_classzone_idx = pgdat->classzone_idx;
> -             pgdat->kswapd_max_order = 0;
> -             pgdat->classzone_idx = MAX_NR_ZONES - 1;
> -             if (order < new_order || classzone_idx > new_classzone_idx) {
> -                     /*
> -                      * Don't sleep if someone wants a larger 'order'
> -                      * allocation or has tigher zone constraints
> -                      */
> -                     order = new_order;
> -                     classzone_idx = new_classzone_idx;
> -             } else {
> -                     kswapd_try_to_sleep(kswapd_p, order, classzone_idx);
> -                     order = pgdat->kswapd_max_order;
> -                     classzone_idx = pgdat->classzone_idx;
> +             if (is_global_kswapd(kswapd_p)) {
> +                     new_order = pgdat->kswapd_max_order;
> +                     new_classzone_idx = pgdat->classzone_idx;
>                       pgdat->kswapd_max_order = 0;
>                       pgdat->classzone_idx = MAX_NR_ZONES - 1;
> -             }
> +                     if (order < new_order ||
> +                                     classzone_idx > new_classzone_idx) {
> +                             /*
> +                              * Don't sleep if someone wants a larger 'order'
> +                              * allocation or has tigher zone constraints
> +                              */
> +                             order = new_order;
> +                             classzone_idx = new_classzone_idx;
> +                     } else {
> +                             kswapd_try_to_sleep(kswapd_p, order,
> +                                                 classzone_idx);
> +                             order = pgdat->kswapd_max_order;
> +                             classzone_idx = pgdat->classzone_idx;
> +                             pgdat->kswapd_max_order = 0;
> +                             pgdat->classzone_idx = MAX_NR_ZONES - 1;

-ETOODEEPNEST.


> +                     }
> +             } else
> +                     kswapd_try_to_sleep(kswapd_p, order, classzone_idx);
>
>               ret = try_to_freeze();
>               if (kthread_should_stop())
>                       break;
>
> +             if (ret)
> +                     continue;
>               /*
>                * We can speed up thawing tasks if we don't call balance_pgdat
>                * after returning from the refrigerator
>                */
> -             if (!ret) {
> +             if (is_global_kswapd(kswapd_p)) {
>                       trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
>                       order = balance_pgdat(pgdat, order, &classzone_idx);
> +             } else {
> +                     mem = mem_cgroup_get_shrink_target();
> +                     if (mem)
> +                             shrink_mem_cgroup(mem, order);
> +                     mem_cgroup_put_shrink_target(mem);
>               }



>       }
>       return 0;
> @@ -2845,30 +2871,44 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
>   * This kswapd start function will be called by init and node-hot-add.
>   * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
>   */
> -int kswapd_run(int nid)
> +int kswapd_run(int nid, int memcgid)
>  {
> -     pg_data_t *pgdat = NODE_DATA(nid);
>       struct task_struct *kswapd_tsk;
> +     pg_data_t *pgdat = NULL;
>       struct kswapd *kswapd_p;
> +     static char name[TASK_COMM_LEN];
>       int ret = 0;
>
> -     if (pgdat->kswapd)
> -             return 0;
> +     if (!memcgid) {
> +             pgdat = NODE_DATA(nid);
> +             if (pgdat->kswapd)
> +                     return ret;
> +     }
>
>       kswapd_p = kzalloc(sizeof(struct kswapd), GFP_KERNEL);
>       if (!kswapd_p)
>               return -ENOMEM;
>
> -     pgdat->kswapd = kswapd_p;
> -     kswapd_p->kswapd_wait = &pgdat->kswapd_wait;
> -     kswapd_p->kswapd_pgdat = pgdat;
> +     if (!memcgid) {
> +             pgdat->kswapd = kswapd_p;
> +             kswapd_p->kswapd_wait = &pgdat->kswapd_wait;
> +             kswapd_p->kswapd_pgdat = pgdat;
> +             snprintf(name, TASK_COMM_LEN, "kswapd_%d", nid);
> +     } else {
> +             kswapd_p->kswapd_wait = mem_cgroup_kswapd_waitq();
> +             snprintf(name, TASK_COMM_LEN, "memcg_%d", memcgid);
> +     }
>
> -     kswapd_tsk = kthread_run(kswapd, kswapd_p, "kswapd%d", nid);

You seems to change kswapd name slightly.



> +     kswapd_tsk = kthread_run(kswapd, kswapd_p, name);
>       if (IS_ERR(kswapd_tsk)) {
>               /* failure at boot is fatal */
>               BUG_ON(system_state == SYSTEM_BOOTING);
> -             printk("Failed to start kswapd on node %d\n",nid);
> -             pgdat->kswapd = NULL;
> +             if (!memcgid) {
> +                     printk(KERN_ERR "Failed to start kswapd on node %d\n",
> +                                                             nid);
> +                     pgdat->kswapd = NULL;
> +             } else
> +                     printk(KERN_ERR "Failed to start kswapd on memcg\n");

Why don't you show memcg-id here?

will change. 


>               kfree(kswapd_p);
>               ret = -1;
>       } else
> @@ -2899,7 +2939,7 @@ static int __init kswapd_init(void)
>
>       swap_setup();
>       for_each_node_state(nid, N_HIGH_MEMORY)
> -             kswapd_run(nid);
> +             kswapd_run(nid, 0);
>       hotcpu_notifier(cpu_callback, 0);
>       return 0;
>  }
> --
> 1.7.3.1
>





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]