On Tue, Dec 7, 2010 at 4:33 AM, Mel Gorman <mel@xxxxxxxxx> wrote:
> On Mon, Nov 29, 2010 at 10:49:42PM -0800, Ying Han wrote:
>> There is a kswapd kernel thread for each memory node. We add a different
>> kswapd for each cgroup.
>
> What is considered a normal number of cgroups in production? 10, 50, 10000?
> If it's a really large number and all the cgroup kswapds wake at the same
> time, the zone LRU lock will be very heavily contended. Potentially there
> will also be a very large number of new IO sources. I confess I haven't read
> the thread yet, so maybe this has already been thought of, but it might make
> sense to have a 1:N relationship between kswapd and memcgroups and to cycle
> between containers. The difficulty will be the latency between when kswapd
> wakes up and when a particular container is scanned. The closer the ratio is
> to 1:1, the lower the latency will be, but the higher the contention on the
> LRU lock and the IO will be.
>
>> The kswapd sleeps in the wait queue headed at the kswapd_wait field of a
>> kswapd descriptor. The kswapd descriptor stores information about the node
>> or cgroup, and it allows the global and per-cgroup background reclaim to
>> share common reclaim algorithms.
>>
>> This patch adds the kswapd descriptor and changes the per-zone kswapd_wait
>> to the common data structure.
>>
>> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
>> ---
>>  include/linux/mmzone.h |    3 +-
>>  include/linux/swap.h   |   10 +++++
>>  mm/memcontrol.c        |    2 +
>>  mm/mmzone.c            |    2 +-
>>  mm/page_alloc.c        |    9 +++-
>>  mm/vmscan.c            |   98 +++++++++++++++++++++++++++++++++--------------
>>  6 files changed, 90 insertions(+), 34 deletions(-)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 39c24eb..c77dfa2 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -642,8 +642,7 @@ typedef struct pglist_data {
>>  	unsigned long node_spanned_pages; /* total size of physical page
>>  					     range, including holes */
>>  	int node_id;
>> -	wait_queue_head_t kswapd_wait;
>> -	struct task_struct *kswapd;
>> +	wait_queue_head_t *kswapd_wait;
>>  	int kswapd_max_order;
>>  } pg_data_t;
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index eba53e7..2e6cb58 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -26,6 +26,16 @@ static inline int current_is_kswapd(void)
>>  	return current->flags & PF_KSWAPD;
>>  }
>>
>> +struct kswapd {
>> +	struct task_struct *kswapd_task;
>> +	wait_queue_head_t kswapd_wait;
>> +	struct mem_cgroup *kswapd_mem;
>> +	pg_data_t *kswapd_pgdat;
>> +};
>> +
>> +#define MAX_KSWAPDS MAX_NUMNODES
>> +extern struct kswapd kswapds[MAX_KSWAPDS];
>
> This is potentially very large for a static structure. Can they not be
> dynamically allocated and kept on a list? Yes, there will be a list walk
> involved if you need a particular structure, but that looks like it's a
> rare operation at this point.

This has been changed to dynamic allocation in the V2 I am working on. The
kswapd descriptor is dynamically allocated at kswapd_run for both the per-node
and the per-cgroup kswapd, and there is no list walking since the descriptor
can be identified with container_of(wait, struct kswapd, kswapd_wait).

--Ying
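To make that concrete, the V2 lookup is along these lines (a sketch of the
direction only, not the final V2 code; kswapd_from_waitqueue() is just an
illustrative helper name, and locking/error handling are omitted):

/*
 * Any caller holding the wait queue pointer (pgdat->kswapd_wait for a node,
 * or the memcg's kswapd_wait) can get back to the dynamically allocated
 * descriptor without a list walk, because the wait queue head is embedded
 * in struct kswapd.
 */
static inline struct kswapd *kswapd_from_waitqueue(wait_queue_head_t *wait)
{
	return container_of(wait, struct kswapd, kswapd_wait);
}

/* For example, stopping a node's kswapd in the V2 scheme: */
void kswapd_stop(int nid)
{
	wait_queue_head_t *wait = NODE_DATA(nid)->kswapd_wait;
	struct kswapd *kswapd_p = kswapd_from_waitqueue(wait);

	if (kswapd_p->kswapd_task)
		kthread_stop(kswapd_p->kswapd_task);
	kfree(kswapd_p);	/* allocated in kswapd_run() in V2 */
}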
>
>> +int kswapd(void *p);
>>  /*
>>   * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
>>   * be swapped to.  The swap type and the offset into that swap type are
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index a4034b6..dca3590 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -263,6 +263,8 @@ struct mem_cgroup {
>>  	 */
>>  	struct mem_cgroup_stat_cpu nocpu_base;
>>  	spinlock_t pcp_counter_lock;
>> +
>> +	wait_queue_head_t *kswapd_wait;
>>  };
>>
>>  /* Stuffs for move charges at task migration. */
>> diff --git a/mm/mmzone.c b/mm/mmzone.c
>> index e35bfb8..c7cbed5 100644
>> --- a/mm/mmzone.c
>> +++ b/mm/mmzone.c
>> @@ -102,7 +102,7 @@ unsigned long zone_nr_free_pages(struct zone *zone)
>>  	 * free pages are low, get a better estimate for free pages
>>  	 */
>>  	if (nr_free_pages < zone->percpu_drift_mark &&
>> -			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
>> +			!waitqueue_active(zone->zone_pgdat->kswapd_wait))
>>  		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
>>
>>  	return nr_free_pages;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index b48dea2..a15bc1c 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4070,13 +4070,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>>  	int nid = pgdat->node_id;
>>  	unsigned long zone_start_pfn = pgdat->node_start_pfn;
>>  	int ret;
>> +	struct kswapd *kswapd_p;
>>
>>  	pgdat_resize_init(pgdat);
>>  	pgdat->nr_zones = 0;
>> -	init_waitqueue_head(&pgdat->kswapd_wait);
>>  	pgdat->kswapd_max_order = 0;
>>  	pgdat_page_cgroup_init(pgdat);
>> -
>> +
>> +	kswapd_p = &kswapds[nid];
>> +	init_waitqueue_head(&kswapd_p->kswapd_wait);
>> +	pgdat->kswapd_wait = &kswapd_p->kswapd_wait;
>> +	kswapd_p->kswapd_pgdat = pgdat;
>> +
>>  	for (j = 0; j < MAX_NR_ZONES; j++) {
>>  		struct zone *zone = pgdat->node_zones + j;
>>  		unsigned long size, realsize, memmap_pages;
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index b8a6fdc..e08005e 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -2115,12 +2115,18 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>>
>>  	return nr_reclaimed;
>>  }
>> +
>>  #endif
>>
>
> Unnecessary whitespace there.
>
>> +DEFINE_SPINLOCK(kswapds_spinlock);
>> +struct kswapd kswapds[MAX_KSWAPDS];
>> +
>>  /* is kswapd sleeping prematurely? */
>> -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>> +static int sleeping_prematurely(struct kswapd *kswapd, int order,
>> +				long remaining)
>>  {
>>  	int i;
>> +	pg_data_t *pgdat = kswapd->kswapd_pgdat;
>>
>
> This will behave strangely. You are using information from a *node* to
> determine if the kswapd belonging to a cgroup should sleep or not. The
> risk is that a cgroup kswapd never goes to sleep because even when all
> of its pages are discarded, the node itself is still not balanced.
>
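That is a fair concern; the check will need to be split by the type of kswapd
so that a cgroup kswapd judges its own memcg rather than the node. Roughly the
shape I have in mind (a sketch only; mem_cgroup_watermark_ok() is a
placeholder name for whatever per-memcg check ends up being used, not an
existing function):

/* Sketch: is this kswapd (global or per-cgroup) sleeping prematurely? */
static int sleeping_prematurely(struct kswapd *kswapd_p, int order,
				long remaining)
{
	int i;
	pg_data_t *pgdat = kswapd_p->kswapd_pgdat;

	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
	if (remaining)
		return 1;

	/* Per-cgroup kswapd: ask the memcg, not the node */
	if (kswapd_p->kswapd_mem)
		return !mem_cgroup_watermark_ok(kswapd_p->kswapd_mem);

	/* Global kswapd: keep the existing per-zone watermark checks */
	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		if (!populated_zone(zone))
			continue;
		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
				       0, 0))
			return 1;
	}
	return 0;
}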
>>  	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
>>  	if (remaining)
>> @@ -2377,21 +2383,28 @@ out:
>>   * If there are applications that are active memory-allocators
>>   * (most normal use), this basically shouldn't matter.
>>   */
>> -static int kswapd(void *p)
>> +int kswapd(void *p)
>>  {
>>  	unsigned long order;
>> -	pg_data_t *pgdat = (pg_data_t*)p;
>> +	struct kswapd *kswapd_p = (struct kswapd *)p;
>> +	pg_data_t *pgdat = kswapd_p->kswapd_pgdat;
>> +	struct mem_cgroup *mem = kswapd_p->kswapd_mem;
>> +	wait_queue_head_t *wait_h = &kswapd_p->kswapd_wait;
>>  	struct task_struct *tsk = current;
>>  	DEFINE_WAIT(wait);
>>  	struct reclaim_state reclaim_state = {
>>  		.reclaimed_slab = 0,
>>  	};
>> -	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>> +	const struct cpumask *cpumask;
>>
>>  	lockdep_set_current_reclaim_state(GFP_KERNEL);
>>
>> -	if (!cpumask_empty(cpumask))
>> -		set_cpus_allowed_ptr(tsk, cpumask);
>> +	if (pgdat) {
>> +		BUG_ON(pgdat->kswapd_wait != wait_h);
>> +		cpumask = cpumask_of_node(pgdat->node_id);
>> +		if (!cpumask_empty(cpumask))
>> +			set_cpus_allowed_ptr(tsk, cpumask);
>> +	}
>>  	current->reclaim_state = &reclaim_state;
>>
>>  	/*
>> @@ -2414,9 +2427,13 @@ static int kswapd(void *p)
>>  		unsigned long new_order;
>>  		int ret;
>>
>> -		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
>> -		new_order = pgdat->kswapd_max_order;
>> -		pgdat->kswapd_max_order = 0;
>> +		prepare_to_wait(wait_h, &wait, TASK_INTERRUPTIBLE);
>> +		if (pgdat) {
>> +			new_order = pgdat->kswapd_max_order;
>> +			pgdat->kswapd_max_order = 0;
>> +		} else
>> +			new_order = 0;
>> +
>>  		if (order < new_order) {
>>  			/*
>>  			 * Don't sleep if someone wants a larger 'order'
>> @@ -2428,10 +2445,12 @@ static int kswapd(void *p)
>>  			long remaining = 0;
>>
>>  			/* Try to sleep for a short interval */
>> -			if (!sleeping_prematurely(pgdat, order, remaining)) {
>> +			if (!sleeping_prematurely(kswapd_p, order,
>> +						  remaining)) {
>>  				remaining = schedule_timeout(HZ/10);
>> -				finish_wait(&pgdat->kswapd_wait, &wait);
>> -				prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
>> +				finish_wait(wait_h, &wait);
>> +				prepare_to_wait(wait_h, &wait,
>> +						TASK_INTERRUPTIBLE);
>
> It would be nice if patch 1 did nothing but move the wait queue outside of
> the node structure without any other functional change. It would then be
> far easier to review a patch that introduces background reclaim for
> containers.
>
>>  			}
>>
>>  			/*
>> @@ -2439,20 +2458,25 @@
>>  			 * premature sleep. If not, then go fully
>>  			 * to sleep until explicitly woken up
>>  			 */
>> -			if (!sleeping_prematurely(pgdat, order, remaining)) {
>> -				trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
>> +			if (!sleeping_prematurely(kswapd_p, order,
>> +						  remaining)) {
>> +				if (pgdat)
>> +					trace_mm_vmscan_kswapd_sleep(
>> +							pgdat->node_id);
>>  				schedule();
>>  			} else {
>>  				if (remaining)
>> -					count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
>> +					count_vm_event(
>> +						KSWAPD_LOW_WMARK_HIT_QUICKLY);
>>  				else
>> -					count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
>> +					count_vm_event(
>> +						KSWAPD_HIGH_WMARK_HIT_QUICKLY);
>>  			}
>>  		}
>> -
>> -		order = pgdat->kswapd_max_order;
>> +		if (pgdat)
>> +			order = pgdat->kswapd_max_order;
>>  	}
>> -	finish_wait(&pgdat->kswapd_wait, &wait);
>> +	finish_wait(wait_h, &wait);
>>
>>  	ret = try_to_freeze();
>>  	if (kthread_should_stop())
>> @@ -2476,6 +2500,7 @@ static int kswapd(void *p)
>>  void wakeup_kswapd(struct zone *zone, int order)
>>  {
>>  	pg_data_t *pgdat;
>> +	wait_queue_head_t *wait;
>>
>>  	if (!populated_zone(zone))
>>  		return;
>> @@ -2488,9 +2513,10 @@ void wakeup_kswapd(struct zone *zone, int order)
>>  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
>>  	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
>>  		return;
>> -	if (!waitqueue_active(&pgdat->kswapd_wait))
>> +	wait = pgdat->kswapd_wait;
>> +	if (!waitqueue_active(wait))
>>  		return;
>> -	wake_up_interruptible(&pgdat->kswapd_wait);
>> +	wake_up_interruptible(wait);
>>  }
>>
>>  /*
>> @@ -2587,7 +2613,10 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
>>
>>  			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
>>  				/* One of our CPUs online: restore mask */
>> -				set_cpus_allowed_ptr(pgdat->kswapd, mask);
>> +				if (kswapds[nid].kswapd_task)
>> +					set_cpus_allowed_ptr(
>> +						kswapds[nid].kswapd_task,
>> +						mask);
>>  		}
>>  	}
>>  	return NOTIFY_OK;
>> @@ -2599,19 +2628,20 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
>>   */
>>  int kswapd_run(int nid)
>>  {
>> -	pg_data_t *pgdat = NODE_DATA(nid);
>> +	struct task_struct *thr;
>>  	int ret = 0;
>>
>> -	if (pgdat->kswapd)
>> +	if (kswapds[nid].kswapd_task)
>>  		return 0;
>>
>> -	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
>> -	if (IS_ERR(pgdat->kswapd)) {
>> +	thr = kthread_run(kswapd, &kswapds[nid], "kswapd%d", nid);
>> +	if (IS_ERR(thr)) {
>>  		/* failure at boot is fatal */
>>  		BUG_ON(system_state == SYSTEM_BOOTING);
>>  		printk("Failed to start kswapd on node %d\n",nid);
>>  		ret = -1;
>>  	}
>> +	kswapds[nid].kswapd_task = thr;
>>  	return ret;
>>  }
>>
>> @@ -2620,10 +2650,20 @@ int kswapd_run(int nid)
>>   */
>>  void kswapd_stop(int nid)
>>  {
>> -	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
>> +	struct task_struct *thr;
>> +	struct kswapd *kswapd_p;
>> +	wait_queue_head_t *wait;
>> +
>> +	pg_data_t *pgdat = NODE_DATA(nid);
>> +
>> +	spin_lock(&kswapds_spinlock);
>> +	wait = pgdat->kswapd_wait;
>> +	kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> +	thr = kswapd_p->kswapd_task;
>> +	spin_unlock(&kswapds_spinlock);
>>
>> -	if (kswapd)
>> -		kthread_stop(kswapd);
>> +	if (thr)
>> +		kthread_stop(thr);
>>  }
>>
>>  static int __init kswapd_init(void)
>> --
>> 1.7.3.1
>>
>
> --
> Mel Gorman
> Part-time Phd Student                          Linux Technology Center
> University of Limerick                         IBM Dublin Software Lab
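P.S. For the per-cgroup side, the wakeup would key off the kswapd_wait pointer
this patch adds to struct mem_cgroup, mirroring wakeup_kswapd() above. Roughly
(a sketch only; mem_cgroup_wakeup_kswapd() is just an illustrative name, not
code from this series):

/*
 * Called from memcontrol.c when a cgroup decides it needs background
 * reclaim; the wait queue head itself lives in the dynamically allocated
 * struct kswapd, so the pointer is all the memcg needs to hold.
 */
static void mem_cgroup_wakeup_kswapd(struct mem_cgroup *mem)
{
	if (!mem->kswapd_wait || !waitqueue_active(mem->kswapd_wait))
		return;
	wake_up_interruptible(mem->kswapd_wait);
}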