On Mon, Nov 29, 2010 at 10:49:42PM -0800, Ying Han wrote:
> There is a kswapd kernel thread for each memory node. We add a different kswapd
> for each cgroup.

What is considered a normal number of cgroups in production? 10, 50,
10000? If it's a really large number and all the cgroup kswapds wake at
the same time, the zone LRU lock will be very heavily contended.
Potentially there will also be a very large number of new IO sources.
I confess I haven't read the thread yet, so maybe this has already been
thought of, but it might make sense to have a 1:N relationship between
kswapd and memcgroups and cycle between containers. The difficulty will
be the latency between when kswapd wakes up and when a particular
container is scanned. The closer the ratio is to 1:1, the lower the
latency will be, but the higher the contention on the LRU lock and the
IO will be.
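Very roughly, the 1:N arrangement I have in mind would look something
like the sketch below. This is untested and purely illustrative: the
memcg_list/kswapd_node fields and the kswapd_needs_work() and
balance_mem_cgroup() helpers are invented names, not anything in the
posted patch.

/*
 * Sketch: one kswapd thread serving N memcgroups round-robin.
 * memcg_list/kswapd_node and the two helpers are placeholders.
 */
static int memcg_kswapd(void *p)
{
	struct kswapd *kswapd_p = p;
	DEFINE_WAIT(wait);

	while (!kthread_should_stop()) {
		struct mem_cgroup *mem;

		prepare_to_wait(&kswapd_p->kswapd_wait, &wait,
				TASK_INTERRUPTIBLE);
		if (!kswapd_needs_work(kswapd_p) && !kthread_should_stop())
			schedule();
		finish_wait(&kswapd_p->kswapd_wait, &wait);

		/* Cycle through the containers assigned to this thread */
		list_for_each_entry(mem, &kswapd_p->memcg_list, kswapd_node)
			balance_mem_cgroup(mem);
	}
	return 0;
}

Whether the walk is round-robin or priority-based matters less than the
fact that the number of kernel threads and IO sources stops scaling
with the number of cgroups.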
> The kswapd is sleeping in the wait queue headed at kswapd_wait
> field of a kswapd descriptor. The kswapd descriptor stores information of node
> or cgroup and it allows the global and per cgroup background reclaim to share
> common reclaim algorithms.
>
> This patch addes the kswapd descriptor and changes per zone kswapd_wait to the
> common data structure.
>
> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
> ---
>  include/linux/mmzone.h |  3 +-
>  include/linux/swap.h   | 10 +++++
>  mm/memcontrol.c        |  2 +
>  mm/mmzone.c            |  2 +-
>  mm/page_alloc.c        |  9 +++-
>  mm/vmscan.c            | 98 +++++++++++++++++++++++++++++++++--------------
>  6 files changed, 90 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 39c24eb..c77dfa2 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -642,8 +642,7 @@ typedef struct pglist_data {
> unsigned long node_spanned_pages; /* total size of physical page
> range, including holes */
> int node_id;
> - wait_queue_head_t kswapd_wait;
> - struct task_struct *kswapd;
> + wait_queue_head_t *kswapd_wait;
> int kswapd_max_order;
> } pg_data_t;
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index eba53e7..2e6cb58 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -26,6 +26,16 @@ static inline int current_is_kswapd(void)
> return current->flags & PF_KSWAPD;
> }
>
> +struct kswapd {
> + struct task_struct *kswapd_task;
> + wait_queue_head_t kswapd_wait;
> + struct mem_cgroup *kswapd_mem;
> + pg_data_t *kswapd_pgdat;
> +};
> +
> +#define MAX_KSWAPDS MAX_NUMNODES
> +extern struct kswapd kswapds[MAX_KSWAPDS];

This is potentially very large for a static structure. Can they not be
dynamically allocated and kept on a list? Yes, there will be a list
walk involved if you need a particular structure, but that looks like
it's a rare operation at this point.
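For illustration, something along these lines is what I mean. Untested
sketch only: the kswapd_list field, the locking and both helpers are
invented here and are not part of the posted patch.

/*
 * Sketch: allocate a kswapd descriptor when it is needed and keep it
 * on a list instead of a MAX_NUMNODES-sized static array. Assumes
 * struct kswapd grows a "struct list_head kswapd_list" member.
 */
static LIST_HEAD(kswapd_list);
static DEFINE_SPINLOCK(kswapd_list_lock);

static struct kswapd *kswapd_alloc(pg_data_t *pgdat, struct mem_cgroup *mem)
{
	struct kswapd *kswapd_p = kzalloc(sizeof(*kswapd_p), GFP_KERNEL);

	if (!kswapd_p)
		return NULL;

	init_waitqueue_head(&kswapd_p->kswapd_wait);
	kswapd_p->kswapd_pgdat = pgdat;
	kswapd_p->kswapd_mem = mem;

	spin_lock(&kswapd_list_lock);
	list_add(&kswapd_p->kswapd_list, &kswapd_list);
	spin_unlock(&kswapd_list_lock);

	return kswapd_p;
}

/* The rare lookup becomes a list walk */
static struct kswapd *kswapd_find(pg_data_t *pgdat, struct mem_cgroup *mem)
{
	struct kswapd *kswapd_p;

	spin_lock(&kswapd_list_lock);
	list_for_each_entry(kswapd_p, &kswapd_list, kswapd_list) {
		if (kswapd_p->kswapd_pgdat == pgdat &&
		    kswapd_p->kswapd_mem == mem) {
			spin_unlock(&kswapd_list_lock);
			return kswapd_p;
		}
	}
	spin_unlock(&kswapd_list_lock);
	return NULL;
}

kswapd_run() would then pass the allocated descriptor to kthread_run()
instead of indexing into the static array.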
> +int kswapd(void *p);
> /*
> * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
> * be swapped to. The swap type and the offset into that swap type are
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index a4034b6..dca3590 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -263,6 +263,8 @@ struct mem_cgroup {
> */
> struct mem_cgroup_stat_cpu nocpu_base;
> spinlock_t pcp_counter_lock;
> +
> + wait_queue_head_t *kswapd_wait;
> };
>
> /* Stuffs for move charges at task migration. */
> diff --git a/mm/mmzone.c b/mm/mmzone.c
> index e35bfb8..c7cbed5 100644
> --- a/mm/mmzone.c
> +++ b/mm/mmzone.c
> @@ -102,7 +102,7 @@ unsigned long zone_nr_free_pages(struct zone *zone)
> * free pages are low, get a better estimate for free pages
> */
> if (nr_free_pages < zone->percpu_drift_mark &&
> - !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
> + !waitqueue_active(zone->zone_pgdat->kswapd_wait))
> return zone_page_state_snapshot(zone, NR_FREE_PAGES);
>
> return nr_free_pages;
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index b48dea2..a15bc1c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -4070,13 +4070,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
> int nid = pgdat->node_id;
> unsigned long zone_start_pfn = pgdat->node_start_pfn;
> int ret;
> + struct kswapd *kswapd_p;
>
> pgdat_resize_init(pgdat);
> pgdat->nr_zones = 0;
> - init_waitqueue_head(&pgdat->kswapd_wait);
> pgdat->kswapd_max_order = 0;
> pgdat_page_cgroup_init(pgdat);
> -
> +
> + kswapd_p = &kswapds[nid];
> + init_waitqueue_head(&kswapd_p->kswapd_wait);
> + pgdat->kswapd_wait = &kswapd_p->kswapd_wait;
> + kswapd_p->kswapd_pgdat = pgdat;
> +
> for (j = 0; j < MAX_NR_ZONES; j++) {
> struct zone *zone = pgdat->node_zones + j;
> unsigned long size, realsize, memmap_pages;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index b8a6fdc..e08005e 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2115,12 +2115,18 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>
> return nr_reclaimed;
> }
> +
> #endif
>

Unnecessary whitespace there.

> +DEFINE_SPINLOCK(kswapds_spinlock);
> +struct kswapd kswapds[MAX_KSWAPDS];
> +
> /* is kswapd sleeping prematurely? */
> -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
> +static int sleeping_prematurely(struct kswapd *kswapd, int order,
> + long remaining)
> {
> int i;
> + pg_data_t *pgdat = kswapd->kswapd_pgdat;
>

This will behave strangely. You are using information from a *node* to
determine if the kswapd belonging to a cgroup should sleep or not. The
risk is that a cgroup kswapd never goes to sleep because even when all
of its pages are discarded, the node itself is still not balanced.
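To make that concrete, a per-cgroup kswapd presumably needs to branch
on whether it is driving a node or a cgroup, along the lines of the
sketch below. mem_cgroup_watermark_ok() is a made-up name standing in
for whatever per-cgroup "am I done?" test the series ends up using.

static int sleeping_prematurely(struct kswapd *kswapd, int order,
				long remaining)
{
	int i;
	pg_data_t *pgdat = kswapd->kswapd_pgdat;

	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
	if (remaining)
		return 1;

	/*
	 * A cgroup kswapd has no node to balance; judge sleep against
	 * the cgroup itself (mem_cgroup_watermark_ok() is hypothetical).
	 */
	if (!pgdat)
		return !mem_cgroup_watermark_ok(kswapd->kswapd_mem);

	/* Global kswapd: the existing per-zone watermark checks */
	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		if (!populated_zone(zone))
			continue;
		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
				       0, 0))
			return 1;
	}
	return 0;
}

Otherwise a cgroup kswapd that has reclaimed everything it can will
keep being told the node is unbalanced and never sleep.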
> /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
> if (remaining)
> @@ -2377,21 +2383,28 @@ out:
> * If there are applications that are active memory-allocators
> * (most normal use), this basically shouldn't matter.
> */
> -static int kswapd(void *p)
> +int kswapd(void *p)
> {
> unsigned long order;
> - pg_data_t *pgdat = (pg_data_t*)p;
> + struct kswapd *kswapd_p = (struct kswapd *)p;
> + pg_data_t *pgdat = kswapd_p->kswapd_pgdat;
> + struct mem_cgroup *mem = kswapd_p->kswapd_mem;
> + wait_queue_head_t *wait_h = &kswapd_p->kswapd_wait;
> struct task_struct *tsk = current;
> DEFINE_WAIT(wait);
> struct reclaim_state reclaim_state = {
> .reclaimed_slab = 0,
> };
> - const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
> + const struct cpumask *cpumask;
>
> lockdep_set_current_reclaim_state(GFP_KERNEL);
>
> - if (!cpumask_empty(cpumask))
> - set_cpus_allowed_ptr(tsk, cpumask);
> + if (pgdat) {
> + BUG_ON(pgdat->kswapd_wait != wait_h);
> + cpumask = cpumask_of_node(pgdat->node_id);
> + if (!cpumask_empty(cpumask))
> + set_cpus_allowed_ptr(tsk, cpumask);
> + }
> current->reclaim_state = &reclaim_state;
>
> /*
> @@ -2414,9 +2427,13 @@ static int kswapd(void *p)
> unsigned long new_order;
> int ret;
>
> - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
> - new_order = pgdat->kswapd_max_order;
> - pgdat->kswapd_max_order = 0;
> + prepare_to_wait(wait_h, &wait, TASK_INTERRUPTIBLE);
> + if (pgdat) {
> + new_order = pgdat->kswapd_max_order;
> + pgdat->kswapd_max_order = 0;
> + } else
> + new_order = 0;
> +
> if (order < new_order) {
> /*
> * Don't sleep if someone wants a larger 'order'
> @@ -2428,10 +2445,12 @@ static int kswapd(void *p)
> long remaining = 0;
>
> /* Try to sleep for a short interval */
> - if (!sleeping_prematurely(pgdat, order, remaining)) {
> + if (!sleeping_prematurely(kswapd_p, order,
> + remaining)) {
> remaining = schedule_timeout(HZ/10);
> - finish_wait(&pgdat->kswapd_wait, &wait);
> - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
> + finish_wait(wait_h, &wait);
> + prepare_to_wait(wait_h, &wait,
> + TASK_INTERRUPTIBLE);

It would be nice if patch 1 did nothing but move the wait queue outside
of the node structure without any other functional change. It'll then
be far easier to review a patch that introduces background reclaim for
containers.

> }
>
> /*
> @@ -2439,20 +2458,25 @@ static int kswapd(void *p)
> * premature sleep. If not, then go fully
> * to sleep until explicitly woken up
> */
> - if (!sleeping_prematurely(pgdat, order, remaining)) {
> - trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
> + if (!sleeping_prematurely(kswapd_p, order,
> + remaining)) {
> + if (pgdat)
> + trace_mm_vmscan_kswapd_sleep(
> + pgdat->node_id);
> schedule();
> } else {
> if (remaining)
> - count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
> + count_vm_event(
> + KSWAPD_LOW_WMARK_HIT_QUICKLY);
> else
> - count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
> + count_vm_event(
> + KSWAPD_HIGH_WMARK_HIT_QUICKLY);
> }
> }
> -
> - order = pgdat->kswapd_max_order;
> + if (pgdat)
> + order = pgdat->kswapd_max_order;
> }
> - finish_wait(&pgdat->kswapd_wait, &wait);
> + finish_wait(wait_h, &wait);
>
> ret = try_to_freeze();
> if (kthread_should_stop())
> @@ -2476,6 +2500,7 @@ static int kswapd(void *p)
> void wakeup_kswapd(struct zone *zone, int order)
> {
> pg_data_t *pgdat;
> + wait_queue_head_t *wait;
>
> if (!populated_zone(zone))
> return;
> @@ -2488,9 +2513,10 @@ void wakeup_kswapd(struct zone *zone, int order)
> trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
> if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
> return;
> - if (!waitqueue_active(&pgdat->kswapd_wait))
> + wait = pgdat->kswapd_wait;
> + if (!waitqueue_active(wait))
> return;
> - wake_up_interruptible(&pgdat->kswapd_wait);
> + wake_up_interruptible(wait);
> }
>
> /*
> @@ -2587,7 +2613,10 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
>
> if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
> /* One of our CPUs online: restore mask */
> - set_cpus_allowed_ptr(pgdat->kswapd, mask);
> + if (kswapds[nid].kswapd_task)
> + set_cpus_allowed_ptr(
> + kswapds[nid].kswapd_task,
> + mask);
> }
> }
> return NOTIFY_OK;
> @@ -2599,19 +2628,20 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
> */
> int kswapd_run(int nid)
> {
> - pg_data_t *pgdat = NODE_DATA(nid);
> + struct task_struct *thr;
> int ret = 0;
>
> - if (pgdat->kswapd)
> + if (kswapds[nid].kswapd_task)
> return 0;
>
> - pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
> - if (IS_ERR(pgdat->kswapd)) {
> + thr = kthread_run(kswapd, &kswapds[nid], "kswapd%d", nid);
> + if (IS_ERR(thr)) {
> /* failure at boot is fatal */
> BUG_ON(system_state == SYSTEM_BOOTING);
> printk("Failed to start kswapd on node %d\n",nid);
> ret = -1;
> }
> + kswapds[nid].kswapd_task = thr;
> return ret;
> }
>
> @@ -2620,10 +2650,20 @@ int kswapd_run(int nid)
> */
> void kswapd_stop(int nid)
> {
> - struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
> + struct task_struct *thr;
> + struct kswapd *kswapd_p;
> + wait_queue_head_t *wait;
> +
> + pg_data_t *pgdat = NODE_DATA(nid);
> +
> + spin_lock(&kswapds_spinlock);
> + wait = pgdat->kswapd_wait;
> + kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
> + thr = kswapd_p->kswapd_task;
> + spin_unlock(&kswapds_spinlock);
>
> - if (kswapd)
> - kthread_stop(kswapd);
> + if (thr)
> + kthread_stop(thr);
> }
>
> static int __init kswapd_init(void)
> --
> 1.7.3.1
>

-- 
Mel Gorman
Part-time Phd Student                          Linux Technology Center
University of Limerick                         IBM Dublin Software Lab