On Tue, Dec 7, 2010 at 4:33 AM, Mel Gorman <mel@xxxxxxxxx> wrote:
> On Mon, Nov 29, 2010 at 10:49:42PM -0800, Ying Han wrote:
>> There is a kswapd kernel thread for each memory node. We add a different
>> kswapd for each cgroup.
>
> What is considered a normal number of cgroups in production? 10, 50, 10000?
> If it's a really large number and all the cgroup kswapds wake at the same
> time, the zone LRU lock will be very heavily contended. Potentially there
> will also be a very large number of new IO sources. I confess I haven't read
> the thread yet, so maybe this has already been thought of, but it might make
> sense to have a 1:N relationship between kswapd and memcgroups and to cycle
> between containers. The difficulty will be the latency between when kswapd
> wakes up and when a particular container is scanned. The closer the ratio is
> to 1:1, the lower the latency will be, but the higher the contention on the
> LRU lock and the IO will be.
>
>> The kswapd sleeps in the wait queue headed at the kswapd_wait field of a
>> kswapd descriptor. The kswapd descriptor stores information about the node
>> or cgroup, and it allows the global and per-cgroup background reclaim to
>> share common reclaim algorithms.
>>
>> This patch adds the kswapd descriptor and changes the per-zone kswapd_wait
>> to the common data structure.
>>
>> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
>> ---
>>  include/linux/mmzone.h |    3 +-
>>  include/linux/swap.h   |   10 +++++
>>  mm/memcontrol.c        |    2 +
>>  mm/mmzone.c            |    2 +-
>>  mm/page_alloc.c        |    9 +++-
>>  mm/vmscan.c            |   98 +++++++++++++++++++++++++++++++++--------------
>>  6 files changed, 90 insertions(+), 34 deletions(-)
>>
>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>> index 39c24eb..c77dfa2 100644
>> --- a/include/linux/mmzone.h
>> +++ b/include/linux/mmzone.h
>> @@ -642,8 +642,7 @@ typedef struct pglist_data {
>>  	unsigned long node_spanned_pages; /* total size of physical page
>>  					     range, including holes */
>>  	int node_id;
>> -	wait_queue_head_t kswapd_wait;
>> -	struct task_struct *kswapd;
>> +	wait_queue_head_t *kswapd_wait;
>>  	int kswapd_max_order;
>>  } pg_data_t;
>>
>> diff --git a/include/linux/swap.h b/include/linux/swap.h
>> index eba53e7..2e6cb58 100644
>> --- a/include/linux/swap.h
>> +++ b/include/linux/swap.h
>> @@ -26,6 +26,16 @@ static inline int current_is_kswapd(void)
>>  	return current->flags & PF_KSWAPD;
>>  }
>>
>> +struct kswapd {
>> +	struct task_struct *kswapd_task;
>> +	wait_queue_head_t kswapd_wait;
>> +	struct mem_cgroup *kswapd_mem;
>> +	pg_data_t *kswapd_pgdat;
>> +};
>> +
>> +#define MAX_KSWAPDS MAX_NUMNODES
>> +extern struct kswapd kswapds[MAX_KSWAPDS];
>
> This is potentially very large for a static structure. Can they not be
> dynamically allocated and kept on a list? Yes, there will be a list walk
> involved if you need a particular structure, but that looks like it's a
> rare operation at this point.

This has been changed to dynamic allocation in the V2 I am working on. The
kswapd descriptor is dynamically allocated at kswapd_run for both the per-node
and the per-cgroup kswapd, and there is no list walking since the descriptor
can be identified with container_of(wait, struct kswapd, kswapd_wait).

--Ying
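To make that concrete, the V2 lookup is along these lines (a sketch of the
direction only, not the final V2 code; kswapd_from_waitqueue() is just an
illustrative helper name, and locking/error handling are omitted):

/*
 * Any caller holding the wait queue pointer (pgdat->kswapd_wait for a node,
 * or the memcg's kswapd_wait) can get back to the dynamically allocated
 * descriptor without a list walk, because the wait queue head is embedded
 * in struct kswapd.
 */
static inline struct kswapd *kswapd_from_waitqueue(wait_queue_head_t *wait)
{
	return container_of(wait, struct kswapd, kswapd_wait);
}

/* For example, stopping a node's kswapd in the V2 scheme: */
void kswapd_stop(int nid)
{
	wait_queue_head_t *wait = NODE_DATA(nid)->kswapd_wait;
	struct kswapd *kswapd_p = kswapd_from_waitqueue(wait);

	if (kswapd_p->kswapd_task)
		kthread_stop(kswapd_p->kswapd_task);
	kfree(kswapd_p);	/* allocated in kswapd_run() in V2 */
}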
>
>> +int kswapd(void *p);
>>  /*
>>   * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
>>   * be swapped to.  The swap type and the offset into that swap type are
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index a4034b6..dca3590 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -263,6 +263,8 @@ struct mem_cgroup {
>>  	 */
>>  	struct mem_cgroup_stat_cpu nocpu_base;
>>  	spinlock_t pcp_counter_lock;
>> +
>> +	wait_queue_head_t *kswapd_wait;
>>  };
>>
>>  /* Stuffs for move charges at task migration. */
>> diff --git a/mm/mmzone.c b/mm/mmzone.c
>> index e35bfb8..c7cbed5 100644
>> --- a/mm/mmzone.c
>> +++ b/mm/mmzone.c
>> @@ -102,7 +102,7 @@ unsigned long zone_nr_free_pages(struct zone *zone)
>>  	 * free pages are low, get a better estimate for free pages
>>  	 */
>>  	if (nr_free_pages < zone->percpu_drift_mark &&
>> -			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
>> +			!waitqueue_active(zone->zone_pgdat->kswapd_wait))
>>  		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
>>
>>  	return nr_free_pages;
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index b48dea2..a15bc1c 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -4070,13 +4070,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
>>  	int nid = pgdat->node_id;
>>  	unsigned long zone_start_pfn = pgdat->node_start_pfn;
>>  	int ret;
>> +	struct kswapd *kswapd_p;
>>
>>  	pgdat_resize_init(pgdat);
>>  	pgdat->nr_zones = 0;
>> -	init_waitqueue_head(&pgdat->kswapd_wait);
>>  	pgdat->kswapd_max_order = 0;
>>  	pgdat_page_cgroup_init(pgdat);
>> -
>> +
>> +	kswapd_p = &kswapds[nid];
>> +	init_waitqueue_head(&kswapd_p->kswapd_wait);
>> +	pgdat->kswapd_wait = &kswapd_p->kswapd_wait;
>> +	kswapd_p->kswapd_pgdat = pgdat;
>> +
>>  	for (j = 0; j < MAX_NR_ZONES; j++) {
>>  		struct zone *zone = pgdat->node_zones + j;
>>  		unsigned long size, realsize, memmap_pages;
>> diff --git a/mm/vmscan.c b/mm/vmscan.c
>> index b8a6fdc..e08005e 100644
>> --- a/mm/vmscan.c
>> +++ b/mm/vmscan.c
>> @@ -2115,12 +2115,18 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
>>
>>  	return nr_reclaimed;
>>  }
>> +
>>  #endif
>>
>
> Unnecessary whitespace there.
>
>> +DEFINE_SPINLOCK(kswapds_spinlock);
>> +struct kswapd kswapds[MAX_KSWAPDS];
>> +
>>  /* is kswapd sleeping prematurely? */
>> -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
>> +static int sleeping_prematurely(struct kswapd *kswapd, int order,
>> +				long remaining)
>>  {
>>  	int i;
>> +	pg_data_t *pgdat = kswapd->kswapd_pgdat;
>>
>
> This will behave strangely. You are using information from a *node* to
> determine if the kswapd belonging to a cgroup should sleep or not. The
> risk is that a cgroup kswapd never goes to sleep because even when all
> of its pages are discarded, the node itself is still not balanced.
>
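That is a fair concern; the check will need to be split by the type of kswapd
so that a cgroup kswapd judges its own memcg rather than the node. Roughly the
shape I have in mind (a sketch only; mem_cgroup_watermark_ok() is a
placeholder name for whatever per-memcg check ends up being used, not an
existing function):

/* Sketch: is this kswapd (global or per-cgroup) sleeping prematurely? */
static int sleeping_prematurely(struct kswapd *kswapd_p, int order,
				long remaining)
{
	int i;
	pg_data_t *pgdat = kswapd_p->kswapd_pgdat;

	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
	if (remaining)
		return 1;

	/* Per-cgroup kswapd: ask the memcg, not the node */
	if (kswapd_p->kswapd_mem)
		return !mem_cgroup_watermark_ok(kswapd_p->kswapd_mem);

	/* Global kswapd: keep the existing per-zone watermark checks */
	for (i = 0; i < pgdat->nr_zones; i++) {
		struct zone *zone = pgdat->node_zones + i;

		if (!populated_zone(zone))
			continue;
		if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
				       0, 0))
			return 1;
	}
	return 0;
}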
>>  	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
>>  	if (remaining)
>> @@ -2377,21 +2383,28 @@ out:
>>   * If there are applications that are active memory-allocators
>>   * (most normal use), this basically shouldn't matter.
>>   */
>> -static int kswapd(void *p)
>> +int kswapd(void *p)
>>  {
>>  	unsigned long order;
>> -	pg_data_t *pgdat = (pg_data_t*)p;
>> +	struct kswapd *kswapd_p = (struct kswapd *)p;
>> +	pg_data_t *pgdat = kswapd_p->kswapd_pgdat;
>> +	struct mem_cgroup *mem = kswapd_p->kswapd_mem;
>> +	wait_queue_head_t *wait_h = &kswapd_p->kswapd_wait;
>>  	struct task_struct *tsk = current;
>>  	DEFINE_WAIT(wait);
>>  	struct reclaim_state reclaim_state = {
>>  		.reclaimed_slab = 0,
>>  	};
>> -	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
>> +	const struct cpumask *cpumask;
>>
>>  	lockdep_set_current_reclaim_state(GFP_KERNEL);
>>
>> -	if (!cpumask_empty(cpumask))
>> -		set_cpus_allowed_ptr(tsk, cpumask);
>> +	if (pgdat) {
>> +		BUG_ON(pgdat->kswapd_wait != wait_h);
>> +		cpumask = cpumask_of_node(pgdat->node_id);
>> +		if (!cpumask_empty(cpumask))
>> +			set_cpus_allowed_ptr(tsk, cpumask);
>> +	}
>>  	current->reclaim_state = &reclaim_state;
>>
>>  	/*
>> @@ -2414,9 +2427,13 @@ static int kswapd(void *p)
>>  		unsigned long new_order;
>>  		int ret;
>>
>> -		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
>> -		new_order = pgdat->kswapd_max_order;
>> -		pgdat->kswapd_max_order = 0;
>> +		prepare_to_wait(wait_h, &wait, TASK_INTERRUPTIBLE);
>> +		if (pgdat) {
>> +			new_order = pgdat->kswapd_max_order;
>> +			pgdat->kswapd_max_order = 0;
>> +		} else
>> +			new_order = 0;
>> +
>>  		if (order < new_order) {
>>  			/*
>>  			 * Don't sleep if someone wants a larger 'order'
>> @@ -2428,10 +2445,12 @@ static int kswapd(void *p)
>>  			long remaining = 0;
>>
>>  			/* Try to sleep for a short interval */
>> -			if (!sleeping_prematurely(pgdat, order, remaining)) {
>> +			if (!sleeping_prematurely(kswapd_p, order,
>> +						  remaining)) {
>>  				remaining = schedule_timeout(HZ/10);
>> -				finish_wait(&pgdat->kswapd_wait, &wait);
>> -				prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
>> +				finish_wait(wait_h, &wait);
>> +				prepare_to_wait(wait_h, &wait,
>> +						TASK_INTERRUPTIBLE);
>
> It would be nice if patch 1 did nothing but move the wait queue outside of
> the node structure without any other functional change. It would then be
> far easier to review a patch that introduces background reclaim for
> containers.
>
>>  			}
>>
>>  			/*
>> @@ -2439,20 +2458,25 @@
>>  			 * premature sleep. If not, then go fully
>>  			 * to sleep until explicitly woken up
>>  			 */
>> -			if (!sleeping_prematurely(pgdat, order, remaining)) {
>> -				trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
>> +			if (!sleeping_prematurely(kswapd_p, order,
>> +						  remaining)) {
>> +				if (pgdat)
>> +					trace_mm_vmscan_kswapd_sleep(
>> +							pgdat->node_id);
>>  				schedule();
>>  			} else {
>>  				if (remaining)
>> -					count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
>> +					count_vm_event(
>> +						KSWAPD_LOW_WMARK_HIT_QUICKLY);
>>  				else
>> -					count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
>> +					count_vm_event(
>> +						KSWAPD_HIGH_WMARK_HIT_QUICKLY);
>>  			}
>>  		}
>> -
>> -		order = pgdat->kswapd_max_order;
>> +		if (pgdat)
>> +			order = pgdat->kswapd_max_order;
>>  	}
>> -	finish_wait(&pgdat->kswapd_wait, &wait);
>> +	finish_wait(wait_h, &wait);
>>
>>  	ret = try_to_freeze();
>>  	if (kthread_should_stop())
>> @@ -2476,6 +2500,7 @@ static int kswapd(void *p)
>>  void wakeup_kswapd(struct zone *zone, int order)
>>  {
>>  	pg_data_t *pgdat;
>> +	wait_queue_head_t *wait;
>>
>>  	if (!populated_zone(zone))
>>  		return;
>> @@ -2488,9 +2513,10 @@ void wakeup_kswapd(struct zone *zone, int order)
>>  	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
>>  	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
>>  		return;
>> -	if (!waitqueue_active(&pgdat->kswapd_wait))
>> +	wait = pgdat->kswapd_wait;
>> +	if (!waitqueue_active(wait))
>>  		return;
>> -	wake_up_interruptible(&pgdat->kswapd_wait);
>> +	wake_up_interruptible(wait);
>>  }
>>
>>  /*
>> @@ -2587,7 +2613,10 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
>>
>>  			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
>>  				/* One of our CPUs online: restore mask */
>> -				set_cpus_allowed_ptr(pgdat->kswapd, mask);
>> +				if (kswapds[nid].kswapd_task)
>> +					set_cpus_allowed_ptr(
>> +						kswapds[nid].kswapd_task,
>> +						mask);
>>  		}
>>  	}
>>  	return NOTIFY_OK;
>> @@ -2599,19 +2628,20 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
>>   */
>>  int kswapd_run(int nid)
>>  {
>> -	pg_data_t *pgdat = NODE_DATA(nid);
>> +	struct task_struct *thr;
>>  	int ret = 0;
>>
>> -	if (pgdat->kswapd)
>> +	if (kswapds[nid].kswapd_task)
>>  		return 0;
>>
>> -	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
>> -	if (IS_ERR(pgdat->kswapd)) {
>> +	thr = kthread_run(kswapd, &kswapds[nid], "kswapd%d", nid);
>> +	if (IS_ERR(thr)) {
>>  		/* failure at boot is fatal */
>>  		BUG_ON(system_state == SYSTEM_BOOTING);
>>  		printk("Failed to start kswapd on node %d\n",nid);
>>  		ret = -1;
>>  	}
>> +	kswapds[nid].kswapd_task = thr;
>>  	return ret;
>>  }
>>
>> @@ -2620,10 +2650,20 @@ int kswapd_run(int nid)
>>   */
>>  void kswapd_stop(int nid)
>>  {
>> -	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
>> +	struct task_struct *thr;
>> +	struct kswapd *kswapd_p;
>> +	wait_queue_head_t *wait;
>> +
>> +	pg_data_t *pgdat = NODE_DATA(nid);
>> +
>> +	spin_lock(&kswapds_spinlock);
>> +	wait = pgdat->kswapd_wait;
>> +	kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>> +	thr = kswapd_p->kswapd_task;
>> +	spin_unlock(&kswapds_spinlock);
>>
>> -	if (kswapd)
>> -		kthread_stop(kswapd);
>> +	if (thr)
>> +		kthread_stop(thr);
>>  }
>>
>>  static int __init kswapd_init(void)
>> --
>> 1.7.3.1
>>
>
> --
> Mel Gorman
> Part-time Phd Student                          Linux Technology Center
> University of Limerick                         IBM Dublin Software Lab
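P.S. For the per-cgroup side, the wakeup would key off the kswapd_wait pointer
this patch adds to struct mem_cgroup, mirroring wakeup_kswapd() above. Roughly
(a sketch only; mem_cgroup_wakeup_kswapd() is just an illustrative name, not
code from this series):

/*
 * Called from memcontrol.c when a cgroup decides it needs background
 * reclaim; the wait queue head itself lives in the dynamically allocated
 * struct kswapd, so the pointer is all the memcg needs to hold.
 */
static void mem_cgroup_wakeup_kswapd(struct mem_cgroup *mem)
{
	if (!mem->kswapd_wait || !waitqueue_active(mem->kswapd_wait))
		return;
	wake_up_interruptible(mem->kswapd_wait);
}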