There is one kswapd kernel thread for each NUMA node. We will add a separate
kswapd for each memcg. Each kswapd sleeps on the wait queue headed at the
kswapd_wait field of its kswapd descriptor. The descriptor stores per-node or
per-memcg information and lets the global and per-memcg background reclaim
share common reclaim algorithms. (A brief illustrative sketch of this
descriptor lookup is appended after the patch.)

This patch adds the kswapd descriptor and converts the per-node kswapd to use
the new structure.

changelog v2..v1:
1. dynamically allocate the kswapd descriptor and initialize the
   wait_queue_head of pgdat at kswapd_run.
2. add a helper macro is_node_kswapd to distinguish per-node/per-cgroup
   kswapd descriptors.

changelog v3..v2:
1. move the struct mem_cgroup *kswapd_mem in the kswapd struct to a later
   patch.
2. rename thr in kswapd_run to something else.

Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
---
 include/linux/mmzone.h |    3 +-
 include/linux/swap.h   |    7 ++++
 mm/page_alloc.c        |    1 -
 mm/vmscan.c            |   95 ++++++++++++++++++++++++++++++++++++------------
 4 files changed, 80 insertions(+), 26 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 628f07b..6cba7d2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -640,8 +640,7 @@ typedef struct pglist_data {
 	unsigned long node_spanned_pages; /* total size of physical page
 					     range, including holes */
 	int node_id;
-	wait_queue_head_t kswapd_wait;
-	struct task_struct *kswapd;
+	wait_queue_head_t *kswapd_wait;
 	int kswapd_max_order;
 	enum zone_type classzone_idx;
 } pg_data_t;
diff --git a/include/linux/swap.h b/include/linux/swap.h
index ed6ebe6..f43d406 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -26,6 +26,13 @@ static inline int current_is_kswapd(void)
 	return current->flags & PF_KSWAPD;
 }
 
+struct kswapd {
+	struct task_struct *kswapd_task;
+	wait_queue_head_t kswapd_wait;
+	pg_data_t *kswapd_pgdat;
+};
+
+int kswapd(void *p);
 /*
  * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
  * be swapped to. The swap type and the offset into that swap type are
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e1b52a..6340865 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4205,7 +4205,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
-	init_waitqueue_head(&pgdat->kswapd_wait);
 	pgdat->kswapd_max_order = 0;
 	pgdat_page_cgroup_init(pgdat);
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 060e4c1..77ac74f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2241,13 +2241,16 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	return balanced_pages > (present_pages >> 2);
 }
 
+static DEFINE_SPINLOCK(kswapds_spinlock);
+
 /* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
-					int classzone_idx)
+static int sleeping_prematurely(struct kswapd *kswapd, int order,
+				long remaining, int classzone_idx)
 {
 	int i;
 	unsigned long balanced = 0;
 	bool all_zones_ok = true;
+	pg_data_t *pgdat = kswapd->kswapd_pgdat;
 
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
@@ -2570,28 +2573,31 @@ out:
 	return order;
 }
 
-static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static void kswapd_try_to_sleep(struct kswapd *kswapd_p, int order,
+				int classzone_idx)
 {
 	long remaining = 0;
 	DEFINE_WAIT(wait);
+	pg_data_t *pgdat = kswapd_p->kswapd_pgdat;
+	wait_queue_head_t *wait_h = &kswapd_p->kswapd_wait;
 
 	if (freezing(current) || kthread_should_stop())
 		return;
 
-	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+	prepare_to_wait(wait_h, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (!sleeping_prematurely(kswapd_p, order, remaining, classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
-		finish_wait(&pgdat->kswapd_wait, &wait);
-		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+		finish_wait(wait_h, &wait);
+		prepare_to_wait(wait_h, &wait, TASK_INTERRUPTIBLE);
 	}
 
 	/*
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (!sleeping_prematurely(kswapd_p, order, remaining, classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -2611,7 +2617,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		else
 			count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
 	}
-	finish_wait(&pgdat->kswapd_wait, &wait);
+	finish_wait(wait_h, &wait);
 }
 
 /*
@@ -2627,20 +2633,24 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  * If there are applications that are active memory-allocators
  * (most normal use), this basically shouldn't matter.
  */
-static int kswapd(void *p)
+int kswapd(void *p)
 {
 	unsigned long order;
 	int classzone_idx;
-	pg_data_t *pgdat = (pg_data_t*)p;
+	struct kswapd *kswapd_p = (struct kswapd *)p;
+	pg_data_t *pgdat = kswapd_p->kswapd_pgdat;
+	wait_queue_head_t *wait_h = &kswapd_p->kswapd_wait;
 	struct task_struct *tsk = current;
 
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
-	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+	const struct cpumask *cpumask;
 
 	lockdep_set_current_reclaim_state(GFP_KERNEL);
 
+	BUG_ON(pgdat->kswapd_wait != wait_h);
+	cpumask = cpumask_of_node(pgdat->node_id);
 	if (!cpumask_empty(cpumask))
 		set_cpus_allowed_ptr(tsk, cpumask);
 	current->reclaim_state = &reclaim_state;
@@ -2679,7 +2689,7 @@ static int kswapd(void *p)
 			order = new_order;
 			classzone_idx = new_classzone_idx;
 		} else {
-			kswapd_try_to_sleep(pgdat, order, classzone_idx);
+			kswapd_try_to_sleep(kswapd_p, order, classzone_idx);
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order = 0;
@@ -2719,13 +2729,13 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 		pgdat->kswapd_max_order = order;
 		pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
 	}
-	if (!waitqueue_active(&pgdat->kswapd_wait))
+	if (!waitqueue_active(pgdat->kswapd_wait))
 		return;
 	if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
 		return;
 
 	trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
-	wake_up_interruptible(&pgdat->kswapd_wait);
+	wake_up_interruptible(pgdat->kswapd_wait);
 }
 
 /*
@@ -2817,12 +2827,23 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 		for_each_node_state(nid, N_HIGH_MEMORY) {
 			pg_data_t *pgdat = NODE_DATA(nid);
 			const struct cpumask *mask;
+			struct kswapd *kswapd_p;
+			struct task_struct *kswapd_thr;
+			wait_queue_head_t *wait;
 
 			mask = cpumask_of_node(pgdat->node_id);
+			spin_lock(&kswapds_spinlock);
+			wait = pgdat->kswapd_wait;
+			kswapd_p = container_of(wait, struct kswapd,
+						kswapd_wait);
+			kswapd_thr = kswapd_p->kswapd_task;
+			spin_unlock(&kswapds_spinlock);
+
 			if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
 				/* One of our CPUs online: restore mask */
-				set_cpus_allowed_ptr(pgdat->kswapd, mask);
+				if (kswapd_thr)
+					set_cpus_allowed_ptr(kswapd_thr, mask);
 		}
 	}
 	return NOTIFY_OK;
 }
@@ -2835,18 +2856,31 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 int kswapd_run(int nid)
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
+	struct task_struct *kswapd_thr;
+	struct kswapd *kswapd_p;
 	int ret = 0;
 
-	if (pgdat->kswapd)
+	if (pgdat->kswapd_wait)
 		return 0;
 
-	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
-	if (IS_ERR(pgdat->kswapd)) {
+	kswapd_p = kzalloc(sizeof(struct kswapd), GFP_KERNEL);
+	if (!kswapd_p)
+		return -ENOMEM;
+
+	init_waitqueue_head(&kswapd_p->kswapd_wait);
+	pgdat->kswapd_wait = &kswapd_p->kswapd_wait;
+	kswapd_p->kswapd_pgdat = pgdat;
+
+	kswapd_thr = kthread_run(kswapd, kswapd_p, "kswapd%d", nid);
+	if (IS_ERR(kswapd_thr)) {
 		/* failure at boot is fatal */
 		BUG_ON(system_state == SYSTEM_BOOTING);
 		printk("Failed to start kswapd on node %d\n",nid);
+		pgdat->kswapd_wait = NULL;
+		kfree(kswapd_p);
 		ret = -1;
-	}
+	} else
+		kswapd_p->kswapd_task = kswapd_thr;
 	return ret;
 }
 
@@ -2855,10 +2889,25 @@ int kswapd_run(int nid)
  */
 void kswapd_stop(int nid)
 {
-	struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+	struct task_struct *kswapd_thr = NULL;
+	struct kswapd *kswapd_p = NULL;
+	wait_queue_head_t *wait;
+
+	pg_data_t *pgdat = NODE_DATA(nid);
+
+	spin_lock(&kswapds_spinlock);
+	wait = pgdat->kswapd_wait;
+	if (wait) {
+		kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
+		kswapd_thr = kswapd_p->kswapd_task;
+		kswapd_p->kswapd_task = NULL;
+	}
+	spin_unlock(&kswapds_spinlock);
+
+	if (kswapd_thr)
+		kthread_stop(kswapd_thr);
 
-	if (kswapd)
-		kthread_stop(kswapd);
+	kfree(kswapd_p);
 }
 
 static int __init kswapd_init(void)
--
1.7.3.1
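
For readers less familiar with the pattern, here is a small, self-contained
userspace sketch of the descriptor lookup that cpu_callback() and
kswapd_stop() perform above: pg_data_t now keeps only a pointer to the wait
queue head embedded in a struct kswapd, and the owning descriptor is recovered
with container_of(). The wait_queue_head_t and pg_data_t below are simplified
stand-ins rather than the kernel definitions, and the program is illustrative
only, not part of the patch.

/*
 * Illustrative userspace sketch: simplified stand-ins for the kernel types,
 * showing how a struct kswapd is recovered from pgdat->kswapd_wait.
 */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { int unused; } wait_queue_head_t;	/* stand-in */

typedef struct pglist_data {				/* stand-in */
	int node_id;
	wait_queue_head_t *kswapd_wait;	/* points into a struct kswapd */
} pg_data_t;

struct kswapd {					/* mirrors the patch */
	void *kswapd_task;		/* struct task_struct * in the kernel */
	wait_queue_head_t kswapd_wait;
	pg_data_t *kswapd_pgdat;
};

/* Same idea as the kernel's container_of() in <linux/kernel.h>. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	pg_data_t pgdat = { .node_id = 0, .kswapd_wait = NULL };
	struct kswapd *kswapd_p;

	/* kswapd_run(): allocate the descriptor and wire up the pointers. */
	kswapd_p = calloc(1, sizeof(*kswapd_p));
	if (!kswapd_p)
		return 1;
	kswapd_p->kswapd_pgdat = &pgdat;
	pgdat.kswapd_wait = &kswapd_p->kswapd_wait;

	/* cpu_callback()/kswapd_stop(): recover the descriptor from the
	 * wait queue head pointer stored in the pgdat. */
	struct kswapd *found = container_of(pgdat.kswapd_wait,
					    struct kswapd, kswapd_wait);
	printf("descriptor recovered: %s\n", found == kswapd_p ? "yes" : "no");

	free(kswapd_p);
	return 0;
}

As the changelog above notes, this indirection is what lets the per-memcg
kswapd added in later patches share the same background reclaim code as the
per-node one, since neither the reclaim path nor the wakeup path needs to know
which kind of descriptor owns the wait queue.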