[PATCH/RFC 7/11] numa - Automatic-migration - add internode migration delay

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



AutoPage Migration - add hysteresis to internode migration

This patch adds hysteresis to the internode migration to prevent
page migration thrashing when automatic scheduler driven page migration
is enabled.

Add static inline function "too_soon_for_internode_migration"
[stub returning 0 if !CONFIG_AUTO_MIGRATION] to check for attempts to
move a task to a new node sooner than auto_migrate_interval jiffies
after the previous migration.  Note:  fetches the interval from the task
struct to avoid a callout to a cpuset function, with an rcu_lock/unlock
round trip, on each migration check.  The task's auto_migrate_interval
is updated from cpuset_update_task_memory_state().

Modify the try_to_wake_up() path [select_task_rq_fair()] to leave the
task on its current cpu if it is too soon to move it to a different node.

Modify can_migrate_task() to "just say no!" if the load balancer
proposes an internode migration too soon after previous internode
migration.

Also fix the comment block on can_migrate_task() to reflect the
order of tests in the current code.

Added a control file--auto_migrate_interval--to cpusets to
query/set the per cpuset interval, in seconds.  Provide some fairly
arbitrary min, max and default values (5, 300 and 30 seconds).

Signed-off-by:  Lee Schermerhorn <lee.schermerhorn@xxxxxx>

 include/linux/auto-migrate.h |   30 ++++++++++++++++++++++++++
 include/linux/sched.h        |    2 +
 kernel/cpuset.c              |   49 +++++++++++++++++++++++++++++++++++++++++--
 kernel/sched_fair.c          |   18 +++++++++++++--
 4 files changed, 94 insertions(+), 5 deletions(-)

Index: linux-2.6.36-mmotm-101103-1217/include/linux/sched.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/sched.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/sched.h
@@ -1462,6 +1462,8 @@ struct task_struct {
 #ifdef CONFIG_AUTO_MIGRATION
 	short migrate_pending:1;	/* internode mem migration pending */
 #endif
+	unsigned long next_migrate;	/* internode migration hysteresis */
+	unsigned long auto_migrate_interval;	/* from cpuset */
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
Index: linux-2.6.36-mmotm-101103-1217/include/linux/auto-migrate.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/auto-migrate.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/auto-migrate.h
@@ -12,6 +12,10 @@
 extern int is_auto_migration(int flags);
 extern void auto_migrate_task_memory(void);
 
+#define AUTO_MIGRATE_INTERVAL_DFLT (30*HZ)
+#define AUTO_MIGRATE_INTERVAL_MIN (5*HZ)
+#define AUTO_MIGRATE_INTERVAL_MAX (300*HZ)
+
 #ifdef SCHED_AUTO_MIGRATION
 /* these need sched.h definition.  They're only where sched.h is
  * already included.  Note we depend on sched.h being included
@@ -27,6 +31,24 @@ static inline void check_internode_migra
 }
 
 /*
+ * To avoid page migration thrashing when auto memory migration is enabled,
+ * check user task for too recent internode migration.
+ */
+static inline int too_soon_for_internode_migration(struct task_struct *task,
+								int this_cpu)
+{
+	if (auto_migrate_enabled(task) && task->mm &&
+		cpu_to_node(task_cpu(task)) != cpu_to_node(this_cpu)) {
+
+		if (task->migrate_pending ||
+			time_before(jiffies, task->next_migrate))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
  * called only by arch dependent code for architectures that
  * support "migration work"
  */
@@ -40,6 +62,8 @@ static inline void check_migrate_pending
 			local_irq_enable();
 		}
 
+		current->next_migrate = jiffies
+			 + current->auto_migrate_interval;
 		/*
 		 * can't be called in atomic context.
 		 */
@@ -71,6 +95,12 @@ static inline void check_migrate_pending
 	clear_thread_flag(TIF_NOTIFY_RESUME);
 }
 
+static inline int too_soon_for_internode_migration(struct task_struct *tsk,
+								int cpu)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_AUTO_MIGRATION */
 
 #endif
Index: linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/kernel/cpuset.c
+++ linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c
@@ -53,6 +53,7 @@
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
+#include <linux/auto-migrate.h>
 
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
@@ -99,6 +100,8 @@ struct cpuset {
 
 	struct fmeter fmeter;		/* memory_pressure filter */
 
+	unsigned long auto_migrate_interval;
+
 	/* partition number for rebuild_sched_domains() */
 	int pn;
 
@@ -196,6 +199,7 @@ static inline int is_auto_migrate(const
 
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+	.auto_migrate_interval = AUTO_MIGRATE_INTERVAL_DFLT,
 };
 
 /*
@@ -358,9 +362,10 @@ static void cpuset_update_task_cpuset_fl
 		set_migrate_on_fault_enabled(tsk, 1);
 	else
 		set_migrate_on_fault_enabled(tsk, 0);
-	if (is_auto_migrate(cs))
+	if (is_auto_migrate(cs)) {
 		set_auto_migrate_enabled(tsk, 1);
-	else
+		tsk->auto_migrate_interval = cs->auto_migrate_interval;
+	} else
 		set_auto_migrate_enabled(tsk, 0);
 
 }
@@ -1526,6 +1531,28 @@ alloc_fail:
 	NODEMASK_FREE(to);
 }
 
+/*
+ * Call with manage_mutex held.
+ */
+static int update_auto_migrate_interval(struct cpuset *cs, u64 val)
+{
+	unsigned long n = val * HZ;	/* scale seconds to ticks */
+
+	if (n == cs->auto_migrate_interval)
+		return 0;
+
+	/*
+	 * silently clip to min/max
+	 */
+	if (n < AUTO_MIGRATE_INTERVAL_MIN)
+		cs->auto_migrate_interval = AUTO_MIGRATE_INTERVAL_MIN;
+	else if (n > AUTO_MIGRATE_INTERVAL_MAX)
+		cs->auto_migrate_interval = AUTO_MIGRATE_INTERVAL_MAX;
+	else
+		cs->auto_migrate_interval = n;
+	return 0;
+}
+
 /* The various types of files and directories in a cpuset file system */
 
 typedef enum {
@@ -1545,6 +1572,7 @@ typedef enum {
 	FILE_SHARED_FILE_POLICY,
 	FILE_MIGRATE_ON_FAULT,
 	FILE_AUTO_MIGRATE,
+	FILE_AUTO_MIGRATE_INTERVAL,
 } cpuset_filetype_t;
 
 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
@@ -1596,6 +1624,9 @@ static int cpuset_write_u64(struct cgrou
 	case FILE_AUTO_MIGRATE:
 		retval = update_flag(CS_AUTO_MIGRATE, cs, val);
 		break;
+	case FILE_AUTO_MIGRATE_INTERVAL:
+		retval = update_auto_migrate_interval(cs, val);
+		break;
 	default:
 		retval = -EINVAL;
 		break;
@@ -1725,6 +1756,9 @@ static ssize_t cpuset_common_file_read(s
 	case FILE_MEMLIST:
 		s += cpuset_sprintf_memlist(s, cs);
 		break;
+	case FILE_AUTO_MIGRATE_INTERVAL:
+		s += sprintf(s, "%ld", cs->auto_migrate_interval / HZ);
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1913,6 +1947,13 @@ static struct cftype cft_auto_migration
 	.private = FILE_AUTO_MIGRATE,
 };
 
+static struct cftype cft_auto_migrate_interval = {
+	.name = "auto_migrate_interval",
+	.read = cpuset_common_file_read,
+	.write_u64 = cpuset_write_u64,
+	.private = FILE_AUTO_MIGRATE_INTERVAL,
+};
+
 static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	int err;
@@ -1936,6 +1977,9 @@ static int cpuset_populate(struct cgroup
 	err = add_auto_migration_file(cont, ss, &cft_auto_migration);
 	if (err < 0)
 		return err;
+	err = add_auto_migration_file(cont, ss, &cft_auto_migrate_interval);
+	if (err < 0)
+		return err;
 	/* memory_pressure_enabled is in root cpuset only */
 	if (!cont->parent)
 		err = cgroup_add_file(cont, ss,
@@ -2019,6 +2063,7 @@ static struct cgroup_subsys_state *cpuse
 	if (is_auto_migrate(parent))
 		set_bit(CS_AUTO_MIGRATE, &cs->flags);
 	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+	cs->auto_migrate_interval = parent->auto_migrate_interval;
 	cpumask_clear(cs->cpus_allowed);
 	nodes_clear(cs->mems_allowed);
 	fmeter_init(&cs->fmeter);
Index: linux-2.6.36-mmotm-101103-1217/kernel/sched_fair.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/kernel/sched_fair.c
+++ linux-2.6.36-mmotm-101103-1217/kernel/sched_fair.c
@@ -1454,6 +1454,14 @@ select_task_rq_fair(struct rq *rq, struc
 	int want_sd = 1;
 	int sync = wake_flags & WF_SYNC;
 
+
+	/*
+	 * short circuit balancing if this task was recently
+	 * migrated to this cpu's node.
+	 */
+	if (too_soon_for_internode_migration(p, prev_cpu))
+		return prev_cpu;
+
 	if (sd_flag & SD_BALANCE_WAKE) {
 		if (cpumask_test_cpu(cpu, &p->cpus_allowed))
 			want_affine = 1;
@@ -1782,9 +1790,10 @@ int can_migrate_task(struct task_struct
 	int tsk_cache_hot = 0;
 	/*
 	 * We do not migrate tasks that are:
-	 * 1) running (obviously), or
-	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
-	 * 3) are cache-hot on their current CPU.
+	 * 1) cannot be migrated to this CPU due to cpus_allowed, or
+	 * 2) running (obviously), or
+	 * 3) too soon since last internode migration, or
+	 * 4) are cache-hot on their current CPU.
 	 */
 	if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
@@ -1797,6 +1806,9 @@ int can_migrate_task(struct task_struct
 		return 0;
 	}
 
+	if (too_soon_for_internode_migration(p, this_cpu))
+		return 0;
+
 	/*
 	 * Aggressive migration if:
 	 * 1) task is cache cold, or
--
To unsubscribe from this list: send the line "unsubscribe linux-numa" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]     [Devices]

  Powered by Linux