AutoPage Migration - add hysteresis to internode migration This patch adds hysteresis to internode migration to prevent page migration thrashing when automatic, scheduler-driven page migration is enabled. Add static inline function "too_soon_for_internode_migration" [a stub that returns 0 if !CONFIG_AUTO_MIGRATION] to check for attempts to move a task to a new node sooner than auto_migrate_interval jiffies after the previous migration. Note: it fetches the interval from the task struct to avoid a callout to a cpuset function, with an rcu_lock/unlock round trip, on each migration check. The task's auto_migrate_interval is updated from cpuset_update_task_memory_state(). Modify the wakeup cpu selection [select_task_rq_fair()] to leave the task on its current cpu if it is too soon to move it to a different node. Modify can_migrate_task() to "just say no!" if the load balancer proposes an internode migration too soon after the previous internode migration. Fix the comment block on can_migrate_task() to reflect the order of tests in the current code. Add a control file--auto_migrate_interval--to cpusets to query/set the per-cpuset interval, in seconds. Provide some fairly arbitrary min, max and default values. 
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> include/linux/auto-migrate.h | 30 ++++++++++++++++++++++++++ include/linux/sched.h | 2 + kernel/cpuset.c | 49 +++++++++++++++++++++++++++++++++++++++++-- kernel/sched_fair.c | 18 +++++++++++++-- 4 files changed, 94 insertions(+), 5 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/include/linux/sched.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/sched.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/sched.h @@ -1462,6 +1462,8 @@ struct task_struct { #ifdef CONFIG_AUTO_MIGRATION short migrate_pending:1; /* internode mem migration pending */ #endif + unsigned long next_migrate; /* internode migration hysteresis */ + unsigned long auto_migrate_interval; /* from cpuset */ #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; Index: linux-2.6.36-mmotm-101103-1217/include/linux/auto-migrate.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/auto-migrate.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/auto-migrate.h @@ -12,6 +12,10 @@ extern int is_auto_migration(int flags); extern void auto_migrate_task_memory(void); +#define AUTO_MIGRATE_INTERVAL_DFLT (30*HZ) +#define AUTO_MIGRATE_INTERVAL_MIN (5*HZ) +#define AUTO_MIGRATE_INTERVAL_MAX (300*HZ) + #ifdef SCHED_AUTO_MIGRATION /* these need sched.h definition. They're only where sched.h is * already included. Note we depend on sched.h being included @@ -27,6 +31,24 @@ static inline void check_internode_migra } /* + * To avoids page migration thrashing when auto memory migration is enabled, + * check user task for too recent internode migration. 
+ */ +static inline int too_soon_for_internode_migration(struct task_struct *task, + int this_cpu) +{ + if (auto_migrate_enabled(task) && task->mm && + cpu_to_node(task_cpu(task)) != cpu_to_node(this_cpu)) { + + if (task->migrate_pending || + time_before(jiffies, task->next_migrate)) + return 1; + } + + return 0; +} + +/* * called only by arch dependent code for architectures that * support "migration work" */ @@ -40,6 +62,8 @@ static inline void check_migrate_pending local_irq_enable(); } + current->next_migrate = jiffies + + current->auto_migrate_interval; /* * can't be called in atomic context. */ @@ -71,6 +95,12 @@ static inline void check_migrate_pending clear_thread_flag(TIF_NOTIFY_RESUME); } +static inline int too_soon_for_internode_migration(struct task_struct *tsk, + int cpu) +{ + return 0; +} + #endif /* CONFIG_AUTO_MIGRATION */ #endif Index: linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/kernel/cpuset.c +++ linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c @@ -53,6 +53,7 @@ #include <linux/time.h> #include <linux/backing-dev.h> #include <linux/sort.h> +#include <linux/auto-migrate.h> #include <asm/uaccess.h> #include <asm/atomic.h> @@ -99,6 +100,8 @@ struct cpuset { struct fmeter fmeter; /* memory_pressure filter */ + unsigned long auto_migrate_interval; + /* partition number for rebuild_sched_domains() */ int pn; @@ -196,6 +199,7 @@ static inline int is_auto_migrate(const static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), + .auto_migrate_interval = AUTO_MIGRATE_INTERVAL_DFLT, }; /* @@ -358,9 +362,10 @@ static void cpuset_update_task_cpuset_fl set_migrate_on_fault_enabled(tsk, 1); else set_migrate_on_fault_enabled(tsk, 0); - if (is_auto_migrate(cs)) + if (is_auto_migrate(cs)) { set_auto_migrate_enabled(tsk, 1); - else + tsk->auto_migrate_interval = cs->auto_migrate_interval; + } else 
set_auto_migrate_enabled(tsk, 0); } @@ -1526,6 +1531,28 @@ alloc_fail: NODEMASK_FREE(to); } +/* + * Call with manage_mutex held. + */ +static int update_auto_migrate_interval(struct cpuset *cs, u64 val) +{ + unsigned long n = val * HZ; /* scale seconds to ticks */ + + if (n == cs->auto_migrate_interval) + return 0; + + /* + * silently clip to min/max + */ + if (n < AUTO_MIGRATE_INTERVAL_MIN) + cs->auto_migrate_interval = AUTO_MIGRATE_INTERVAL_MIN; + else if (n > AUTO_MIGRATE_INTERVAL_MAX) + cs->auto_migrate_interval = AUTO_MIGRATE_INTERVAL_MAX; + else + cs->auto_migrate_interval = n; + return 0; +} + /* The various types of files and directories in a cpuset file system */ typedef enum { @@ -1545,6 +1572,7 @@ typedef enum { FILE_SHARED_FILE_POLICY, FILE_MIGRATE_ON_FAULT, FILE_AUTO_MIGRATE, + FILE_AUTO_MIGRATE_INTERVAL, } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) @@ -1596,6 +1624,9 @@ static int cpuset_write_u64(struct cgrou case FILE_AUTO_MIGRATE: retval = update_flag(CS_AUTO_MIGRATE, cs, val); break; + case FILE_AUTO_MIGRATE_INTERVAL: + retval = update_auto_migrate_interval(cs, val); + break; default: retval = -EINVAL; break; @@ -1725,6 +1756,9 @@ static ssize_t cpuset_common_file_read(s case FILE_MEMLIST: s += cpuset_sprintf_memlist(s, cs); break; + case FILE_AUTO_MIGRATE_INTERVAL: + s += sprintf(s, "%ld", cs->auto_migrate_interval / HZ); + break; default: retval = -EINVAL; goto out; @@ -1913,6 +1947,13 @@ static struct cftype cft_auto_migration .private = FILE_AUTO_MIGRATE, }; +static struct cftype cft_auto_migrate_interval = { + .name = "auto_migrate_interval", + .read = cpuset_common_file_read, + .write_u64 = cpuset_write_u64, + .private = FILE_AUTO_MIGRATE_INTERVAL, +}; + static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) { int err; @@ -1936,6 +1977,9 @@ static int cpuset_populate(struct cgroup err = add_auto_migration_file(cont, ss, &cft_auto_migration); if (err < 0) return err; + 
err = add_auto_migration_file(cont, ss, &cft_auto_migrate_interval); + if (err < 0) + return err; /* memory_pressure_enabled is in root cpuset only */ if (!cont->parent) err = cgroup_add_file(cont, ss, @@ -2019,6 +2063,7 @@ static struct cgroup_subsys_state *cpuse if (is_auto_migrate(parent)) set_bit(CS_AUTO_MIGRATE, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); + cs->auto_migrate_interval = parent->auto_migrate_interval; cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); Index: linux-2.6.36-mmotm-101103-1217/kernel/sched_fair.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/kernel/sched_fair.c +++ linux-2.6.36-mmotm-101103-1217/kernel/sched_fair.c @@ -1454,6 +1454,14 @@ select_task_rq_fair(struct rq *rq, struc int want_sd = 1; int sync = wake_flags & WF_SYNC; + + /* + * short circuit balancing if this task was recently + * migrated to this cpu's node. + */ + if (too_soon_for_internode_migration(p, prev_cpu)) + return prev_cpu; + if (sd_flag & SD_BALANCE_WAKE) { if (cpumask_test_cpu(cpu, &p->cpus_allowed)) want_affine = 1; @@ -1782,9 +1790,10 @@ int can_migrate_task(struct task_struct int tsk_cache_hot = 0; /* * We do not migrate tasks that are: - * 1) running (obviously), or - * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. + * 1) cannot be migrated to this CPU due to cpus_allowed, or + * 2) running (obviously), or + * 3) too soon since last internode migration + * 4) are cache-hot on their current CPU. 
*/ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); @@ -1797,6 +1806,9 @@ int can_migrate_task(struct task_struct return 0; } + if (too_soon_for_internode_migration(p, this_cpu)) + return 0; + /* * Aggressive migration if: * 1) task is cache cold, or -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html