AutoPage Migration - add max mapcount migration threshold This patch adds an additional per-cpuset migration control that allows one to vary the page mapcount threshold above which pages will not be migrated by MPOL_MF_MOVE. The default value is 1, which yields the same behavior as before this patch. This is useful because anon pages can be shared between ancestors and descendants until sharing is broken by a write. We want to be able to unmap these pages for lazy automigration so that the next touch will migrate the page local to the task that touches it. However, we still want a threshold above which we don't attempt to migrate the page, because unmap is very expensive when a page has a large mapcount. We add the threshold to the task structure so that we can fetch it using a static inline function that is redefined to return the default value of 1 when AUTO_MIGRATION is not configured. The max mapcount is accessed for each page proposed for migration, and we don't want to call a cpuset function and take an rcu_lock/unlock round trip for each page. Note: This threshold could be configured under MIGRATE_ON_FAULT instead of AUTO_MIGRATION, or independently of either, as it is useful for mbind() with MPOL_MF_MOVE as well.
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> include/linux/auto-migrate.h | 4 ++++ include/linux/sched.h | 1 + kernel/cpuset.c | 42 +++++++++++++++++++++++++++++++++++++++++- mm/mempolicy.c | 8 +++++--- 4 files changed, 51 insertions(+), 4 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/include/linux/auto-migrate.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/auto-migrate.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/auto-migrate.h @@ -77,6 +77,10 @@ static inline void check_migrate_pending } #endif /* SCHED_AUTO_MIGRATION */ +static inline unsigned int migrate_max_mapcount(struct task_struct *task) +{ + return task->migrate_max_mapcount; +} #else /* !CONFIG_AUTO_MIGRATION */ static inline int is_auto_migration(int flags) Index: linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/kernel/cpuset.c +++ linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c @@ -101,6 +101,7 @@ struct cpuset { struct fmeter fmeter; /* memory_pressure filter */ unsigned long auto_migrate_interval; + unsigned int migrate_max_mapcount; /* partition number for rebuild_sched_domains() */ int pn; @@ -200,6 +201,7 @@ static inline int is_auto_migrate(const static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), .auto_migrate_interval = AUTO_MIGRATE_INTERVAL_DFLT, + .migrate_max_mapcount = 1, }; /* @@ -365,8 +367,11 @@ static void cpuset_update_task_cpuset_fl if (is_auto_migrate(cs)) { set_auto_migrate_enabled(tsk, 1); tsk->auto_migrate_interval = cs->auto_migrate_interval; - } else + tsk->migrate_max_mapcount = cs->migrate_max_mapcount; + } else { set_auto_migrate_enabled(tsk, 0); + tsk->migrate_max_mapcount = 1; + } } @@ -1553,6 +1558,23 @@ static int update_auto_migrate_interval( return 0; } +/* + * Call with manage_mutex held. 
+ */ +static int update_migrate_max_mapcount(struct cpuset *cs, u64 val) +{ + unsigned int n = val; + + if (n == cs->migrate_max_mapcount) + return 0; + + if (n < 1) + cs->migrate_max_mapcount = 1; + else + cs->migrate_max_mapcount = n; + return 0; +} + /* The various types of files and directories in a cpuset file system */ typedef enum { @@ -1573,6 +1595,7 @@ typedef enum { FILE_MIGRATE_ON_FAULT, FILE_AUTO_MIGRATE, FILE_AUTO_MIGRATE_INTERVAL, + FILE_MIGRATE_MAX_MAPCOUNT, } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) @@ -1627,6 +1650,9 @@ static int cpuset_write_u64(struct cgrou case FILE_AUTO_MIGRATE_INTERVAL: retval = update_auto_migrate_interval(cs, val); break; + case FILE_MIGRATE_MAX_MAPCOUNT: + retval = update_migrate_max_mapcount(cs, val); + break; default: retval = -EINVAL; break; @@ -1759,6 +1785,9 @@ static ssize_t cpuset_common_file_read(s case FILE_AUTO_MIGRATE_INTERVAL: s += sprintf(s, "%ld", cs->auto_migrate_interval / HZ); break; + case FILE_MIGRATE_MAX_MAPCOUNT: + s += sprintf(s, "%d", cs->migrate_max_mapcount); + break; default: retval = -EINVAL; goto out; @@ -1954,6 +1983,13 @@ static struct cftype cft_auto_migrate_in .private = FILE_AUTO_MIGRATE_INTERVAL, }; +static struct cftype cft_migrate_max_mapcount = { + .name = "migrate_max_mapcount", + .read = cpuset_common_file_read, + .write_u64 = cpuset_write_u64, + .private = FILE_MIGRATE_MAX_MAPCOUNT, +}; + static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) { int err; @@ -1980,6 +2016,9 @@ static int cpuset_populate(struct cgroup err = add_auto_migration_file(cont, ss, &cft_auto_migrate_interval); if (err < 0) return err; + err = add_auto_migration_file(cont, ss, &cft_migrate_max_mapcount); + if (err < 0) + return err; /* memory_pressure_enabled is in root cpuset only */ if (!cont->parent) err = cgroup_add_file(cont, ss, @@ -2064,6 +2103,7 @@ static struct cgroup_subsys_state *cpuse set_bit(CS_AUTO_MIGRATE, &cs->flags); 
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cs->auto_migrate_interval = parent->auto_migrate_interval; + cs->migrate_max_mapcount = parent->migrate_max_mapcount; cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); fmeter_init(&cs->fmeter); Index: linux-2.6.36-mmotm-101103-1217/include/linux/sched.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/sched.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/sched.h @@ -1464,6 +1464,7 @@ struct task_struct { #endif unsigned long next_migrate; /* internode migration hysteresis */ unsigned long auto_migrate_interval; /* from cpuset */ + unsigned int migrate_max_mapcount; /* for !MPOL_MF_MOVE_ALL */ #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c +++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c @@ -1051,11 +1051,13 @@ static void migrate_page_add(struct page unsigned long flags) { /* - * Avoid migrating a file backed page in a private mapping or - * a page that is shared with others. + * Avoid migrating a file backed page in a private mapping, or + * a page that is shared with > 'migrate_max_mapcount' others + * unless MPOL_MF_MOVE_ALL specified. */ if ((!(flags & MPOL_MF_MOVE_ANON_ONLY) || PageAnon(page)) && - ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)) { + ((flags & MPOL_MF_MOVE_ALL) || + page_mapcount(page) <= migrate_max_mapcount(current))) { if (!isolate_lru_page(page)) { list_add_tail(&page->lru, pagelist); inc_zone_page_state(page, NR_ISOLATED_ANON + -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html