Shared Policy Infrastructure - add cpuset control for huge page shared policy Add a per cpuset "shared_huge_policy" control file to enable shared hugetlbfs file policy for tasks in the cpuset. Default is disabled, resulting in the old behavior--i.e., we continue to use any task private vma policy, falling back to task or system default policy if none, on address ranges backed by shared hugetlbfs file mappings. The "shared_huge_policy" file depends on CONFIG_NUMA. This patch adapts and renames the cpuset_update_task_spread_flag() function and related mechanisms to update the cpuset-controlled flags of the tasks in the cpuset when "shared_huge_policy" is changed. Why a "per cpuset" control? cpusets are numa-aware task groupings and memory policy is a numa concept. Applications that need/want shared hugetlbfs file policy can be grouped in a cpuset with this feature enabled, while other applications in other cpusets need not see this feature. Alternatively, the behavior may be enabled for the entire system by setting the control file in the top level cpuset. This use of cpusets to control NUMA-related behavior, vs. a separate controller, might be worth a side discussion? The default may be overridden--e.g., to enabled--on the kernel command line using the "shared_huge_policy_default" parameter. When cpusets are configured, this parameter sets the default value of "shared_huge_policy" for the top cpuset, which is then inherited by all subsequently created descendant cpusets. When cpusets are not configured, this parameter sets the "shared_huge_policy_enabled" flag for the init process, which is then inherited by all descendant processes. A subsequent patch "hooks up" the shared file .{set|get}_policy vm_ops to install or lookup a shared policy on a memory mapped hugetlbfs file if the capability has been enabled for the caller's cpuset, or for the system in the case of no cpusets. 
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> include/linux/cpuset.h | 27 +++++++++++++++++ include/linux/sched.h | 20 ++++++++++++ include/linux/shared_policy.h | 4 ++ kernel/cpuset.c | 66 +++++++++++++++++++++++++++++++++++------- mm/mempolicy.c | 26 +++++++++++++++- 5 files changed, 131 insertions(+), 12 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/include/linux/sched.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/sched.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/sched.h @@ -1455,6 +1455,7 @@ struct task_struct { #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; + short shared_huge_policy_enabled; #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; @@ -1870,6 +1871,25 @@ extern void sched_exec(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#ifdef CONFIG_NUMA +static inline void set_shared_huge_policy_enabled(struct task_struct *tsk, + int val) +{ + tsk->shared_huge_policy_enabled = !!val; +} +static inline int shared_huge_policy_enabled(struct task_struct *tsk) +{ + return tsk->shared_huge_policy_enabled; +} + +#else +static inline void set_shared_huge_policy_enabled(struct task_struct *tsk, int val) { } +static inline int shared_huge_policy_enabled(struct task_struct *tsk) +{ + return 0; +} +#endif + #ifdef CONFIG_HOTPLUG_CPU extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); Index: linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/kernel/cpuset.c +++ linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c @@ -132,6 +132,7 @@ typedef enum { CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, + CS_SHARED_HUGE_POLICY, } cpuset_flagbits_t; /* convenient tests for these bits */ @@ -170,6 
+171,11 @@ static inline int is_spread_slab(const s return test_bit(CS_SPREAD_SLAB, &cs->flags); } +static inline int is_shared_huge_policy(const struct cpuset *cs) +{ + return test_bit(CS_SHARED_HUGE_POLICY, &cs->flags); +} + static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), }; @@ -306,11 +312,11 @@ static void guarantee_online_mems(const } /* - * update task's spread flag if cpuset's page/slab spread flag is set + * update task's cpuset-controlled flags to match its cpuset. * * Called with callback_mutex/cgroup_mutex held */ -static void cpuset_update_task_spread_flag(struct cpuset *cs, +static void cpuset_update_task_cpuset_flags(struct cpuset *cs, struct task_struct *tsk) { if (is_spread_page(cs)) @@ -321,6 +327,10 @@ static void cpuset_update_task_spread_fl tsk->flags |= PF_SPREAD_SLAB; else tsk->flags &= ~PF_SPREAD_SLAB; + if (is_shared_huge_policy(cs)) + set_shared_huge_policy_enabled(tsk, 1); + else + set_shared_huge_policy_enabled(tsk, 0); } /* @@ -1180,7 +1190,8 @@ static int update_relax_domain_level(str } /* - * cpuset_change_flag - make a task's spread flags the same as its cpuset's + * cpuset_change_flag - make a task's cpuset controlled flags the same as + * its cpuset's * @tsk: task to be updated * @scan: struct cgroup_scanner containing the cgroup of the task * @@ -1192,12 +1203,12 @@ static int update_relax_domain_level(str static void cpuset_change_flag(struct task_struct *tsk, struct cgroup_scanner *scan) { - cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); + cpuset_update_task_cpuset_flags(cgroup_cs(scan->cg), tsk); } /* - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed + * update_tasks_flags - update the cpuset-controlled flags of tasks in a cpuset. 
+ * @cs: the cpuset in which each task's flags needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * * Called with cgroup_mutex held @@ -1233,7 +1244,7 @@ static int update_flag(cpuset_flagbits_t { struct cpuset *trialcs; int balance_flag_changed; - int spread_flag_changed; + int cpuset_flags_changed; struct ptr_heap heap; int err; @@ -1257,8 +1268,9 @@ static int update_flag(cpuset_flagbits_t balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); - spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) - || (is_spread_page(cs) != is_spread_page(trialcs))); + cpuset_flags_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) + || (is_spread_page(cs) != is_spread_page(trialcs)) + || (is_shared_huge_policy(cs) != is_shared_huge_policy(trialcs))); mutex_lock(&callback_mutex); cs->flags = trialcs->flags; @@ -1267,7 +1279,7 @@ static int update_flag(cpuset_flagbits_t if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); - if (spread_flag_changed) + if (cpuset_flags_changed) update_tasks_flags(cs, &heap); heap_free(&heap); out: @@ -1428,7 +1440,7 @@ static void cpuset_attach_task(struct ta WARN_ON_ONCE(err); cpuset_change_task_nodemask(tsk, to); - cpuset_update_task_spread_flag(cs, tsk); + cpuset_update_task_cpuset_flags(cs, tsk); } @@ -1494,6 +1506,7 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, + FILE_SHARED_HUGE_POLICY, } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) @@ -1533,6 +1546,9 @@ static int cpuset_write_u64(struct cgrou case FILE_SPREAD_SLAB: retval = update_flag(CS_SPREAD_SLAB, cs, val); break; + case FILE_SHARED_HUGE_POLICY: + retval = update_flag(CS_SHARED_HUGE_POLICY, cs, val); + break; default: retval = -EINVAL; break; @@ -1697,6 +1713,8 @@ static u64 cpuset_read_u64(struct cgroup return is_spread_page(cs); case FILE_SPREAD_SLAB: return 
is_spread_slab(cs); + case FILE_SHARED_HUGE_POLICY: + return is_shared_huge_policy(cs); default: BUG(); } @@ -1814,6 +1832,13 @@ static struct cftype cft_memory_pressure .private = FILE_MEMORY_PRESSURE_ENABLED, }; +static struct cftype cft_shared_huge_policy = { + .name = "shared_huge_policy", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SHARED_HUGE_POLICY, +}; + static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) { int err; @@ -1821,6 +1846,12 @@ static int cpuset_populate(struct cgroup err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); if (err) return err; + /* + * only if shared file policy configured + */ + err = add_shared_xxx_policy_file(cont, ss, &cft_shared_huge_policy); + if (err < 0) + return err; /* memory_pressure_enabled is in root cpuset only */ if (!cont->parent) err = cgroup_add_file(cont, ss, @@ -1895,6 +1926,8 @@ static struct cgroup_subsys_state *cpuse set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + if (is_shared_huge_policy(parent)) + set_bit(CS_SHARED_HUGE_POLICY, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); @@ -1968,6 +2001,17 @@ int __init cpuset_init(void) } /** + * cpuset_init_shared_huge_policy - set default value for shared_huge_policy + * enablement. 
+ */ + +void __init cpuset_init_shared_huge_policy(int dflt) +{ + if (dflt) + set_bit(CS_SHARED_HUGE_POLICY, &top_cpuset.flags); +} + +/** * cpuset_do_move_task - move a given task to another cpuset * @tsk: pointer to task_struct the task to move * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner Index: linux-2.6.36-mmotm-101103-1217/include/linux/cpuset.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/cpuset.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/cpuset.h @@ -128,6 +128,28 @@ static inline void set_mems_allowed(node task_unlock(current); } +#ifdef CONFIG_NUMA +static inline int add_shared_xxx_policy_file(struct cgroup *cg, + struct cgroup_subsys *ss, + struct cftype *cft) +{ + return cgroup_add_file(cg, ss, cft); +} + +#else +/* + * don't expose "shared_huge_policy" file if !NUMA + */ +static inline int add_shared_xxx_policy_file(struct cgroup *cg, + struct cgroup_subsys *ss, + struct cftype *cft) +{ + return 0; +} +#endif + +extern void __init cpuset_init_shared_huge_policy(int dflt); + #else /* !CONFIG_CPUSETS */ static inline int cpuset_init(void) { return 0; } @@ -242,6 +264,11 @@ static inline void put_mems_allowed(void { } +static inline void cpuset_init_shared_huge_policy(int dflt) +{ + set_shared_huge_policy_enabled(current, dflt); +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c +++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c @@ -2079,7 +2079,25 @@ int __mpol_equal(struct mempolicy *a, st /* * Shared memory backing store policy support. 
- * + */ + +/* + * default state of per cpuset shared_huge_policy enablement + */ +int shared_huge_policy_default; /* default: disabled */ + +static int __init setup_shared_huge_policy_default(char *str) +{ + int ret, val; + ret = get_option(&str, &val); + if (!ret) + return 0; + shared_huge_policy_default = !!val; + return 1; +} +__setup("shared_huge_policy_default=", setup_shared_huge_policy_default); + +/* * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock spinlock, which should be held @@ -2423,6 +2441,12 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); + + /* + * initialize per cpuset shared huge policy enablement + * from default. + */ + cpuset_init_shared_huge_policy(shared_huge_policy_default); } /* Reset policy of current process to default */ Index: linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/shared_policy.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h @@ -30,6 +30,8 @@ struct shared_policy { int nr_sp_nodes; /* for numa_maps */ }; +extern int shared_huge_policy_default; + extern struct shared_policy *mpol_shared_policy_new( struct address_space *mapping, struct mempolicy *mpol); @@ -43,6 +45,8 @@ extern struct mempolicy *mpol_shared_pol struct shared_policy {}; +#define shared_huge_policy_default 0 + static inline int mpol_set_shared_policy(struct shared_policy *info, pgoff_t pgoff, unsigned long sz, struct mempolicy *new) -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html