Shared Policy Infrastructure - add cpuset control for huge page shared policy Add a per cpuset "shared_huge_policy" control file to enable shared hugetlbfs file policy for tasks in the cpuset. Default is disabled, resulting in the old behavior--i.e., we continue to use any task private vma policy, falling back to task or system default policy if none, on address ranges backed by shared hugetlbfs file mappings. The "shared_huge_policy" file depends on CONFIG_NUMA. This patch adapts and renames the cpuset_update_task_spread_flag() function and related mechanisms to update the cpuset-controlled flags of the tasks in the cpuset when "shared_huge_policy" is changed. Why a "per cpuset" control? cpusets are numa-aware task groupings and memory policy is a numa concept. Applications that need/want shared hugetlbfs file policy can be grouped in a cpuset with this feature enabled, while other applications in other cpusets need not see this feature. Alternatively, the behavior may be enabled for the entire system by setting the control file in the top level cpuset. This use of cpusets to control NUMA-related behavior, vs. a separate controller, might be worth a side discussion? The default may be overridden--e.g., to enabled--on the kernel command line using the "shared_huge_policy_default" parameter. When cpusets are configured, this parameter sets the default value of "shared_huge_policy" for the top cpuset, which is then inherited by all subsequently created descendant cpusets. When cpusets are not configured, this parameter sets the "shared_huge_policy_enabled" flag for the init process, which is then inherited by all descendant processes. A subsequent patch "hooks up" the shared file .{set|get}_policy vm_ops to install or lookup a shared policy on a memory mapped hugetlbfs file if the capability has been enabled for the caller's cpuset, or for the system in the case of no cpusets. 
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> include/linux/cpuset.h | 27 +++++++++++++++++ include/linux/sched.h | 20 ++++++++++++ include/linux/shared_policy.h | 4 ++ kernel/cpuset.c | 66 +++++++++++++++++++++++++++++++++++------- mm/mempolicy.c | 26 +++++++++++++++- 5 files changed, 131 insertions(+), 12 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/include/linux/sched.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/sched.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/sched.h @@ -1455,6 +1455,7 @@ struct task_struct { #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; + short shared_huge_policy_enabled; #endif atomic_t fs_excl; /* holding fs exclusive resources */ struct rcu_head rcu; @@ -1870,6 +1871,25 @@ extern void sched_exec(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#ifdef CONFIG_NUMA +static inline void set_shared_huge_policy_enabled(struct task_struct *tsk, + int val) +{ + tsk->shared_huge_policy_enabled = !!val; +} +static inline int shared_huge_policy_enabled(struct task_struct *tsk) +{ + return tsk->shared_huge_policy_enabled; +} + +#else +static inline void set_shared_huge_policy_enabled(struct task_struct *tsk, int val) { } +static inline int shared_huge_policy_enabled(struct task_struct *tsk) +{ + return 0; +} +#endif + #ifdef CONFIG_HOTPLUG_CPU extern void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p); extern void idle_task_exit(void); Index: linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/kernel/cpuset.c +++ linux-2.6.36-mmotm-101103-1217/kernel/cpuset.c @@ -132,6 +132,7 @@ typedef enum { CS_SCHED_LOAD_BALANCE, CS_SPREAD_PAGE, CS_SPREAD_SLAB, + CS_SHARED_HUGE_POLICY, } cpuset_flagbits_t; /* convenient tests for these bits */ @@ -170,6 
+171,11 @@ static inline int is_spread_slab(const s return test_bit(CS_SPREAD_SLAB, &cs->flags); } +static inline int is_shared_huge_policy(const struct cpuset *cs) +{ + return test_bit(CS_SHARED_HUGE_POLICY, &cs->flags); +} + static struct cpuset top_cpuset = { .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), }; @@ -306,11 +312,11 @@ static void guarantee_online_mems(const } /* - * update task's spread flag if cpuset's page/slab spread flag is set + * update task's cpuset-controlled flags to match its cpuset. * * Called with callback_mutex/cgroup_mutex held */ -static void cpuset_update_task_spread_flag(struct cpuset *cs, +static void cpuset_update_task_cpuset_flags(struct cpuset *cs, struct task_struct *tsk) { if (is_spread_page(cs)) @@ -321,6 +327,10 @@ static void cpuset_update_task_spread_fl tsk->flags |= PF_SPREAD_SLAB; else tsk->flags &= ~PF_SPREAD_SLAB; + if (is_shared_huge_policy(cs)) + set_shared_huge_policy_enabled(tsk, 1); + else + set_shared_huge_policy_enabled(tsk, 0); } /* @@ -1180,7 +1190,8 @@ static int update_relax_domain_level(str } /* - * cpuset_change_flag - make a task's spread flags the same as its cpuset's + * cpuset_change_flag - make a task's cpuset controlled flags the same as + * its cpuset's * @tsk: task to be updated * @scan: struct cgroup_scanner containing the cgroup of the task * @@ -1192,12 +1203,12 @@ static int update_relax_domain_level(str static void cpuset_change_flag(struct task_struct *tsk, struct cgroup_scanner *scan) { - cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); + cpuset_update_task_cpuset_flags(cgroup_cs(scan->cg), tsk); } /* - * update_tasks_flags - update the spread flags of tasks in the cpuset. - * @cs: the cpuset in which each task's spread flags needs to be changed + * update_tasks_flags - update the cpuset-controlled flags of tasks in a cpuset. 
+ * @cs: the cpuset in which each task's flags needs to be changed * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * * Called with cgroup_mutex held @@ -1233,7 +1244,7 @@ static int update_flag(cpuset_flagbits_t { struct cpuset *trialcs; int balance_flag_changed; - int spread_flag_changed; + int cpuset_flags_changed; struct ptr_heap heap; int err; @@ -1257,8 +1268,9 @@ static int update_flag(cpuset_flagbits_t balance_flag_changed = (is_sched_load_balance(cs) != is_sched_load_balance(trialcs)); - spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) - || (is_spread_page(cs) != is_spread_page(trialcs))); + cpuset_flags_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) + || (is_spread_page(cs) != is_spread_page(trialcs)) + || (is_shared_huge_policy(cs) != is_shared_huge_policy(trialcs))); mutex_lock(&callback_mutex); cs->flags = trialcs->flags; @@ -1267,7 +1279,7 @@ static int update_flag(cpuset_flagbits_t if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) async_rebuild_sched_domains(); - if (spread_flag_changed) + if (cpuset_flags_changed) update_tasks_flags(cs, &heap); heap_free(&heap); out: @@ -1428,7 +1440,7 @@ static void cpuset_attach_task(struct ta WARN_ON_ONCE(err); cpuset_change_task_nodemask(tsk, to); - cpuset_update_task_spread_flag(cs, tsk); + cpuset_update_task_cpuset_flags(cs, tsk); } @@ -1494,6 +1506,7 @@ typedef enum { FILE_MEMORY_PRESSURE, FILE_SPREAD_PAGE, FILE_SPREAD_SLAB, + FILE_SHARED_HUGE_POLICY, } cpuset_filetype_t; static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) @@ -1533,6 +1546,9 @@ static int cpuset_write_u64(struct cgrou case FILE_SPREAD_SLAB: retval = update_flag(CS_SPREAD_SLAB, cs, val); break; + case FILE_SHARED_HUGE_POLICY: + retval = update_flag(CS_SHARED_HUGE_POLICY, cs, val); + break; default: retval = -EINVAL; break; @@ -1697,6 +1713,8 @@ static u64 cpuset_read_u64(struct cgroup return is_spread_page(cs); case FILE_SPREAD_SLAB: return 
is_spread_slab(cs); + case FILE_SHARED_HUGE_POLICY: + return is_shared_huge_policy(cs); default: BUG(); } @@ -1814,6 +1832,13 @@ static struct cftype cft_memory_pressure .private = FILE_MEMORY_PRESSURE_ENABLED, }; +static struct cftype cft_shared_huge_policy = { + .name = "shared_huge_policy", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SHARED_HUGE_POLICY, +}; + static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) { int err; @@ -1821,6 +1846,12 @@ static int cpuset_populate(struct cgroup err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); if (err) return err; + /* + * only if shared file policy configured + */ + err = add_shared_xxx_policy_file(cont, ss, &cft_shared_huge_policy); + if (err < 0) + return err; /* memory_pressure_enabled is in root cpuset only */ if (!cont->parent) err = cgroup_add_file(cont, ss, @@ -1895,6 +1926,8 @@ static struct cgroup_subsys_state *cpuse set_bit(CS_SPREAD_PAGE, &cs->flags); if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); + if (is_shared_huge_policy(parent)) + set_bit(CS_SHARED_HUGE_POLICY, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); cpumask_clear(cs->cpus_allowed); nodes_clear(cs->mems_allowed); @@ -1968,6 +2001,17 @@ int __init cpuset_init(void) } /** + * cpuset_init_shared_huge_policy - set default value for shared_huge_policy + * enablement. 
+ */ + +void __init cpuset_init_shared_huge_policy(int dflt) +{ + if (dflt) + set_bit(CS_SHARED_HUGE_POLICY, &top_cpuset.flags); +} + +/** * cpuset_do_move_task - move a given task to another cpuset * @tsk: pointer to task_struct the task to move * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner Index: linux-2.6.36-mmotm-101103-1217/include/linux/cpuset.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/cpuset.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/cpuset.h @@ -128,6 +128,28 @@ static inline void set_mems_allowed(node task_unlock(current); } +#ifdef CONFIG_NUMA +static inline int add_shared_xxx_policy_file(struct cgroup *cg, + struct cgroup_subsys *ss, + struct cftype *cft) +{ + return cgroup_add_file(cg, ss, cft); +} + +#else +/* + * don't expose "shared_huge_policy" file if !NUMA + */ +static inline int add_shared_xxx_policy_file(struct cgroup *cg, + struct cgroup_subsys *ss, + struct cftype *cft) +{ + return 0; +} +#endif + +extern void __init cpuset_init_shared_huge_policy(int dflt); + #else /* !CONFIG_CPUSETS */ static inline int cpuset_init(void) { return 0; } @@ -242,6 +264,11 @@ static inline void put_mems_allowed(void { } +static inline void cpuset_init_shared_huge_policy(int dflt) +{ + set_shared_huge_policy_enabled(current, dflt); +} + #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */ Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c +++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c @@ -2079,7 +2079,25 @@ int __mpol_equal(struct mempolicy *a, st /* * Shared memory backing store policy support. 
- * + */ + +/* + * default state of per cpuset shared_huge_policy enablement + */ +int shared_huge_policy_default; /* default: disabled */ + +static int __init setup_shared_huge_policy_default(char *str) +{ + int ret, val; + ret = get_option(&str, &val); + if (!ret) + return 0; + shared_huge_policy_default = !!val; + return 1; +} +__setup("shared_huge_policy_default=", setup_shared_huge_policy_default); + +/* * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock spinlock, which should be held @@ -2423,6 +2441,12 @@ void __init numa_policy_init(void) if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); + + /* + * initialize per cpuset shared huge policy enablement + * from default. + */ + cpuset_init_shared_huge_policy(shared_huge_policy_default); } /* Reset policy of current process to default */ Index: linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/shared_policy.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h @@ -30,6 +30,8 @@ struct shared_policy { int nr_sp_nodes; /* for numa_maps */ }; +extern int shared_huge_policy_default; + extern struct shared_policy *mpol_shared_policy_new( struct address_space *mapping, struct mempolicy *mpol); @@ -43,6 +45,8 @@ extern struct mempolicy *mpol_shared_pol struct shared_policy {}; +#define shared_huge_policy_default 0 + static inline int mpol_set_shared_policy(struct shared_policy *info, pgoff_t pgoff, unsigned long sz, struct mempolicy *new) -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html