[PATCH/RFC 1/14] Shared Policy: Miscellaneous Cleanup

Lee Schermerhorn <lee.schermerhorn@xxxxxx> · Thu, 11 Nov 2010 14:11:56 -0500

Shared Policy - Miscellaneous shared policy cleanup

Some miscellaneous cleanup to use "sp" for shared policy in routines
that take one as an arg.  Prior use of "info" seemed misleading, as
it also refers to the shm_inode_info.  And use of "p" seemed just
plain inconsistent.

Additional cleanup/reorg of the numa_memory_policy.txt doc.

This patch is in preparation for additional shared policy rework.
I wanted to break the minor "cleanup" changes out into a separate
patch.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx>

 Documentation/vm/numa_memory_policy.txt |  244 +++++++++++++++++---------------
 include/linux/mempolicy.h               |   10 -
 mm/mempolicy.c                          |   24 +--
 3 files changed, 156 insertions(+), 122 deletions(-)

Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
===================================================================

--- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c
+++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
@@ -2185,7 +2185,7 @@ put_mpol:
 	}
 }
 
-int mpol_set_shared_policy(struct shared_policy *info,
+int mpol_set_shared_policy(struct shared_policy *sp,
 			struct vm_area_struct *vma, struct mempolicy *npol)
 {
 	int err;
@@ -2203,30 +2203,36 @@ int mpol_set_shared_policy(struct shared
 		if (!new)
 			return -ENOMEM;
 	}
-	err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+	err = shared_policy_replace(sp, vma->vm_pgoff, vma->vm_pgoff+sz, new);
 	if (err && new)
 		kmem_cache_free(sn_cache, new);
 	return err;
 }
 
-/* Free a backing policy store on inode delete. */
-void mpol_free_shared_policy(struct shared_policy *p)
+/**
+ * mpol_free_shared_policy() - Free a backing policy store on inode delete.
+ * @sp: shared policy structure to free
+ *
+ * Erases and frees every node in the shared policy red-black tree, if any,
+ * dropping each node's policy reference.  The struct itself is not freed.
+ */
+void mpol_free_shared_policy(struct shared_policy *sp)
 {
 	struct sp_node *n;
 	struct rb_node *next;
 
-	if (!p->root.rb_node)
+	if (!sp->root.rb_node)
 		return;
-	spin_lock(&p->lock);
-	next = rb_first(&p->root);
+	spin_lock(&sp->lock);
+	next = rb_first(&sp->root);
 	while (next) {
 		n = rb_entry(next, struct sp_node, nd);
 		next = rb_next(&n->nd);
-		rb_erase(&n->nd, &p->root);
+		rb_erase(&n->nd, &sp->root);
 		mpol_put(n->policy);
 		kmem_cache_free(sn_cache, n);
 	}
-	spin_unlock(&p->lock);
+	spin_unlock(&sp->lock);
 }
 
 /* assumes fs == KERNEL_DS */
Index: linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/mempolicy.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h
@@ -81,7 +81,7 @@ struct mm_struct;
  * the process policy is used. Interrupts ignore the memory policy
  * of the current process.
  *
- * Locking policy for interlave:
+ * Locking policy for interleave:
  * In process context there is no locking because only the process accesses
  * its own state. All vma manipulation is somewhat protected by a down_read on
  * mmap_sem.
@@ -192,10 +192,10 @@ struct shared_policy {
 };
 
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
-int mpol_set_shared_policy(struct shared_policy *info,
+int mpol_set_shared_policy(struct shared_policy *sp,
 				struct vm_area_struct *vma,
 				struct mempolicy *new);
-void mpol_free_shared_policy(struct shared_policy *p);
+void mpol_free_shared_policy(struct shared_policy *sp);
 struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 					    unsigned long idx);
 
@@ -284,7 +284,7 @@ static inline struct mempolicy *mpol_dup
 
 struct shared_policy {};
 
-static inline int mpol_set_shared_policy(struct shared_policy *info,
+static inline int mpol_set_shared_policy(struct shared_policy *sp,
 					struct vm_area_struct *vma,
 					struct mempolicy *new)
 {
@@ -296,7 +296,7 @@ static inline void mpol_shared_policy_in
 {
 }
 
-static inline void mpol_free_shared_policy(struct shared_policy *p)
+static inline void mpol_free_shared_policy(struct shared_policy *sp)
 {
 }
 
Index: linux-2.6.36-mmotm-101103-1217/Documentation/vm/numa_memory_policy.txt
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/Documentation/vm/numa_memory_policy.txt
+++ linux-2.6.36-mmotm-101103-1217/Documentation/vm/numa_memory_policy.txt
@@ -12,9 +12,10 @@ Memory policies should not be confused w
 (Documentation/cgroups/cpusets.txt)
 which is an administrative mechanism for restricting the nodes from which
 memory may be allocated by a set of processes. Memory policies are a
-programming interface that a NUMA-aware application can take advantage of.  When
-both cpusets and policies are applied to a task, the restrictions of the cpuset
-takes priority.  See "MEMORY POLICIES AND CPUSETS" below for more details.
+programming interface that a NUMA-aware application can take advantage of.
+When both cpusets and policies are applied to a task, the restrictions of the
+cpuset take priority.  See "MEMORY POLICIES AND CPUSETS" below for more
+details.
 
 MEMORY POLICY CONCEPTS
 
@@ -56,7 +57,10 @@ most general to most specific:
 	A task policy applies only to pages allocated after the policy is
 	installed.  Any pages already faulted in by the task when the task
 	changes its task policy remain where they were allocated based on
-	the policy at the time they were allocated.
+	the policy at the time they were allocated.  The Memory Policy API
+	defines a flag to request that new pages be allocated to obey a newly
+	installed memory policy, and that the contents and state of the
+	original pages be migrated to the new pages.
 
     VMA Policy:  A "VMA" or "Virtual Memory Area" refers to a range of a task's
     virtual address space.  A task may define a specific policy for a range
@@ -109,7 +113,7 @@ most general to most specific:
     object share the policy, and all pages allocated for the shared object,
     by any task, will obey the shared policy.
 
-	As of 2.6.22, only shared memory segments, created by shmget() or
+	As of 2.6.28, only shared memory segments, created by shmget() or
 	mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy.  When shared
 	policy support was added to Linux, the associated data structures were
 	added to hugetlbfs shmem segments.  At the time, hugetlbfs did not
@@ -128,25 +132,35 @@ most general to most specific:
 	The shared policy infrastructure supports different policies on subset
 	ranges of the shared object.  However, Linux still splits the VMA of
 	the task that installs the policy for each range of distinct policy.
-	Thus, different tasks that attach to a shared memory segment can have
+	Thus, different tasks that attach to a shared memory object can have
 	different VMA configurations mapping that one shared object.  This
 	can be seen by examining the /proc/<pid>/numa_maps of tasks sharing
-	a shared memory region, when one task has installed shared policy on
-	one or more ranges of the region.
+	a shared memory region.  When one task has installed shared policy on
+	one or more ranges of the region, the numa_maps of that task will
+	show different policies than the numa_maps of other tasks mapping the
+	shared object.  However, the installed shared policy will be used for
+	all pages allocated for the shared object by any of the attached tasks.
+
+	When installing shared policy on a shared object, the virtual address
+	range specified can be viewed as a "direct mapped", linear window onto
+	the underlying object.  As a result, attempting to install a shared
+	memory policy on a non-linear, shared mapping WILL [probably] install
+	the policy for some range of the object, but this range will not
+	necessarily correspond to the actual pages mapped non-linearly into the virtual address range.  Thus, applying a shared policy to a non-linear
+	mapping can be considered an undefined operation.
 
 Components of Memory Policies
 
     A Linux memory policy consists of a "mode", optional mode flags, and an
     optional set of nodes.  The mode determines the behavior of the policy,
-    the optional mode flags determine the behavior of the mode, and the
-    optional set of nodes can be viewed as the arguments to the policy
-    behavior.
-
-   Internally, memory policies are implemented by a reference counted
-   structure, struct mempolicy.  Details of this structure will be discussed
-   in context, below, as required to explain the behavior.
+    the optional mode flags determine the behavior of the mode, and the optional
+    set of nodes can be viewed as the arguments to the policy behavior.
 
-   Linux memory policy supports the following 4 behavioral modes:
+    Internally, memory policies are implemented by a reference counted
+    structure, struct mempolicy.  Details of this structure will be discussed
+    in context, below, as required to explain the behavior.
+
+    Linux memory policy supports the following 4 behavioral modes:
 
 	Default Mode--MPOL_DEFAULT:  This mode is only used in the memory
 	policy APIs.  Internally, MPOL_DEFAULT is converted to the NULL
@@ -174,7 +188,6 @@ Components of Memory Policies
 	allocation fails, the kernel will search other nodes, in order of
 	increasing distance from the preferred node based on information
 	provided by the platform firmware.
-	containing the cpu where the allocation takes place.
 
 	    Internally, the Preferred policy uses a single node--the
 	    preferred_node member of struct mempolicy.  When the internal
@@ -185,9 +198,11 @@ Components of Memory Policies
 
 	    It is possible for the user to specify that local allocation is
 	    always preferred by passing an empty nodemask with this mode.
+	    Note that this is the only way to specify local allocation for
+	    a VMA or Shared policy when the task policy is non-default.
 	    If an empty nodemask is passed, the policy cannot use the
 	    MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags described
-	    below.
+	    in the MEMORY POLICIES AND CPUSETS section below.
 
 	MPOL_INTERLEAVED:  This mode specifies that page allocations be
 	interleaved, on a page granularity, across the nodes specified in
@@ -211,87 +226,13 @@ Components of Memory Policies
 	    on the order in which they are allocated, rather than based on any
 	    page offset into an address range or file.  During system boot up,
 	    the temporary interleaved system default policy works in this
-	    mode.
+	    mode to distribute boot-time allocations around the nodes with
+	    memory.
 
-   Linux memory policy supports the following optional mode flags:
+    Linux memory policy supports optional "mode flags" for controlling the
+    interaction of memory policies with cpuset resource constraints.  The flags
+    are described in the MEMORY POLICIES AND CPUSETS section below.
 
-	MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
-	the user should not be remapped if the task or VMA's set of allowed
-	nodes changes after the memory policy has been defined.
-
-	    Without this flag, anytime a mempolicy is rebound because of a
-	    change in the set of allowed nodes, the node (Preferred) or
-	    nodemask (Bind, Interleave) is remapped to the new set of
-	    allowed nodes.  This may result in nodes being used that were
-	    previously undesired.
-
-	    With this flag, if the user-specified nodes overlap with the
-	    nodes allowed by the task's cpuset, then the memory policy is
-	    applied to their intersection.  If the two sets of nodes do not
-	    overlap, the Default policy is used.
-
-	    For example, consider a task that is attached to a cpuset with
-	    mems 1-3 that sets an Interleave policy over the same set.  If
-	    the cpuset's mems change to 3-5, the Interleave will now occur
-	    over nodes 3, 4, and 5.  With this flag, however, since only node
-	    3 is allowed from the user's nodemask, the "interleave" only
-	    occurs over that node.  If no nodes from the user's nodemask are
-	    now allowed, the Default behavior is used.
-
-	    MPOL_F_STATIC_NODES cannot be combined with the
-	    MPOL_F_RELATIVE_NODES flag.  It also cannot be used for
-	    MPOL_PREFERRED policies that were created with an empty nodemask
-	    (local allocation).
-
-	MPOL_F_RELATIVE_NODES:  This flag specifies that the nodemask passed
-	by the user will be mapped relative to the set of the task or VMA's
-	set of allowed nodes.  The kernel stores the user-passed nodemask,
-	and if the allowed nodes changes, then that original nodemask will
-	be remapped relative to the new set of allowed nodes.
-
-	    Without this flag (and without MPOL_F_STATIC_NODES), anytime a
-	    mempolicy is rebound because of a change in the set of allowed
-	    nodes, the node (Preferred) or nodemask (Bind, Interleave) is
-	    remapped to the new set of allowed nodes.  That remap may not
-	    preserve the relative nature of the user's passed nodemask to its
-	    set of allowed nodes upon successive rebinds: a nodemask of
-	    1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
-	    allowed nodes is restored to its original state.
-
-	    With this flag, the remap is done so that the node numbers from
-	    the user's passed nodemask are relative to the set of allowed
-	    nodes.  In other words, if nodes 0, 2, and 4 are set in the user's
-	    nodemask, the policy will be effected over the first (and in the
-	    Bind or Interleave case, the third and fifth) nodes in the set of
-	    allowed nodes.  The nodemask passed by the user represents nodes
-	    relative to task or VMA's set of allowed nodes.
-
-	    If the user's nodemask includes nodes that are outside the range
-	    of the new set of allowed nodes (for example, node 5 is set in
-	    the user's nodemask when the set of allowed nodes is only 0-3),
-	    then the remap wraps around to the beginning of the nodemask and,
-	    if not already set, sets the node in the mempolicy nodemask.
-
-	    For example, consider a task that is attached to a cpuset with
-	    mems 2-5 that sets an Interleave policy over the same set with
-	    MPOL_F_RELATIVE_NODES.  If the cpuset's mems change to 3-7, the
-	    interleave now occurs over nodes 3,5-6.  If the cpuset's mems
-	    then change to 0,2-3,5, then the interleave occurs over nodes
-	    0,3,5.
-
-	    Thanks to the consistent remapping, applications preparing
-	    nodemasks to specify memory policies using this flag should
-	    disregard their current, actual cpuset imposed memory placement
-	    and prepare the nodemask as if they were always located on
-	    memory nodes 0 to N-1, where N is the number of memory nodes the
-	    policy is intended to manage.  Let the kernel then remap to the
-	    set of memory nodes allowed by the task's cpuset, as that may
-	    change over time.
-
-	    MPOL_F_RELATIVE_NODES cannot be combined with the
-	    MPOL_F_STATIC_NODES flag.  It also cannot be used for
-	    MPOL_PREFERRED policies that were created with an empty nodemask
-	    (local allocation).
 
 MEMORY POLICY REFERENCE COUNTING
 
@@ -435,19 +376,106 @@ MEMORY POLICIES AND CPUSETS
 Memory policies work within cpusets as described above.  For memory policies
 that require a node or set of nodes, the nodes are restricted to the set of
 nodes whose memories are allowed by the cpuset constraints.  If the nodemask
-specified for the policy contains nodes that are not allowed by the cpuset and
-MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes
-specified for the policy and the set of nodes with memory is used.  If the
-result is the empty set, the policy is considered invalid and cannot be
-installed.  If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped
-onto and folded into the task's set of allowed nodes as previously described.
+specified for the policy contains nodes that are not allowed by the cpuset [and
+MPOL_F_RELATIVE_NODES is not used--see below], the intersection of the set of
+nodes specified for the policy and the set of nodes with memory is used.  If
+the result is the empty set [and MPOL_F_STATIC_NODES is not used--see below],
+the policy is considered invalid and cannot be installed.
 
 The interaction of memory policies and cpusets can be problematic when tasks
 in two cpusets share access to a memory region, such as shared memory segments
 created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and
 any of the tasks install shared policy on the region, only nodes whose
-memories are allowed in both cpusets may be used in the policies.  Obtaining
-this information requires "stepping outside" the memory policy APIs to use the
-cpuset information and requires that one know in what cpusets other task might
-be attaching to the shared region.  Furthermore, if the cpusets' allowed
-memory sets are disjoint, "local" allocation is the only valid policy.
+memories are allowed in both cpusets may be used in the policies.  Since
+2.6.26, applications can determine the allowed memories using the
+get_mempolicy() API with the MPOL_F_MEMS_ALLOWED flag.  However, one still
+can't easily determine in what cpusets other tasks might be attaching to the
+shared region.  Furthermore, if the cpusets' allowed memory sets are disjoint,
+"local" allocation is the only valid policy.
+
+To address some of the issues with the interaction of memory policies with
+cpusets, Linux supports two optional "mode flags".  These flags modify the
+interpretation of the set of nodes associated with a memory policy when:
+
+    1) the cpuset does not allow all of the nodes specified in the policy,
+    2) the cpuset's allowed nodes change,
+    3) the task is moved to a cpuset with a different set of allowed nodes.
+
+
+    MPOL_F_STATIC_NODES:  This flag specifies that the nodemask passed by
+    the user should not be remapped if the set of nodes allowed by the
+    task's cpuset changes after the memory policy has been defined.
+
+	Without this flag, anytime a mempolicy is rebound because of a
+	change in the set of allowed nodes, the node (Preferred) or
+	nodemask (Bind, Interleave) is remapped to the new set of
+	allowed nodes.  This may result in nodes being used that were
+	previously undesired.
+
+	With this flag, if the user-specified nodes overlap with the
+	nodes allowed by the task's cpuset, then the memory policy is
+	applied to their intersection.  If the two sets of nodes do not
+	overlap, the Default policy is used.
+
+	For example, consider a task that is attached to a cpuset with
+	mems 1-3 that sets an Interleave policy over the same set.  If
+	the cpuset's mems change to 3-5, without this flag, the allocations
+	will now be interleaved over nodes 3, 4, and 5.  With this flag,
+	however, since only node 3 is allowed from the user's nodemask, the
+	pages will only be allocated from node 3.  If no nodes from the
+	user's nodemask are now allowed, the Default behavior is used.
+
+	MPOL_F_STATIC_NODES cannot be combined with the
+	MPOL_F_RELATIVE_NODES flag.  It also cannot be used for
+	MPOL_PREFERRED policies that were created with an empty nodemask
+	(local allocation).
+
+    MPOL_F_RELATIVE_NODES:  This flag specifies that the nodemask passed
+    by the user should be mapped relative to the set of nodes allowed by
+    the task's cpuset.  The kernel stores the user-passed nodemask, and
+    if the allowed nodes changes, then that original nodemask will be
+    remapped relative to the new set of allowed nodes.
+
+	Without this flag (and without MPOL_F_STATIC_NODES), anytime a
+	mempolicy is rebound because of a change in the set of allowed
+	nodes, the node (Preferred) or nodemask (Bind, Interleave) is
+	remapped to the new set of allowed nodes.  That remap may not
+	preserve the relative nature of the user's passed nodemask to its
+	set of allowed nodes upon successive rebinds: a nodemask of
+	1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
+	allowed nodes is restored to its original state.
+
+	With this flag, the remap is done so that the node numbers from
+	the user's passed nodemask are relative to the set of allowed
+	nodes.  In other words, if nodes 0, 2, and 4 are set in the user's
+	nodemask, the policy will be effected over the first (and in the
+	Bind or Interleave case, the third and fifth) nodes in the set of
+	allowed nodes.  The nodemask passed by the user represents nodes
+	relative to task or VMA's set of allowed nodes.
+
+	If the user's nodemask includes nodes that are outside the range
+	of the new set of allowed nodes (for example, node 5 is set in
+	the user's nodemask when the set of allowed nodes is only 0-3),
+	then the remap wraps around to the beginning of the nodemask and,
+	if not already set, sets the node in the mempolicy nodemask.
+
+	For example, consider a task that is attached to a cpuset with
+	mems 2-5 that sets an Interleave policy over the same set with
+	MPOL_F_RELATIVE_NODES.  If the cpuset's mems change to 3-7, the
+	interleave now occurs over nodes 3,5-6.  If the cpuset's mems
+	then change to 0,2-3,5, then the interleave occurs over nodes
+	0,3,5.
+
+	Thanks to the consistent remapping, applications preparing
+	nodemasks to specify memory policies using this flag should
+	disregard their current, actual cpuset imposed memory placement
+	and prepare the nodemask as if they were always located on
+	memory nodes 0 to N-1, where N is the number of memory nodes the
+	policy is intended to manage.  Let the kernel then remap to the
+	set of memory nodes allowed by the task's cpuset, as that may
+	change over time.
+
+	MPOL_F_RELATIVE_NODES cannot be combined with the
+	MPOL_F_STATIC_NODES flag.  It also cannot be used for
+	MPOL_PREFERRED policies that were created with an empty nodemask
+	(local allocation).
--
To unsubscribe from this list: send the line "unsubscribe linux-numa" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html