[PATCH/RFC 6/14] Shared Policy: Factor alloc_page_pol routine

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Shared Policy Infrastructure - Factor alloc_page_pol routine

Implement alloc_page_pol() to allocate a page given a policy and
an offset [for interleaving].  Neither a vma nor an addr is needed.  This
function will be used to allocate page_cache pages--e.g., for tmpfs
files--given the policy at a given page offset, simplifying the
shmem page allocation functions.

Revise alloc_page_vma() to simply call alloc_page_pol() after looking
up the vma policy, to eliminate duplicate code.  This change rippled
into the interleaving functions.  I was able to eliminate
interleave_nid() by computing the offset at the call sites where it
was not already available and calling [modified] offset_il_node()
directly.

	Also removed the vma arg from offset_il_node(), as it was
	not used there and is not available when called from
	alloc_page_pol().

Note regarding alloc_page_vma():  it can be called with vma == NULL via
read_swap_cache_async() from try_to_unuse().  We can't compute a page
offset in this case.  This means that pages read by "swapoff"
don't/can't follow vma policy.  This is current behavior.  Similarly,
swapin readahead reads multiple swap pages, almost certainly associated
with different tasks, with the vma from the first swap page read--the
one that caused the fault.  Again, we can't compute the correct policy
for those pages, and this is current behavior.


Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx>

 include/linux/gfp.h |    3 +
 include/linux/mm.h  |   12 ++++-
 mm/hugetlb.c        |   10 ++++
 mm/mempolicy.c      |  107 ++++++++++++++++++++++++++++------------------------
 4 files changed, 81 insertions(+), 51 deletions(-)

Index: linux-2.6.36-mmotm-101103-1217/include/linux/gfp.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/gfp.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/gfp.h
@@ -327,10 +327,13 @@ alloc_pages(gfp_t gfp_mask, unsigned int
 }
 extern struct page *alloc_page_vma(gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr);
+struct mempolicy;
+extern struct page *alloc_page_pol(gfp_t, struct mempolicy *, pgoff_t);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
 #define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
+#define alloc_page_pol(gfp_mask, pol, off)  alloc_pages(gfp_mask, 0)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 
Index: linux-2.6.36-mmotm-101103-1217/include/linux/mm.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/mm.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/mm.h
@@ -1235,15 +1235,23 @@ extern void setup_per_cpu_pageset(void);
 
 /*
  * Address to offset for policy lookup and interleave calculation.
- * Placed here because it needs struct vma definition.
+ * Placed here because it needs struct vma definition and we
+ * can't easily include mm.h in mempolicy.h, nor can we include
+ * hugetlb.h here.  Thus, the extern below.
  */
 static inline pgoff_t vma_mpol_pgoff(struct vm_area_struct *vma,
 						unsigned long addr)
 {
+	extern pgoff_t vma_huge_mpol_offset(struct vm_area_struct *,
+						unsigned long);
+
+	if (unlikely(vma->vm_flags & VM_HUGETLB))
+		return vma_huge_mpol_offset(vma, addr);
+
 	return ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 }
 
-static inline pgoff_t vma_mpol_addr(struct vm_area_struct *vma,
+static inline unsigned long vma_mpol_addr(struct vm_area_struct *vma,
 						pgoff_t pgoff)
 {
 	return ((pgoff - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c
+++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
@@ -35,6 +35,7 @@
  *                use the process policy. This is what Linux always did
  *		  in a NUMA aware kernel and still does by, ahem, default.
  *
+//TODO:  following needs paragraph rewording.  haven't figured out what to say.
  * The process policy is applied for most non interrupt memory allocations
  * in that process' context. Interrupts ignore the policies and always
  * try to allocate on the local CPU. The VMA policy is only applied for memory
@@ -50,15 +51,15 @@
  * Same with GFP_DMA allocations.
  *
  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
- * all users and remembered even when nobody has memory mapped.
+ * all users and remembered even when nobody has memory mapped. Shared
+ * policies handle mempolicies on sub-ranges of the object using a
+ * red/black tree.  These policies persist until explicitly removed or
+ * the backing file is destroyed.
  */
 
 /* Notebook:
-   fix mmap readahead to honour policy and enable policy for any page cache
-   object
    statistics for bigpages
-   global policy for page cache? currently it uses process policy. Requires
-   first item above.
+   global policy for page cache?
    handle mremap for shared memory (currently ignored for the policy)
    grows down?
    make bind policy root only? It can trigger oom much faster and the
@@ -1671,9 +1672,10 @@ unsigned slab_node(struct mempolicy *pol
 	}
 }
 
-/* Do static interleaving for a VMA with known offset. */
-static unsigned offset_il_node(struct mempolicy *pol,
-		struct vm_area_struct *vma, unsigned long off)
+/*
+ * Do static interleaving for a policy with known offset.
+ */
+static unsigned offset_il_node(struct mempolicy *pol, pgoff_t off)
 {
 	unsigned nnodes = nodes_weight(pol->v.nodes);
 	unsigned target;
@@ -1691,28 +1693,6 @@ static unsigned offset_il_node(struct me
 	return nid;
 }
 
-/* Determine a node number for interleave */
-static inline unsigned interleave_nid(struct mempolicy *pol,
-		 struct vm_area_struct *vma, unsigned long addr, int shift)
-{
-	if (vma) {
-		unsigned long off;
-
-		/*
-		 * for small pages, there is no difference between
-		 * shift and PAGE_SHIFT, so the bit-shift is safe.
-		 * for huge pages, since vm_pgoff is in units of small
-		 * pages, we need to shift off the always 0 bits to get
-		 * a useful offset.
-		 */
-		BUG_ON(shift < PAGE_SHIFT);
-		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
-		off += (addr - vma->vm_start) >> shift;
-		return offset_il_node(pol, vma, off);
-	} else
-		return interleave_nodes(pol);
-}
-
 #ifdef CONFIG_HUGETLBFS
 /*
  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
@@ -1739,8 +1719,9 @@ struct zonelist *huge_zonelist(struct vm
 	*nodemask = NULL;	/* assume !MPOL_BIND */
 
 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
-		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
-				huge_page_shift(hstate_vma(vma))), gfp_flags);
+		zl = node_zonelist(
+			offset_il_node(*mpol, vma_mpol_pgoff(vma, addr)),
+			gfp_flags);
 	} else {
 		zl = policy_zonelist(gfp_flags, *mpol);
 		if ((*mpol)->mode == MPOL_BIND)
@@ -1859,31 +1840,27 @@ static struct page *alloc_page_interleav
 }
 
 /**
- * 	alloc_page_vma	- Allocate a page for a VMA.
+ * alloc_page_pol() -- allocate a page based on policy,offset.
  *
- * 	@gfp:
+ * @gfp   - gfp mask [flags + zone] for allocation
  *      %GFP_USER    user allocation.
  *      %GFP_KERNEL  kernel allocations,
  *      %GFP_HIGHMEM highmem/user allocations,
  *      %GFP_FS      allocation should not call back into a file system.
  *      %GFP_ATOMIC  don't sleep.
  *
- * 	@vma:  Pointer to VMA or NULL if not available.
- *	@addr: Virtual Address of the allocation. Must be inside the VMA.
+ * @pol   - policy to use for allocation
+ * @pgoff - page offset for interleaving -- used only if interleave policy
  *
- * 	This function allocates a page from the kernel page pool and applies
- *	a NUMA policy associated with the VMA or the current process.
- *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
- *	mm_struct of the VMA to prevent it from going away. Should be used for
- *	all allocations for pages that will be mapped into
- * 	user space. Returns NULL when no page can be allocated.
+ *	This function allocates a page from the kernel page pool and applies
+ *	the NUMA memory policy @pol, possibly indexed by @pgoff.  Should be
+ *	used for all allocations for anonymous pages that will be mapped into
+ *	user space.  Returns NULL when no page can be allocated.
  *
- *	Should be called with the mm_sem of the vma hold.
+ * Note:  extra ref on shared policies dropped on return.
  */
-struct page *
-alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+struct page *alloc_page_pol(gfp_t gfp, struct mempolicy *pol, pgoff_t pgoff)
 {
-	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 	struct page *page;
 
@@ -1891,7 +1868,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+		nid = offset_il_node(pol, pgoff);
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, 0, nid);
 		put_mems_allowed();
@@ -1902,8 +1879,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area
 		/*
 		 * slow path: ref counted shared policy
 		 */
-		struct page *page =  __alloc_pages_nodemask(gfp, 0,
-						zl, policy_nodemask(gfp, pol));
+		struct page *page =  __alloc_pages_nodemask(gfp, 0, zl,
+						policy_nodemask(gfp, pol));
 		__mpol_put(pol);
 		put_mems_allowed();
 		return page;
@@ -1915,6 +1892,38 @@ alloc_page_vma(gfp_t gfp, struct vm_area
 	put_mems_allowed();
 	return page;
 }
+EXPORT_SYMBOL(alloc_page_pol);
+
+/**
+ *	alloc_page_vma	- Allocate a page for a VMA.
+ *
+ *	@gfp:
+ *      %GFP_USER    user allocation.
+ *      %GFP_KERNEL  kernel allocations,
+ *      %GFP_HIGHMEM highmem/user allocations,
+ *      %GFP_FS      allocation should not call back into a file system.
+ *      %GFP_ATOMIC  don't sleep.
+ *
+ *	@vma:  Pointer to VMA or NULL if not available.
+ *	@addr: Virtual Address of the allocation. Must be inside the VMA.
+ *
+ *	This function allocates a page from the kernel page pool and applies
+ *	a NUMA policy associated with the VMA or the current process.
+ *	When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ *	mm_struct of the VMA to prevent it from going away. Should be used for
+ *	all allocations for anonymous pages that will be mapped into
+ *	user space. Returns NULL when no page can be allocated.
+ */
+struct page *
+alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+	pgoff_t pgoff = 0;
+
+	if (likely(vma))
+		pgoff = vma_mpol_pgoff(vma, addr);
+	return alloc_page_pol(gfp, pol, pgoff);
+}
 
 /**
  * 	alloc_pages_current - Allocate pages.
Index: linux-2.6.36-mmotm-101103-1217/mm/hugetlb.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/hugetlb.c
+++ linux-2.6.36-mmotm-101103-1217/mm/hugetlb.c
@@ -230,6 +230,16 @@ pgoff_t linear_hugepage_index(struct vm_
 }
 
 /*
+ * As above, given just vma and address.
+ * For computing huge page offset for interleave mempolicy
+ */
+pgoff_t vma_huge_mpol_offset(struct vm_area_struct *vma,
+					unsigned long address)
+{
+	return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
+/*
  * Return the size of the pages allocated when backing a VMA. In the majority
  * cases this will be same size as used by the page table entries.
  */
--
To unsubscribe from this list: send the line "unsubscribe linux-numa" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]     [Devices]

  Powered by Linux