Shared Policy Infrastructure - use alloc_page_pol() for shmem and swap cache allocations

Now that we have alloc_page_pol() to allocate a page given a policy, we
can use it to "simplify" shmem and swap cache page allocations.  This
eliminates the need for pseudo-vmas on stack for shmem page allocations
and moves us towards a "policy + offset" model for page cache
allocations, rather than a "vma + address" model.  The vma and address
are not [both] available everywhere we would like to do policy-based
page allocation, whereas the policy and pgoff usually are.  This does
mean, however, that we need to be aware of mempolicy reference counting
in swapin read-ahead.

read_swap_cache_async() and swapin_readahead() have been changed to take
a policy and page offset [for interleaving] instead of a vma and
address.  swapin_readahead() passes the policy and pgoff to
read_swap_cache_async() to do the read.  read_swap_cache_async() now
uses alloc_page_pol() with the policy and offset, instead of
alloc_page_vma().

Note that the pgoff used by swapin_readahead() is essentially bogus for
all but the first read.  This was already the case for the 'address'
argument before this patch.  With this patch, swapin_readahead() holds
pgoff constant to select the same node for each readahead page, in the
case of interleave policy.  This preserves pre-existing behavior.

shmem_swapin() now uses get_file_policy() directly to look up the shared
policy on the shmem pseudo-file, which it passes to swapin_readahead().

swapin_readahead() can call read_swap_cache_async() multiple times in a
loop before the final tail call.  read_swap_cache_async() itself may
loop to retry [in case of swapin races?].  To avoid multiple "frees" of
the shared policy, swapin_readahead() makes a "conditional unshared"
copy of the policy on stack via mpol_cond_copy().  This releases the
extra ref for a shared policy, and is effectively a no-op for a
non-shared policy.  Because the copy is non-shared, alloc_page_pol()
will not attempt to decrement the reference count.

Note that get_vma_policy() becomes an in-kernel global for use outside
of mempolicy.c, like get_file_policy(), to look up vma-based policy for
other calls to swapin_readahead().  Again, use of get_vma_policy()
balances reference counts with mpol_cond_copy() in swapin_readahead().

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx>

 include/linux/mempolicy.h |    8 ++++++
 include/linux/swap.h      |    6 ++--
 mm/memory.c               |    5 +++
 mm/mempolicy.c            |    2 -
 mm/shmem.c                |   58 +++++++++++++++------------------------
 mm/swap_state.c           |   31 +++++++++++++++++-------
 mm/swapfile.c             |   14 ++++++++---
 7 files changed, 68 insertions(+), 56 deletions(-)
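For reference, the "conditional unshared" copy mentioned above is made
with the pre-existing mpol_cond_copy() helper [mm/mempolicy.c, not
modified by this patch].  Roughly, it behaves like the following
simplified sketch [the real helper is split between mempolicy.h and
mempolicy.c]:

	struct mempolicy *mpol_cond_copy(struct mempolicy *tompol,
						struct mempolicy *frompol)
	{
		if (!mpol_needs_cond_ref(frompol))
			return frompol;	/* not shared:  no ref held */

		*tompol = *frompol;	/* shallow copy onto caller's stack */
		tompol->flags &= ~MPOL_F_SHARED; /* copy must not be unref'd */
		__mpol_put(frompol);	/* release the lookup reference */
		return tompol;
	}

Because the on-stack copy has MPOL_F_SHARED cleared, the read-ahead and
retry loops can use it any number of times without underflowing the
shared policy's reference count.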
Index: linux-2.6.36-mmotm-101103-1217/mm/swap_state.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/swap_state.c
+++ linux-2.6.36-mmotm-101103-1217/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/page_cgroup.h>
 
@@ -275,9 +276,11 @@ struct page * lookup_swap_cache(swp_entr
  * and reading the disk if it is not already cached.
  * A failure return means that either the page allocation failed or that
  * the swap entry is no longer in use.
+ *
+ * This function will drop any incoming conditional reference on @pol.
  */
 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-			struct vm_area_struct *vma, unsigned long addr)
+			struct mempolicy *pol, pgoff_t pgoff)
 {
 	struct page *found_page, *new_page = NULL;
 	int err;
 
@@ -296,7 +299,8 @@ struct page *read_swap_cache_async(swp_e
 		 * Get a new page to read into from swap.
 		 */
 		if (!new_page) {
-			new_page = alloc_page_vma(gfp_mask, vma, addr);
+			new_page = alloc_page_pol(gfp_mask,
+							pol, pgoff);
 			if (!new_page)
 				break;		/* Out of memory */
 		}
 
@@ -353,8 +357,8 @@ struct page *read_swap_cache_async(swp_e
  * swapin_readahead - swap in pages in hope we need them soon
  * @entry: swap entry of this memory
  * @gfp_mask: memory allocation flags
- * @vma: user vma this address belongs to
- * @addr: target address for mempolicy
+ * @pol: mempolicy that controls allocation
+ * @pgoff: page offset for interleave policy
  *
- * Returns the struct page for entry and addr, after queueing swapin.
+ * Returns the struct page for entry and pgoff, after queueing swapin.
  *
@@ -369,29 +373,38 @@ struct page *read_swap_cache_async(swp_e
- * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
+ * Any incoming conditional reference on @pol is dropped before return.
  */
 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
-			struct vm_area_struct *vma, unsigned long addr)
+			struct mempolicy *pol, pgoff_t pgoff)
 {
-	int nr_pages;
+	struct mempolicy mpol;
 	struct page *page;
 	unsigned long offset;
 	unsigned long end_offset;
+	int nr_pages;
+
+	/*
+	 * Make a non-shared copy of pol and release the incoming ref, if
+	 * necessary, for the read-ahead loop and the
+	 * read_swap_cache_async() retry loop.
+	 */
+	pol = mpol_cond_copy(&mpol, pol);
 
 	/*
 	 * Get starting offset for readaround, and number of pages to read.
 	 * Adjust starting address by readbehind (for NUMA interleave case)?
 	 * No, it's very unlikely that swap layout would follow vma layout,
 	 * more likely that neighbouring swap pages came from the same node:
-	 * so use the same "addr" to choose the same node for each swap read.
+	 * so use the same "pgoff" to choose the same node for each swap read.
 	 */
 	nr_pages = valid_swaphandles(entry, &offset);
 	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
+		/* Ok, do the async read-ahead now */
 		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
-						gfp_mask, vma, addr);
+						gfp_mask, pol, pgoff);
 		if (!page)
 			break;
 		page_cache_release(page);
 	}
 	lru_add_drain();	/* Push any new pages onto the LRU now */
-	return read_swap_cache_async(entry, gfp_mask, vma, addr);
+	return read_swap_cache_async(entry, gfp_mask, pol, pgoff);
 }
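swapin_readahead() and read_swap_cache_async() above depend on the
reference-dropping contract of alloc_page_pol(), which was introduced by
an earlier patch in this series and is not shown here.  A rough sketch
of the assumed behavior [offset_il_node(), policy_zonelist() and
policy_nodemask() are existing mempolicy.c internals; the real
implementation may well differ]:

	struct page *alloc_page_pol(gfp_t gfp, struct mempolicy *pol,
					pgoff_t pgoff)
	{
		struct page *page;

		if (pol->mode == MPOL_INTERLEAVE) {
			/* spread allocations across nodes by page offset */
			unsigned nid = offset_il_node(pol, NULL, pgoff);

			page = alloc_pages_exact_node(nid, gfp, 0);
		} else {
			page = __alloc_pages_nodemask(gfp, 0,
					policy_zonelist(gfp, pol),
					policy_nodemask(gfp, pol));
		}
		if (unlikely(mpol_needs_cond_ref(pol)))
			__mpol_put(pol); /* drop ref taken at policy lookup */
		return page;
	}

The key point for this patch is the final conditional put:  a caller
such as shmem_alloc_page() can pass a shared policy straight through and
the lookup reference is dropped here, while an on-stack copy from
mpol_cond_copy() has MPOL_F_SHARED cleared and is left alone.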
Index: linux-2.6.36-mmotm-101103-1217/include/linux/swap.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/swap.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/swap.h
@@ -317,9 +317,9 @@ extern void free_page_and_swap_cache(str
 extern void free_pages_and_swap_cache(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t);
 extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
-			struct vm_area_struct *vma, unsigned long addr);
+			struct mempolicy *, pgoff_t);
 extern struct page *swapin_readahead(swp_entry_t, gfp_t,
-			struct vm_area_struct *vma, unsigned long addr);
+			struct mempolicy *, pgoff_t);
 
 /* linux/mm/swapfile.c */
 extern long nr_swap_pages;
@@ -427,7 +427,7 @@ static inline void swapcache_free(swp_en
 }
 
 static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
-			struct vm_area_struct *vma, unsigned long addr)
+			struct mempolicy *pol, pgoff_t pgoff)
 {
 	return NULL;
 }

Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c
+++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
@@ -1571,7 +1571,7 @@ struct mempolicy *get_file_policy(struct
  * freeing by another task. It is the caller's responsibility to free the
  * extra reference for shared policies.
  */
-static struct mempolicy *get_vma_policy(struct task_struct *task,
+struct mempolicy *get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;

Index: linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/mempolicy.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h
@@ -193,6 +193,8 @@ extern void mpol_rebind_task(struct task
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
 extern void mpol_fix_fork_child_flag(struct task_struct *p);
 
+extern struct mempolicy *get_vma_policy(struct task_struct *,
+				struct vm_area_struct *, unsigned long);
 extern struct mempolicy *get_file_policy(struct address_space *, pgoff_t);
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 				unsigned long addr, gfp_t gfp_flags,
@@ -324,6 +326,12 @@ static inline bool mempolicy_nodemask_in
 	return false;
 }
 
+static inline struct mempolicy *get_vma_policy(struct task_struct *task,
+		struct vm_area_struct *vma, unsigned long addr)
+{
+	return NULL;
+}
+
 static inline struct mempolicy *get_file_policy(struct address_space *, pgoff_t)
 {
 	return NULL;
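Both lookup helpers hand the caller a conditional reference:  for a
shared policy [MPOL_F_SHARED], the caller owns an extra ref that must be
released, which is exactly what mpol_cond_copy()/alloc_page_pol() do
downstream.  get_file_policy() itself comes from an earlier patch in
this series; presumably it is little more than a wrapper around the
shared-policy tree lookup, something like [sketch only;
mapping_shared_policy() is also from the earlier patch]:

	struct mempolicy *get_file_policy(struct address_space *mapping,
						pgoff_t pgoff)
	{
		/* takes a ref on a shared policy; may return NULL */
		return mpol_shared_policy_lookup(
				mapping_shared_policy(mapping), pgoff);
	}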
Index: linux-2.6.36-mmotm-101103-1217/mm/swapfile.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/swapfile.c
+++ linux-2.6.36-mmotm-101103-1217/mm/swapfile.c
@@ -31,6 +31,7 @@
 #include <linux/syscalls.h>
 #include <linux/memcontrol.h>
 #include <linux/poll.h>
+#include <linux/mempolicy.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -1096,6 +1097,7 @@ static int try_to_unuse(unsigned int typ
 	struct mm_struct *start_mm;
 	unsigned char *swap_map;
 	unsigned char swcount;
+	struct mempolicy *pol;
 	struct page *page;
 	swp_entry_t entry;
 	unsigned int i = 0;
@@ -1132,12 +1134,18 @@ static int try_to_unuse(unsigned int typ
 		/*
 		 * Get a page for the entry, using the existing swap
 		 * cache page if there is one. Otherwise, get a clean
-		 * page and read the swap into it.
+		 * page and read the swap into it.  Use a dummy policy
+		 * [current task's policy or system default] with the
+		 * swap cache index for interleaving to allocate the new
+		 * page.  Note: read_swap_cache_async() drops the reference
+		 * on the policy, so we must refetch the policy for each
+		 * call.  Not a performance concern in this loop.
 		 */
 		swap_map = &si->swap_map[i];
 		entry = swp_entry(type, i);
-		page = read_swap_cache_async(entry,
-					GFP_HIGHUSER_MOVABLE, NULL, 0);
+		pol = get_vma_policy(current, NULL, 0);
+		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+						pol, i);
 		if (!page) {
 			/*
 			 * Either swap_duplicate() failed because entry

Index: linux-2.6.36-mmotm-101103-1217/mm/memory.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/memory.c
+++ linux-2.6.36-mmotm-101103-1217/mm/memory.c
@@ -2651,9 +2651,12 @@ static int do_swap_page(struct mm_struct
 		delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 		page = lookup_swap_cache(entry);
 		if (!page) {
+			struct mempolicy *pol = get_vma_policy(current, vma, address);
+			pgoff_t pgoff = vma_mpol_pgoff(vma, address);
+
 			grab_swap_token(mm); /* Contend for token _before_ read-in */
 			page = swapin_readahead(entry,
-						GFP_HIGHUSER_MOVABLE, vma, address);
+						GFP_HIGHUSER_MOVABLE, pol, pgoff);
 			if (!page) {
 				/*
 				 * Back out if somebody else faulted in this pte

Index: linux-2.6.36-mmotm-101103-1217/mm/shmem.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/shmem.c
+++ linux-2.6.36-mmotm-101103-1217/mm/shmem.c
@@ -1146,39 +1146,21 @@ static struct mempolicy *shmem_get_sbmpo
 }
 #endif /* CONFIG_TMPFS */
 
-struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
-			struct shared_policy *sp, unsigned long idx)
+struct page *shmem_swapin(swp_entry_t entry,
+			struct address_space *mapping, unsigned long idx)
 {
-	struct mempolicy mpol, *spol;
-	struct vm_area_struct pvma;
-	struct page *page;
-
-	spol = mpol_cond_copy(&mpol, mpol_shared_policy_lookup(sp, idx));
-
-	/* Create a pseudo vma that just contains the policy */
-	pvma.vm_start = 0;
-	pvma.vm_pgoff = idx;
-	pvma.vm_file = NULL;
-	pvma.vm_policy = spol;
-	page = swapin_readahead(entry, gfp, &pvma, 0);
-	return page;
+	return swapin_readahead(entry, mapping_gfp_mask(mapping),
+				get_file_policy(mapping, idx), idx);
 }
 
-static struct page *shmem_alloc_page(gfp_t gfp, struct shared_policy *sp,
-			unsigned long idx)
+static inline struct page *shmem_alloc_page(struct address_space *mapping,
+						unsigned long idx)
 {
-	struct vm_area_struct pvma;
-
-	/* Create a pseudo vma that just contains the policy */
-	pvma.vm_start = 0;
-	pvma.vm_pgoff = idx;
-	pvma.vm_file = NULL;
-	pvma.vm_policy = mpol_shared_policy_lookup(sp, idx);
-
 	/*
-	 * alloc_page_vma() will drop the shared policy reference
+	 * alloc_page_pol() will drop the shared policy reference
 	 */
-	return alloc_page_vma(gfp, &pvma, 0);
+	return alloc_page_pol(mapping_gfp_mask(mapping) | __GFP_ZERO,
+				get_file_policy(mapping, idx), idx);
 }
 #else /* !CONFIG_NUMA */
 #ifdef CONFIG_TMPFS
@@ -1187,16 +1169,17 @@ static inline void shmem_show_mpol(struc
 }
 #endif /* CONFIG_TMPFS */
 
-static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, void *sp,
-			unsigned long idx)
+static inline struct page *shmem_swapin(swp_entry_t entry,
+					struct address_space *mapping,
+					unsigned long idx)
 {
-	return swapin_readahead(entry, gfp, NULL, 0);
+	return swapin_readahead(entry, mapping_gfp_mask(mapping), NULL, 0);
 }
 
-static inline struct page *shmem_alloc_page(gfp_t gfp, void *sp,
-			unsigned long idx)
+static inline struct page *shmem_alloc_page(struct address_space *mapping,
+						unsigned long idx)
 {
-	return alloc_page(gfp);
+	return alloc_page(mapping_gfp_mask(mapping) | __GFP_ZERO);
 }
 #endif /* CONFIG_NUMA */
 
@@ -1259,8 +1242,7 @@ repeat:
 		radix_tree_preload_end();
 		if (sgp != SGP_READ && !prealloc_page) {
 			/* We don't care if this fails */
-			prealloc_page = shmem_alloc_page(gfp,
-					mapping_shared_policy(mapping), idx);
+			prealloc_page = shmem_alloc_page(mapping, idx);
 			if (prealloc_page) {
 				if (mem_cgroup_cache_charge(prealloc_page,
 						current->mm, GFP_KERNEL)) {
@@ -1293,8 +1275,7 @@ repeat:
 				*type |= VM_FAULT_MAJOR;
 			}
 			spin_unlock(&info->lock);
-			swappage = shmem_swapin(swap, gfp,
-					mapping_shared_policy(mapping), idx);
+			swappage = shmem_swapin(swap, mapping, idx);
 			if (!swappage) {
 				spin_lock(&info->lock);
 				entry = shmem_swp_alloc(info, idx, sgp);
@@ -1421,8 +1402,7 @@ repeat:
 			if (!prealloc_page) {
 				spin_unlock(&info->lock);
-				filepage = shmem_alloc_page(gfp,
-						mapping_shared_policy(mapping), idx);
+				filepage = shmem_alloc_page(mapping, idx);
 				if (!filepage) {
 					shmem_unacct_blocks(info->flags, 1);
 					shmem_free_blocks(inode, 1);
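Putting it together, a caller that wants policy-aware swapin now follows
the pattern below [a sketch mirroring the do_swap_page() hunk above;
vma_mpol_pgoff() comes from an earlier patch in this series]:

	struct mempolicy *pol;
	pgoff_t pgoff;
	struct page *page;

	/* may take a conditional ref on a shared [e.g., shm] policy */
	pol = get_vma_policy(current, vma, address);
	pgoff = vma_mpol_pgoff(vma, address);

	/*
	 * swapin_readahead() makes a non-shared on-stack copy via
	 * mpol_cond_copy(), releasing the conditional ref, so the
	 * caller must not mpol_put() afterwards.
	 */
	page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, pol, pgoff);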