Shared Policy Infrastructure - use shared policy for page cache allocations This patch implements a "get_file_policy()" function, analogous to get_vma_policy(), but for a given file[inode/mapping] at a specified offset, using the shared_policy, if any, in the file's address_space. If no shared policy, returns the process policy of the argument task [to match get_vma_policy() args] or default policy, if no process policy. Note that for a file policy to exist [on other than shmem segments] the file must currently be mmap()ed into a task's address space with MAP_SHARED, with the policy installed via mbind(). A later patch will hook up the generic file mempolicy vm_ops and define a per cpuset control file to enable this semantic. Default will be same as current behavior-- no policy on shared file mapping. Details: Revert [__]page_cache_alloc() to take mapping argument as it once used to. I need that to locate the shared policy. Add pgoff_t argument. Fix up page_cache_alloc() and page_cache_alloc_cold() in pagemap.h and all direct callers of __page_cache_alloc() accordingly. Modify __page_cache_alloc() to use get_file_policy() and alloc_page_pol(). Again, without generic file mempolicy, this behaves the same as alloc_page_current(). page_cache_alloc*() now take an additional offset/index argument, available at all call sites, to lookup the appropriate policy and to compute interleave node for interleave policy. This patch fixes all in-tree users of the modified interfaces. Re: interaction with cpusets page spread: if the file has a shared policy structure attached, that policy takes precedence over spreading. Now that we have get_file_policy() and alloc_page_pol(), we can eliminate another case of a pseudo-vma on the stack and use the new infrastructure to allocate shmem pages. This will be done in subsequent patches. Re: ceph fs calls to __page_cache_alloc(): these are the only calls where an inode/mapping and page offset/index are not available. 
As such, they don't seem to be bona fide page cache allocations. So, I've replaced them with direct calls to alloc_page() as this is what page_cache_alloc() evaluated to before this series. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> fs/btrfs/compression.c | 4 +-- fs/cachefiles/rdwr.c | 6 +++-- fs/ntfs/file.c | 2 - fs/splice.c | 2 - include/linux/mempolicy.h | 8 +++++++ include/linux/pagemap.h | 18 ++++++++++------ mm/filemap.c | 50 +++++++++++++++++++++++++++++++++++----------- mm/mempolicy.c | 25 +++++++++++++++++++++-- mm/readahead.c | 2 - net/ceph/messenger.c | 2 - net/ceph/pagelist.c | 4 +-- net/ceph/pagevec.c | 2 - 12 files changed, 94 insertions(+), 31 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/mm/filemap.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/filemap.c +++ linux-2.6.36-mmotm-101103-1217/mm/filemap.c @@ -35,6 +35,8 @@ #include <linux/memcontrol.h> #include <linux/mm_inline.h> /* for page_is_file_cache() */ #include <linux/cleancache.h> +#include <linux/mempolicy.h> + #include "internal.h" /* @@ -472,19 +474,43 @@ int add_to_page_cache_lru(struct page *p EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA -struct page *__page_cache_alloc(gfp_t gfp) +/** + * __page_cache_alloc - allocate a page cache page + * @mapping - address_space for which page will be allocated + * @pgoff - page index in mapping -- for mem policy + * @gfp - gfp flags + * + * If the mapping does not contain a shared policy, and page cache spreading + * is enabled for the current context's cpuset, allocate a page from the node + * indicated by page cache spreading. + * + * Otherwise, fetch the memory policy at the indicated pgoff and allocate + * a page according to that policy. Note that if the mapping does not + * have a shared policy, the allocation will use the task policy, if any, + * else the system default policy. + * + * All allocations will use the specified gfp mask. 
+ */ +struct page *__page_cache_alloc(struct address_space *mapping, pgoff_t pgoff, + gfp_t gfp) { - int n; + struct mempolicy *pol; struct page *page; + int n; - if (cpuset_do_page_mem_spread()) { + /* + * Consider spreading only if no shared_policy + */ + if (!mapping->spolicy && cpuset_do_page_mem_spread()) { get_mems_allowed(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); put_mems_allowed(); return page; - } - return alloc_pages(gfp, 0); + } else + pol = get_file_policy(mapping, pgoff); + + return alloc_page_pol(gfp, pol, pgoff); } EXPORT_SYMBOL(__page_cache_alloc); #endif @@ -733,7 +759,7 @@ struct page *find_or_create_page(struct repeat: page = find_lock_page(mapping, index); if (!page) { - page = __page_cache_alloc(gfp_mask); + page = __page_cache_alloc(mapping, index, gfp_mask); if (!page) return NULL; /* @@ -951,7 +977,8 @@ grab_cache_page_nowait(struct address_sp page_cache_release(page); return NULL; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + page = __page_cache_alloc(mapping, index, + mapping_gfp_mask(mapping) & ~__GFP_FS); if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { page_cache_release(page); page = NULL; @@ -1182,7 +1209,7 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. 
*/ - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc_cold(mapping, index); if (!page) { desc->error = -ENOMEM; goto out; } @@ -1440,7 +1467,7 @@ static int page_cache_read(struct file * int ret; do { - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc_cold(mapping, offset); if (!page) return -ENOMEM; @@ -1709,7 +1736,7 @@ static struct page *__read_cache_page(st repeat: page = find_get_page(mapping, index); if (!page) { - page = __page_cache_alloc(gfp | __GFP_COLD); + page = __page_cache_alloc(mapping, index, gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); @@ -2234,7 +2261,8 @@ repeat: if (likely(page)) return page; - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); + page = __page_cache_alloc(mapping, index, + mapping_gfp_mask(mapping) & ~gfp_notmask); if (!page) return NULL; status = add_to_page_cache_lru(page, mapping, index, Index: linux-2.6.36-mmotm-101103-1217/include/linux/pagemap.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/pagemap.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/pagemap.h @@ -201,22 +201,26 @@ static inline void page_unfreeze_refs(st } #ifdef CONFIG_NUMA -extern struct page *__page_cache_alloc(gfp_t gfp); +extern struct page *__page_cache_alloc(struct address_space *, pgoff_t, + gfp_t); #else -static inline struct page *__page_cache_alloc(gfp_t gfp) +static inline struct page *__page_cache_alloc(struct address_space *mapping, + pgoff_t off, gfp_t gfp) { - return alloc_pages(gfp, 0); + return alloc_pages(gfp, 0); } #endif -static inline struct page *page_cache_alloc(struct address_space *x) +static inline struct page *page_cache_alloc(struct address_space *mapping, + pgoff_t off) { - return __page_cache_alloc(mapping_gfp_mask(x)); + return __page_cache_alloc(mapping, off, mapping_gfp_mask(mapping)); } -static inline struct page 
*page_cache_alloc_cold(struct address_space *x) +static inline struct page *page_cache_alloc_cold(struct address_space *mapping, + pgoff_t off) { - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); + return __page_cache_alloc(mapping, off, mapping_gfp_mask(mapping) | __GFP_COLD); } typedef int filler_t(void *, struct page *); Index: linux-2.6.36-mmotm-101103-1217/fs/splice.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/splice.c +++ linux-2.6.36-mmotm-101103-1217/fs/splice.c @@ -349,7 +349,7 @@ __generic_file_splice_read(struct file * /* * page didn't exist, allocate one. */ - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc_cold(mapping, index); if (!page) break; Index: linux-2.6.36-mmotm-101103-1217/mm/readahead.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/readahead.c +++ linux-2.6.36-mmotm-101103-1217/mm/readahead.c @@ -174,7 +174,7 @@ __do_page_cache_readahead(struct address if (page) continue; - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc_cold(mapping, page_offset); if (!page) break; page->index = page_offset; Index: linux-2.6.36-mmotm-101103-1217/fs/ntfs/file.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/ntfs/file.c +++ linux-2.6.36-mmotm-101103-1217/fs/ntfs/file.c @@ -415,7 +415,7 @@ static inline int __ntfs_grab_cache_page pages[nr] = find_lock_page(mapping, index); if (!pages[nr]) { if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); + *cached_page = page_cache_alloc(mapping, index); if (unlikely(!*cached_page)) { err = -ENOMEM; goto err_out; Index: linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/mempolicy.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h @@ 
-109,6 +109,8 @@ struct mempolicy { } w; }; +extern struct mempolicy default_policy; + /* * vma memory policy flags */ @@ -191,6 +193,7 @@ extern void mpol_rebind_task(struct task extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern void mpol_fix_fork_child_flag(struct task_struct *p); +extern struct mempolicy *get_file_policy(struct address_space *, pgoff_t); extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); @@ -321,6 +324,11 @@ static inline bool mempolicy_nodemask_in return false; } +static inline struct mempolicy *get_file_policy(struct address_space *mapping, pgoff_t off) +{ + return NULL; +} + static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 
+ */ +struct mempolicy *get_file_policy(struct address_space *mapping, pgoff_t pgoff) +{ + struct shared_policy *sp = mapping->spolicy; + struct mempolicy *pol = NULL; + + if (unlikely(sp)) + pol = mpol_shared_policy_lookup(sp, pgoff); + else if (likely(current)) + pol = current->mempolicy; + if (likely(!pol)) + pol = &default_policy; + return pol; +} + +/** + * get_vma_policy * @task - task for fallback if vma policy == default * @vma - virtual memory area whose policy is sought * @addr - address in @vma for shared policy lookup Index: linux-2.6.36-mmotm-101103-1217/fs/cachefiles/rdwr.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/cachefiles/rdwr.c +++ linux-2.6.36-mmotm-101103-1217/fs/cachefiles/rdwr.c @@ -258,7 +258,8 @@ static int cachefiles_read_backing_file_ goto backing_page_already_present; if (!newpage) { - newpage = page_cache_alloc_cold(bmapping); + newpage = page_cache_alloc_cold(bmapping, + netpage->index); if (!newpage) goto nomem_monitor; } @@ -500,7 +501,8 @@ static int cachefiles_read_backing_file( goto backing_page_already_present; if (!newpage) { - newpage = page_cache_alloc_cold(bmapping); + newpage = page_cache_alloc_cold(bmapping, + netpage->index); if (!newpage) goto nomem; } Index: linux-2.6.36-mmotm-101103-1217/fs/btrfs/compression.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/btrfs/compression.c +++ linux-2.6.36-mmotm-101103-1217/fs/btrfs/compression.c @@ -474,8 +474,8 @@ static noinline int add_ra_bio_pages(str goto next; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & - ~__GFP_FS); + page = __page_cache_alloc(mapping, page_index, + mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) break; Index: linux-2.6.36-mmotm-101103-1217/net/ceph/messenger.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/net/ceph/messenger.c +++ 
linux-2.6.36-mmotm-101103-1217/net/ceph/messenger.c @@ -2111,7 +2111,7 @@ struct ceph_messenger *ceph_messenger_cr /* the zero page is needed if a request is "canceled" while the message * is being written over the socket */ - msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); + msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!msgr->zero_page) { kfree(msgr); return ERR_PTR(-ENOMEM); Index: linux-2.6.36-mmotm-101103-1217/net/ceph/pagelist.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/net/ceph/pagelist.c +++ linux-2.6.36-mmotm-101103-1217/net/ceph/pagelist.c @@ -33,7 +33,7 @@ static int ceph_pagelist_addpage(struct struct page *page; if (!pl->num_pages_free) { - page = __page_cache_alloc(GFP_NOFS); + page = alloc_page(GFP_NOFS); } else { page = list_first_entry(&pl->free_list, struct page, lru); list_del(&page->lru); @@ -85,7 +85,7 @@ int ceph_pagelist_reserve(struct ceph_pa space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */ while (space > pl->num_pages_free) { - struct page *page = __page_cache_alloc(GFP_NOFS); + struct page *page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; list_add_tail(&page->lru, &pl->free_list); Index: linux-2.6.36-mmotm-101103-1217/net/ceph/pagevec.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/net/ceph/pagevec.c +++ linux-2.6.36-mmotm-101103-1217/net/ceph/pagevec.c @@ -69,7 +69,7 @@ struct page **ceph_alloc_page_vector(int if (!pages) return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++) { - pages[i] = __page_cache_alloc(flags); + pages[i] = alloc_page(flags); if (pages[i] == NULL) { ceph_release_page_vector(pages, i); return ERR_PTR(-ENOMEM); -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html