Shared Policy Infrastructure - use shared policy for page cache allocations This patch implements a "get_file_policy()" function, analogous to get_vma_policy(), but for a given file[inode/mapping] at a specified offset, using the shared_policy, if any, in the file's address_space. If no shared policy, returns the process policy of the argument task [to match get_vma_policy() args] or default policy, if no process policy. Note that for a file policy to exist [on other than shmem segments] the file must currently be mmap()ed into a task's address space with MAP_SHARED, with the policy installed via mbind(). A later patch will hook up the generic file mempolicy vm_ops and define a per cpuset control file to enable this semantic. Default will be same as current behavior-- no policy on shared file mapping. Details: Revert [__]page_cache_alloc() to take mapping argument as it once used to. I need that to locate the shared policy. Add pgoff_t argument. Fix up page_cache_alloc() and page_cache_alloc_cold() in pagemap.h and all direct callers of __page_cache_alloc() accordingly. Modify __page_cache_alloc() to use get_file_policy() and alloc_page_pol(). Again, without generic file mempolicy, this behaves the same as alloc_page_current(). page_cache_alloc*() now take an additional offset/index argument, available at all call sites, to lookup the appropriate policy and to compute interleave node for interleave policy. This patch fixes all in-tree users of the modified interfaces. Re: interaction with cpusets page spread: if the file has a shared policy structure attached, that policy takes precedence over spreading. Now that we have get_file_policy() and alloc_page_pol(), we can eliminate another case of a pseudo-vma on the stack and use the new infrastructure to allocate shmem pages. This will be done in subsequent patches. Re: ceph fs calls to __page_cache_alloc(): these are the only calls where an inode/mapping and page offset/index are not available. 
As such, they don't seem to be bona fide page cache allocations. So, I've replaced them with direct calls to alloc_page() as this is what page_cache_alloc() evaluated to before this series. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> fs/btrfs/compression.c | 4 +-- fs/cachefiles/rdwr.c | 6 +++-- fs/ntfs/file.c | 2 - fs/splice.c | 2 - include/linux/mempolicy.h | 8 +++++++ include/linux/pagemap.h | 18 ++++++++++------ mm/filemap.c | 50 +++++++++++++++++++++++++++++++++++----------- mm/mempolicy.c | 25 +++++++++++++++++++++-- mm/readahead.c | 2 - net/ceph/messenger.c | 2 - net/ceph/pagelist.c | 4 +-- net/ceph/pagevec.c | 2 - 12 files changed, 94 insertions(+), 31 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/mm/filemap.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/filemap.c +++ linux-2.6.36-mmotm-101103-1217/mm/filemap.c @@ -35,6 +35,8 @@ #include <linux/memcontrol.h> #include <linux/mm_inline.h> /* for page_is_file_cache() */ #include <linux/cleancache.h> +#include <linux/mempolicy.h> + #include "internal.h" /* @@ -472,19 +474,43 @@ int add_to_page_cache_lru(struct page *p EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA -struct page *__page_cache_alloc(gfp_t gfp) +/** + * __page_cache_alloc - allocate a page cache page + * @mapping - address_space for which page will be allocated + * @pgoff - page index in mapping -- for mem policy + * @gfp - gfp flags + * + * If the mapping does not contain a shared policy, and page cache spreading + * is enabled for the current context's cpuset, allocate a page from the node + * indicated by page cache spreading. + * + * Otherwise, fetch the memory policy at the indicated pgoff and allocate + * a page according to that policy. Note that if the mapping does not + * have a shared policy, the allocation will use the task policy, if any, + * else the system default policy. + * + * All allocations will use the specified gfp mask. 
+ */ +struct page *__page_cache_alloc(struct address_space *mapping, pgoff_t pgoff, + gfp_t gfp) { - int n; + struct mempolicy *pol; struct page *page; + int n; - if (cpuset_do_page_mem_spread()) { + /* + * Consider spreading only if no shared_policy + */ + if (!mapping->spolicy && cpuset_do_page_mem_spread()) { get_mems_allowed(); n = cpuset_mem_spread_node(); page = alloc_pages_exact_node(n, gfp, 0); put_mems_allowed(); return page; - } - return alloc_pages(gfp, 0); + } else + pol = get_file_policy(mapping, pgoff); + + return alloc_page_pol(gfp, pol, pgoff); } EXPORT_SYMBOL(__page_cache_alloc); #endif @@ -733,7 +759,7 @@ struct page *find_or_create_page(struct repeat: page = find_lock_page(mapping, index); if (!page) { - page = __page_cache_alloc(gfp_mask); + page = __page_cache_alloc(mapping, index, gfp_mask); if (!page) return NULL; /* @@ -951,7 +977,8 @@ grab_cache_page_nowait(struct address_sp page_cache_release(page); return NULL; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + page = __page_cache_alloc(mapping, index, + mapping_gfp_mask(mapping) & ~__GFP_FS); if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { page_cache_release(page); page = NULL; @@ -1182,7 +1209,7 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. 
*/ - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc_cold(mapping, index); if (!page) { desc->error = -ENOMEM; goto out; } @@ -1440,7 +1467,7 @@ static int page_cache_read(struct file * int ret; do { - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc_cold(mapping, offset); if (!page) return -ENOMEM; @@ -1709,7 +1736,7 @@ static struct page *__read_cache_page(st repeat: page = find_get_page(mapping, index); if (!page) { - page = __page_cache_alloc(gfp | __GFP_COLD); + page = __page_cache_alloc(mapping, index, gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); @@ -2234,7 +2261,8 @@ repeat: if (likely(page)) return page; - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); + page = __page_cache_alloc(mapping, index, + mapping_gfp_mask(mapping) & ~gfp_notmask); if (!page) return NULL; status = add_to_page_cache_lru(page, mapping, index, Index: linux-2.6.36-mmotm-101103-1217/include/linux/pagemap.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/pagemap.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/pagemap.h @@ -201,22 +201,26 @@ static inline void page_unfreeze_refs(st } #ifdef CONFIG_NUMA -extern struct page *__page_cache_alloc(gfp_t gfp); +extern struct page *__page_cache_alloc(struct address_space *, pgoff_t, + gfp_t); #else -static inline struct page *__page_cache_alloc(gfp_t gfp) +static inline struct page *__page_cache_alloc(struct address_space *mapping, + pgoff_t off, gfp_t gfp) { - return alloc_pages(gfp, 0); + return alloc_pages(gfp, 0); } #endif -static inline struct page *page_cache_alloc(struct address_space *x) +static inline struct page *page_cache_alloc(struct address_space *mapping, + pgoff_t off) { - return __page_cache_alloc(mapping_gfp_mask(x)); + return __page_cache_alloc(mapping, off, mapping_gfp_mask(mapping)); } -static inline struct page 
*page_cache_alloc_cold(struct address_space *x) +static inline struct page *page_cache_alloc_cold(struct address_space *mapping, + pgoff_t off) { - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); + return __page_cache_alloc(mapping, off, mapping_gfp_mask(mapping) | __GFP_COLD); } typedef int filler_t(void *, struct page *); Index: linux-2.6.36-mmotm-101103-1217/fs/splice.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/splice.c +++ linux-2.6.36-mmotm-101103-1217/fs/splice.c @@ -349,7 +349,7 @@ __generic_file_splice_read(struct file * /* * page didn't exist, allocate one. */ - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc_cold(mapping, index); if (!page) break; Index: linux-2.6.36-mmotm-101103-1217/mm/readahead.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/readahead.c +++ linux-2.6.36-mmotm-101103-1217/mm/readahead.c @@ -174,7 +174,7 @@ __do_page_cache_readahead(struct address if (page) continue; - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc_cold(mapping, page_offset); if (!page) break; page->index = page_offset; Index: linux-2.6.36-mmotm-101103-1217/fs/ntfs/file.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/ntfs/file.c +++ linux-2.6.36-mmotm-101103-1217/fs/ntfs/file.c @@ -415,7 +415,7 @@ static inline int __ntfs_grab_cache_page pages[nr] = find_lock_page(mapping, index); if (!pages[nr]) { if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); + *cached_page = page_cache_alloc(mapping, index); if (unlikely(!*cached_page)) { err = -ENOMEM; goto err_out; Index: linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/mempolicy.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/mempolicy.h @@ 
-109,6 +109,8 @@ struct mempolicy { } w; }; +extern struct mempolicy default_policy; + /* * vma memory policy flags */ @@ -191,6 +193,7 @@ extern void mpol_rebind_task(struct task extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern void mpol_fix_fork_child_flag(struct task_struct *p); +extern struct mempolicy *get_file_policy(struct address_space *, pgoff_t); extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); @@ -321,6 +324,11 @@ static inline bool mempolicy_nodemask_in return false; } +static inline struct mempolicy *get_file_policy(struct address_space *mapping, pgoff_t off) +{ + return NULL; +} + static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 
+ */ +struct mempolicy *get_file_policy(struct address_space *mapping, pgoff_t pgoff) +{ + struct shared_policy *sp = mapping->spolicy; + struct mempolicy *pol = NULL; + + if (unlikely(sp)) + pol = mpol_shared_policy_lookup(sp, pgoff); + else if (likely(current)) + pol = current->mempolicy; + if (likely(!pol)) + pol = &default_policy; + return pol; +} + +/** + * get_vma_policy * @task - task for fallback if vma policy == default * @vma - virtual memory area whose policy is sought * @addr - address in @vma for shared policy lookup Index: linux-2.6.36-mmotm-101103-1217/fs/cachefiles/rdwr.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/cachefiles/rdwr.c +++ linux-2.6.36-mmotm-101103-1217/fs/cachefiles/rdwr.c @@ -258,7 +258,8 @@ static int cachefiles_read_backing_file_ goto backing_page_already_present; if (!newpage) { - newpage = page_cache_alloc_cold(bmapping); + newpage = page_cache_alloc_cold(bmapping, + netpage->index); if (!newpage) goto nomem_monitor; } @@ -500,7 +501,8 @@ static int cachefiles_read_backing_file( goto backing_page_already_present; if (!newpage) { - newpage = page_cache_alloc_cold(bmapping); + newpage = page_cache_alloc_cold(bmapping, + netpage->index); if (!newpage) goto nomem; } Index: linux-2.6.36-mmotm-101103-1217/fs/btrfs/compression.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/btrfs/compression.c +++ linux-2.6.36-mmotm-101103-1217/fs/btrfs/compression.c @@ -474,8 +474,8 @@ static noinline int add_ra_bio_pages(str goto next; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & - ~__GFP_FS); + page = __page_cache_alloc(mapping, page_index, + mapping_gfp_mask(mapping) & ~__GFP_FS); if (!page) break; Index: linux-2.6.36-mmotm-101103-1217/net/ceph/messenger.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/net/ceph/messenger.c +++ 
linux-2.6.36-mmotm-101103-1217/net/ceph/messenger.c @@ -2111,7 +2111,7 @@ struct ceph_messenger *ceph_messenger_cr /* the zero page is needed if a request is "canceled" while the message * is being written over the socket */ - msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO); + msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!msgr->zero_page) { kfree(msgr); return ERR_PTR(-ENOMEM); Index: linux-2.6.36-mmotm-101103-1217/net/ceph/pagelist.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/net/ceph/pagelist.c +++ linux-2.6.36-mmotm-101103-1217/net/ceph/pagelist.c @@ -33,7 +33,7 @@ static int ceph_pagelist_addpage(struct struct page *page; if (!pl->num_pages_free) { - page = __page_cache_alloc(GFP_NOFS); + page = alloc_page(GFP_NOFS); } else { page = list_first_entry(&pl->free_list, struct page, lru); list_del(&page->lru); @@ -85,7 +85,7 @@ int ceph_pagelist_reserve(struct ceph_pa space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */ while (space > pl->num_pages_free) { - struct page *page = __page_cache_alloc(GFP_NOFS); + struct page *page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; list_add_tail(&page->lru, &pl->free_list); Index: linux-2.6.36-mmotm-101103-1217/net/ceph/pagevec.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/net/ceph/pagevec.c +++ linux-2.6.36-mmotm-101103-1217/net/ceph/pagevec.c @@ -69,7 +69,7 @@ struct page **ceph_alloc_page_vector(int if (!pages) return ERR_PTR(-ENOMEM); for (i = 0; i < num_pages; i++) { - pages[i] = __page_cache_alloc(flags); + pages[i] = alloc_page(flags); if (pages[i] == NULL) { ceph_release_page_vector(pages, i); return ERR_PTR(-ENOMEM); -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html