Shared Policy Infrastructure - Factor alloc_page_pol routine Implement alloc_page_pol() to allocate a page given a policy and an offset [for interleaving]. Neither vma nor addr is needed. This function will be used to allocate page_cache pages--e.g., for tmpfs files--given the policy at a given page offset, simplifying the shmem page allocation functions. Revise alloc_page_vma() to simply call alloc_page_pol() after looking up the vma policy, to eliminate duplicate code. This change rippled into the interleaving functions. I was able to eliminate interleave_nid() by computing the offset at the call sites where it was not already available and calling [modified] offset_il_node() directly. Removed the vma arg from offset_il_node(), as it wasn't used and is not available when called from alloc_page_pol(). Note: re: alloc_page_vma() -- can be called w/ vma == NULL via read_swap_cache_async() from try_to_unuse(). Can't compute a page offset in this case. This means that pages read by "swapoff" don't/can't follow vma policy. This is current behavior. Similarly, swapin readahead reads multiple swap pages, almost certainly associated with different tasks, with the vma from the first swap page read--the one that caused the fault. Again, we can't compute the correct policy for those pages, and this is current behavior. 
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> include/linux/gfp.h | 3 + include/linux/mm.h | 12 ++++- mm/hugetlb.c | 10 ++++ mm/mempolicy.c | 107 ++++++++++++++++++++++++++++------------------------ 4 files changed, 81 insertions(+), 51 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/include/linux/gfp.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/gfp.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/gfp.h @@ -327,10 +327,13 @@ alloc_pages(gfp_t gfp_mask, unsigned int } extern struct page *alloc_page_vma(gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr); +struct mempolicy; +extern struct page *alloc_page_pol(gfp_t, struct mempolicy *, pgoff_t); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) #define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#define alloc_page_pol(gfp_mask, pol, off) alloc_pages(gfp_mask, 0) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) Index: linux-2.6.36-mmotm-101103-1217/include/linux/mm.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/mm.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/mm.h @@ -1235,15 +1235,23 @@ extern void setup_per_cpu_pageset(void); /* * Address to offset for policy lookup and interleave calculation. - * Placed here because it needs struct vma definition. + * Placed here because it needs struct vma definition and we + * can't easily include mm.h in mempolicy.h, nor can we include + * hugetlb.h here. Thus, the extern below. 
*/ static inline pgoff_t vma_mpol_pgoff(struct vm_area_struct *vma, unsigned long addr) { + extern pgoff_t vma_huge_mpol_offset(struct vm_area_struct *, + unsigned long); + + if (unlikely(vma->vm_flags & VM_HUGETLB)) + return vma_huge_mpol_offset(vma, addr); + return ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; } -static inline pgoff_t vma_mpol_addr(struct vm_area_struct *vma, +static inline unsigned long vma_mpol_addr(struct vm_area_struct *vma, pgoff_t pgoff) { return ((pgoff - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c +++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c @@ -35,6 +35,7 @@ * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. * +//TODO: following needs paragraph rewording. haven't figured out what to say. * The process policy is applied for most non interrupt memory allocations * in that process' context. Interrupts ignore the policies and always * try to allocate on the local CPU. The VMA policy is only applied for memory @@ -50,15 +51,15 @@ * Same with GFP_DMA allocations. * * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between - * all users and remembered even when nobody has memory mapped. + * all users and remembered even when nobody has memory mapped. Shared + * policies handle mempolicies on sub-ranges of the object using a + * red/black tree. These policies persist until explicitly removed or + * the backing file is destroyed. */ /* Notebook: - fix mmap readahead to honour policy and enable policy for any page cache - object statistics for bigpages - global policy for page cache? currently it uses process policy. Requires - first item above. + global policy for page cache? handle mremap for shared memory (currently ignored for the policy) grows down? make bind policy root only? 
It can trigger oom much faster and the @@ -1671,9 +1672,10 @@ unsigned slab_node(struct mempolicy *pol } } -/* Do static interleaving for a VMA with known offset. */ -static unsigned offset_il_node(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long off) +/* + * Do static interleaving for a policy with known offset. + */ +static unsigned offset_il_node(struct mempolicy *pol, pgoff_t off) { unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; @@ -1691,28 +1693,6 @@ static unsigned offset_il_node(struct me return nid; } -/* Determine a node number for interleave */ -static inline unsigned interleave_nid(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long addr, int shift) -{ - if (vma) { - unsigned long off; - - /* - * for small pages, there is no difference between - * shift and PAGE_SHIFT, so the bit-shift is safe. - * for huge pages, since vm_pgoff is in units of small - * pages, we need to shift off the always 0 bits to get - * a useful offset. - */ - BUG_ON(shift < PAGE_SHIFT); - off = vma->vm_pgoff >> (shift - PAGE_SHIFT); - off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, vma, off); - } else - return interleave_nodes(pol); -} - #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) @@ -1739,8 +1719,9 @@ struct zonelist *huge_zonelist(struct vm *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { - zl = node_zonelist(interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))), gfp_flags); + zl = node_zonelist( + offset_il_node(*mpol, vma_mpol_pgoff(vma, addr)), + gfp_flags); } else { zl = policy_zonelist(gfp_flags, *mpol); if ((*mpol)->mode == MPOL_BIND) @@ -1859,31 +1840,27 @@ static struct page *alloc_page_interleav } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_page_pol() -- allocate a page based on policy,offset. * - * @gfp: + * @gfp - gfp mask [flags + zone] for allocation * %GFP_USER user allocation. 
* %GFP_KERNEL kernel allocations, * %GFP_HIGHMEM highmem/user allocations, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual Address of the allocation. Must be inside the VMA. + * @pol - policy to use for allocation + * @pgoff - page offset for interleaving -- used only if interleave policy * - * This function allocates a page from the kernel page pool and applies - * a NUMA policy associated with the VMA or the current process. - * When VMA is not NULL caller must hold down_read on the mmap_sem of the - * mm_struct of the VMA to prevent it from going away. Should be used for - * all allocations for pages that will be mapped into - * user space. Returns NULL when no page can be allocated. + * This function allocates a page from the kernel page pool and applies + * the NUMA memory policy @pol, possibly indexed by @pgoff. Should be + * used for all allocations of pages that will be mapped into + * user space. Returns NULL when no page can be allocated. * - * Should be called with the mm_sem of the vma hold. + * Note: extra ref on shared policies dropped on return. 
*/ -struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +struct page *alloc_page_pol(gfp_t gfp, struct mempolicy *pol, pgoff_t pgoff) { - struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; struct page *page; @@ -1891,7 +1868,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); + nid = offset_il_node(pol, pgoff); mpol_cond_put(pol); page = alloc_page_interleave(gfp, 0, nid); put_mems_allowed(); @@ -1902,8 +1879,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, - zl, policy_nodemask(gfp, pol)); + struct page *page = __alloc_pages_nodemask(gfp, 0, zl, + policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); return page; @@ -1915,6 +1892,38 @@ alloc_page_vma(gfp_t gfp, struct vm_area put_mems_allowed(); return page; } +EXPORT_SYMBOL(alloc_page_pol); + +/** + * alloc_page_vma - Allocate a page for a VMA. + * + * @gfp: + * %GFP_USER user allocation. + * %GFP_KERNEL kernel allocations, + * %GFP_HIGHMEM highmem/user allocations, + * %GFP_FS allocation should not call back into a file system. + * %GFP_ATOMIC don't sleep. + * + * @vma: Pointer to VMA or NULL if not available. + * @addr: Virtual Address of the allocation. Must be inside the VMA. + * + * This function allocates a page from the kernel page pool and applies + * a NUMA policy associated with the VMA or the current process. + * When VMA is not NULL caller must hold down_read on the mmap_sem of the + * mm_struct of the VMA to prevent it from going away. Should be used for + * all allocations for anonymous pages that will be mapped into + * user space. Returns NULL when no page can be allocated. 
+ */ +struct page * +alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +{ + struct mempolicy *pol = get_vma_policy(current, vma, addr); + pgoff_t pgoff = 0; + + if (likely(vma)) + pgoff = vma_mpol_pgoff(vma, addr); + return alloc_page_pol(gfp, pol, pgoff); +} /** * alloc_pages_current - Allocate pages. Index: linux-2.6.36-mmotm-101103-1217/mm/hugetlb.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/hugetlb.c +++ linux-2.6.36-mmotm-101103-1217/mm/hugetlb.c @@ -230,6 +230,16 @@ pgoff_t linear_hugepage_index(struct vm_ } /* + * As above, given just vma and address. + * For computing huge page offset for interleave mempolicy + */ +pgoff_t vma_huge_mpol_offset(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + +/* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. */ -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html