Shared Policy Infrastructure - add generic file set/get policy vm ops Add set/get policy vm ops to generic_file_vm_ops in support of mmap()ed file memory policies. This patch effectively "hooks up" shared file mappings to the NUMA shared policy infrastructure. However, a task will only use a shared policy if it has been enabled by the task's cpuset's "shared_file_policy" control file. Default is disabled--i.e., existing behaviors. To ensure that applications do not get surprised by unrelated applications applying shared policy to their files, allow only the owner of a file to apply shared policy. Note that we could make this enforcement conditional on a per-cpuset "shared_policy_enforce_ownership" file. NOTE: may be able to unify with shmem_{get|set}_policy. Updated numa_memory_policy.txt to document this behavior. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> Documentation/vm/numa_memory_policy.txt | 47 ++++++++++++++++++----------- mm/filemap.c | 51 ++++++++++++++++++++++++++++++++ mm/mempolicy.c | 35 +++++++++++++++++---- mm/shmem.c | 4 +- 4 files changed, 110 insertions(+), 27 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/mm/filemap.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/filemap.c +++ linux-2.6.36-mmotm-101103-1217/mm/filemap.c @@ -513,6 +513,47 @@ struct page *__page_cache_alloc(struct a return alloc_page_pol(gfp, pol, pgoff); } EXPORT_SYMBOL(__page_cache_alloc); + +static int generic_file_set_policy(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new) +{ + struct address_space *mapping; + struct shared_policy *sp; + int ret; + + mapping = vma->vm_file->f_mapping; + + /* + * Only owner or privileged task can set shared policy on shared + * regular file mappings. 
+ */ + if (!is_owner_or_cap(mapping->host)) + return -EPERM; + + sp = mapping->spolicy; + if (!sp) { + sp = mpol_shared_policy_new(mapping, NULL); + if (IS_ERR(sp)) + return PTR_ERR(sp); + } + + ret = mpol_set_shared_policy(sp, vma_mpol_pgoff(vma, start), + (end - start) >> PAGE_SHIFT, new); + if (!ret) + mpol_set_vma_nosplit(vma); + return ret; +} + +static struct mempolicy * +generic_file_get_policy(struct vm_area_struct *vma, unsigned long addr) +{ + struct shared_policy *sp = vma->vm_file->f_mapping->spolicy; + if (!sp) + return NULL; + + return mpol_shared_policy_lookup(sp, vma_mpol_pgoff(vma, addr)); +} #endif static int __sleep_on_page_lock(void *word) @@ -1686,6 +1727,10 @@ EXPORT_SYMBOL(filemap_fault); const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, +#ifdef CONFIG_NUMA + .set_policy = generic_file_set_policy, + .get_policy = generic_file_get_policy, +#endif }; /* This is used for a general mmap of a disk file */ @@ -1699,6 +1744,12 @@ int generic_file_mmap(struct file * file file_accessed(file); vma->vm_ops = &generic_file_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; + + /* + * shared policies and non-linear mappings are mutually exclusive + */ + if ((vma->vm_flags & VM_SHARED) && mapping_shared_policy(mapping)) + mpol_set_vma_nosplit(vma); return 0; } Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c +++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c @@ -638,18 +638,38 @@ check_range(struct mm_struct *mm, unsign return first; } +/* + * helper functions for deciding whether to split vmas on set_policy + * or to use the vma policy op for set/get. Note that we only get + * into these if the vma represents a shared, linear, file mapping, + * including shmem. 
+ */ static bool vma_is_shared_linear(struct vm_area_struct *vma) { return ((vma->vm_flags & (VM_SHARED|VM_NONLINEAR)) == VM_SHARED); } -static bool mpol_nosplit_vma(struct vm_area_struct *vma) +static bool has_set_policy_op(struct vm_area_struct *vma) +{ + return (vma->vm_ops && vma->vm_ops->set_policy); +} + +/* + * We don't split vmas on set_policy if VMPOL_F_NOSPLIT is set or we have + * a shared, linear mapping, AND a set_policy() vm_op. VMPOL_F_NOSPLIT + * will be set for shmem segments and files mmap()ed SHARED if a shared + * policy has previously been applied to this file. + */ +static int mpol_nosplit_vma(struct vm_area_struct *vma) { if (vma->vm_mpol_flags & VMPOL_F_NOSPLIT) return true; - if (vma_is_shared_linear(vma) && - vma->vm_ops && vma->vm_ops->set_policy) { + if (vma_is_shared_linear(vma) && has_set_policy_op(vma) && + shared_file_policy_enabled(current)) { + /* + * short circuit future queries. + */ vma->vm_mpol_flags |= VMPOL_F_NOSPLIT; return true; } @@ -701,7 +721,8 @@ static int policy_vma(struct vm_area_str /* Step 2: apply policy to a range and do splits. 
*/ static int mbind_range(struct mm_struct *mm, unsigned long start, - unsigned long end, struct mempolicy *new_pol) + unsigned long end, struct mempolicy *new_pol, + unsigned long flags) { struct vm_area_struct *next; struct vm_area_struct *prev; @@ -925,7 +946,7 @@ static long do_get_mempolicy(int *policy up_read(&mm->mmap_sem); return -EFAULT; } - if (vma->vm_ops && vma->vm_ops->get_policy) + if (mpol_use_get_op(vma)) pol = vma->vm_ops->get_policy(vma, addr); else pol = vma->vm_policy; @@ -1244,7 +1265,7 @@ static long do_mbind(unsigned long start if (!IS_ERR(vma)) { int nr_failed = 0; - err = mbind_range(mm, start, end, new); + err = mbind_range(mm, start, end, new, flags); if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, @@ -1611,7 +1632,7 @@ struct mempolicy *get_vma_policy(struct if (vma) { /* - * use get_policy op, if any, for shared mappings + * use get_policy op, if applicable, for shared mappings */ if (mpol_use_get_op(vma)) { struct mempolicy *vpol = vma->vm_ops->get_policy(vma, Index: linux-2.6.36-mmotm-101103-1217/mm/shmem.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/shmem.c +++ linux-2.6.36-mmotm-101103-1217/mm/shmem.c @@ -219,7 +219,7 @@ static const struct file_operations shme static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; -static const struct vm_operations_struct shmem_vm_ops; +const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ @@ -2526,7 +2526,7 @@ static const struct super_operations shm .put_super = shmem_put_super, }; -static const struct vm_operations_struct shmem_vm_ops = { +const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, 
Index: linux-2.6.36-mmotm-101103-1217/Documentation/vm/numa_memory_policy.txt =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/Documentation/vm/numa_memory_policy.txt +++ linux-2.6.36-mmotm-101103-1217/Documentation/vm/numa_memory_policy.txt @@ -78,11 +78,10 @@ most general to most specific: VMA policy applies ONLY to anonymous pages. These include pages allocated for anonymous segments, such as the task stack and heap, and any regions of the address space mmap()ed with the MAP_ANONYMOUS flag. - If a VMA policy is applied to a file mapping, it will be ignored if - the mapping used the MAP_SHARED flag. If the file mapping used the - MAP_PRIVATE flag, the VMA policy will only be applied when an - anonymous page is allocated on an attempt to write to the mapping-- - i.e., at Copy-On-Write. + If a VMA policy is applied to a file mapping mapped with the + MAP_PRIVATE flag, the VMA policy will only be applied when an anonymous + page is allocated on an attempt to write to the mapping--i.e., at + Copy-On-Write. VMA policies are shared between all tasks that share a virtual address space--a.k.a. threads--independent of when the policy is installed; and @@ -107,11 +106,16 @@ most general to most specific: mapped shared into one or more tasks' distinct address spaces. An application installs a shared policies the same way as VMA policies--using the mbind() system call specifying a range of virtual addresses that map - the shared object. However, unlike VMA policies, which can be considered - to be an attribute of a range of a task's address space, shared policies - apply directly to the shared object. Thus, all tasks that attach to the - object share the policy, and all pages allocated for the shared object, - by any task, will obey the shared policy. + some range of the shared object. 
However, unlike VMA policies, which can + be considered to be an attribute of a range of a task's address space, + shared policies apply directly to [a range of] the shared object. Thus, + all tasks that attach to the object share the policy, and all pages + allocated for the shared object, by any task, after the policy is installed, + will obey the shared policy. + + If no shared policy exists for a given page offset in a shared object, + allocation will fall back to task policy, and then to system default + policy, like VMA policies. As of 2.6.28, only shared memory segments, created by shmget() or mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy. Prior to @@ -124,12 +128,19 @@ most general to most specific: pages. To preserve existing behavior for applications that might care, this new behavior must be enabled on a per-cpuset basis. - As mentioned above [re: VMA policies], allocations of page cache - pages for regular files mmap()ed with MAP_SHARED ignore any VMA - policy installed on the virtual address range backed by the shared - file mapping. Rather, shared page cache pages, including pages backing - private mappings that have not yet been written by the task, follow - task policy, if any, else System Default Policy. + Prior to 2.6.XX, shared memory policies were not supported on regular + files. Allocations of page cache pages for regular files mmap()ed with + MAP_SHARED ignored any VMA policy installed on the virtual address + range backed by the shared file mapping. Rather, shared page cache + pages, including pages backing private mappings that have not yet been + written by the task, followed task policy, if any, else System Default + Policy. As of 2.6.XX, mbind() will install shared policies on [a range + of] a regular file mmap()ed with MAP_SHARED. 
To minimize unpleasant + surprises for existing applications, only the owner or an appropriately + privileged task may apply a shared policy to a regular file, and the + policy will persist only as long as the file remains mapped in one or + more tasks' virtual address spaces. Further, this new behavior must be + enabled on a per-cpuset basis. The shared policy infrastructure supports different policies on subset ranges of the shared object. However, before Linux 2.6.XX, the kernel @@ -386,8 +397,8 @@ the policy is considered invalid and can The interaction of memory policies and cpusets can be problematic when tasks in two cpusets share access to a memory region, such as shared memory segments -created by shmget() of mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and -any of the tasks install shared policy on the region, only nodes whose +created by shmget() or mmap() with the MAP_ANONYMOUS and/or MAP_SHARED flags, +and any of the tasks install shared policy on the region, only nodes whose memories are allowed in both cpusets may be used in the policies. Since 2.6.26, applications can determine the allowed memories using the get_mempolicy() API with the MPOL_F_MEMS_ALLOWED flag. However, one still -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html