Shared Policy Infrastructure - define mapped file policy persistence model

This patch starts the process of supporting optional shared policy on shared memory mapped files. Mapped file policy applies to a range of a linearly memory mapped file mmap()ed with the MAP_SHARED flag. The mapping serves as a linear window onto the mapped range. Retain the shared policy until the last shared mapping is removed, so that cached files do not retain policies installed by defunct applications. Use RCU deferred free to close a possible race between the last shared mapper removing the shared policy and non-mmap page cache access. Shmem segments [including SHM_HUGETLB segments] look like shared mapped files to the shared policy infrastructure. The policy persistence model for shmem segments is that once a shared policy is applied, it remains as long as the segment exists. To retain this behavior, define a shared policy persistence flag--SPOL_F_PERSIST--and set this flag when allocating a shared policy for a shmem segment. Now we can push the freeing of any persistent shmem/hugetlbfs shared policy, which happens when the segment is deleted, down into the fs-independent inode cleanup path.
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> fs/hugetlbfs/inode.c | 1 fs/inode.c | 7 ++++ include/linux/shared_policy.h | 11 ++++-- mm/mempolicy.c | 70 ++++++++++++++++++++++++++++++++---------- mm/mmap.c | 11 ++++++ mm/shmem.c | 5 --- 6 files changed, 81 insertions(+), 24 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/fs/hugetlbfs/inode.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/hugetlbfs/inode.c +++ linux-2.6.36-mmotm-101103-1217/fs/hugetlbfs/inode.c @@ -663,7 +663,6 @@ static struct inode *hugetlbfs_alloc_ino static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); - mpol_free_shared_policy(inode->i_mapping); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } Index: linux-2.6.36-mmotm-101103-1217/fs/inode.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/inode.c +++ linux-2.6.36-mmotm-101103-1217/fs/inode.c @@ -25,6 +25,7 @@ #include <linux/async.h> #include <linux/posix_acl.h> #include <linux/ima.h> +#include <linux/shared_policy.h> /* * This is needed for the following functions: @@ -305,6 +306,12 @@ void inode_init_once(struct inode *inode #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&inode->i_fsnotify_marks); #endif + /* + * free any shared policy + */ + if ((inode->i_mode & S_IFMT) == S_IFREG) + mpol_free_shared_policy(inode->i_mapping); + } EXPORT_SYMBOL(inode_init_once); Index: linux-2.6.36-mmotm-101103-1217/mm/shmem.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/shmem.c +++ linux-2.6.36-mmotm-101103-1217/mm/shmem.c @@ -1516,6 +1516,7 @@ int shmem_set_policy(struct vm_area_stru if (IS_ERR(sp)) return PTR_ERR(sp); } + sp->sp_flags |= SPOL_F_PERSIST; return mpol_set_shared_policy(sp, vma_mpol_pgoff(vma, start), (end - start) >> PAGE_SHIFT, new); } @@ -2417,10 +2418,6 @@ static struct 
inode *shmem_alloc_inode(s static void shmem_destroy_inode(struct inode *inode) { - if ((inode->i_mode & S_IFMT) == S_IFREG) { - /* only struct inode is valid if it's an inline symlink */ - mpol_free_shared_policy(inode->i_mapping); - } kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } Index: linux-2.6.36-mmotm-101103-1217/mm/mmap.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/mmap.c +++ linux-2.6.36-mmotm-101103-1217/mm/mmap.c @@ -198,6 +198,17 @@ static void __remove_shared_vm_struct(st if (vma->vm_flags & VM_SHARED) mapping->i_mmap_writable--; + if (!mapping->i_mmap_writable) { + /* + * shared mmap()ed file policy persistence model: + * remove policy when removing last shared mapping, + * unless marked as persistent--e.g., shmem + */ + struct shared_policy *sp = mapping_shared_policy(mapping); + if (sp && !(sp->sp_flags & SPOL_F_PERSIST)) + mpol_free_shared_policy(mapping); + } + flush_dcache_mmap_lock(mapping); if (unlikely(vma->vm_flags & VM_NONLINEAR)) list_del_init(&vma->shared.vm_set.list); Index: linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/shared_policy.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h @@ -4,6 +4,7 @@ #include <linux/fs.h> #include <linux/spinlock.h> #include <linux/rbtree.h> +#include <linux/rcupdate.h> /* * Tree of shared policies for a shared memory regions and memory @@ -25,11 +26,15 @@ struct sp_node { }; struct shared_policy { - struct rb_root root; - spinlock_t lock; /* protects rb tree */ - int nr_sp_nodes; /* for numa_maps */ + struct rb_root root; + spinlock_t lock; /* protects rb tree, nr_sp_nodes */ + int nr_sp_nodes; /* for numa_maps */ + int sp_flags; /* persistence, ... 
*/ + struct rcu_head sp_rcu; /* deferred reclaim */ }; +#define SPOL_F_PERSIST 0x01 /* for shmem use */ + extern int shared_file_policy_default; extern struct shared_policy *mpol_shared_policy_new( Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c +++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c @@ -1572,13 +1572,17 @@ asmlinkage long compat_sys_mbind(compat_ */ struct mempolicy *get_file_policy(struct address_space *mapping, pgoff_t pgoff) { - struct shared_policy *sp = mapping->spolicy; + struct shared_policy *sp; struct mempolicy *pol = NULL; + rcu_read_lock(); + sp = rcu_dereference(mapping->spolicy); if (unlikely(sp)) pol = mpol_shared_policy_lookup(sp, pgoff); else if (likely(current)) pol = current->mempolicy; + rcu_read_unlock(); + if (likely(!pol)) pol = &default_policy; return pol; @@ -2291,6 +2295,10 @@ restart: * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. + * + * Locking: mapping->spolicy stabilized by current->mm->mmap_sem. + * Can't remove last shared mapping while we hold the sem; can't + * remove inode/shared policy while inode is mmap()ed shared. */ struct shared_policy *mpol_shared_policy_new(struct address_space *mapping, struct mempolicy *mpol) @@ -2349,9 +2357,10 @@ put_free: */ spin_lock(&mapping->i_mmap_lock); spx = mapping->spolicy; - if (!spx && !err) - mapping->spolicy = spx = sp; - else + if (!spx && !err) { + spx = sp; + rcu_assign_pointer(mapping->spolicy, sp); + } else err = !0; spin_unlock(&mapping->i_mmap_lock); if (err) @@ -2367,6 +2376,9 @@ put_free: * @sz: size of range [bytes] to which mempolicy applies * @mpol: the mempolicy to install * + * Locking: mapping->spolicy stabilized by current->mm->mmap_sem. 
+ * Can't remove last shared mapping while we hold the sem; can't + * remove inode/shared policy while inode is mmap()ed shared. */ int mpol_set_shared_policy(struct shared_policy *sp, pgoff_t pgoff, unsigned long sz, @@ -2394,37 +2406,63 @@ int mpol_set_shared_policy(struct shared /** * mpol_free_shared_policy() - Free a backing policy store on inode delete. - * @mapping - address_space struct containing pointer to shared policy to be freed. + * @mapping - address_space struct containing pointer to shared policy to be + * freed. * * Frees the shared policy red-black tree, if any, before freeing the * shared policy struct itself, if any. + + * Locking: only free shared policy on inode deletion [shmem] or + * removal of last shared mmap()ing. Can only delete inode when no + * more references. Removal of last shared mmap()ing protected by + * mmap_sem [and mapping->i_mmap_lock]. Still a potential race with + * shared policy lookups from page cache on behalf of file descriptor + * access to pages. Use deferred RCU to protect readers [in get_file_policy()] + * from shared policy free on removal of last shared mmap()ing. 
*/ -void mpol_free_shared_policy(struct address_space *mapping) +static void __mpol_free_shared_policy(struct rcu_head *rhp) { - struct shared_policy *sp = mapping->spolicy; - struct sp_node *n; + struct shared_policy *sp = container_of(rhp, struct shared_policy, + sp_rcu); struct rb_node *next; - if (!sp) - return; - - mapping->spolicy = NULL; - + /* + * Now, we can safely tear down the shared policy tree, if any + */ if (sp->root.rb_node) { - spin_lock(&sp->lock); next = rb_first(&sp->root); while (next) { - n = rb_entry(next, struct sp_node, nd); + struct sp_node *n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); rb_erase(&n->nd, &sp->root); mpol_put(n->policy); kmem_cache_free(sn_cache, n); } - spin_unlock(&sp->lock); } kmem_cache_free(sp_cache, sp); } +void mpol_free_shared_policy(struct address_space *mapping) +{ + struct shared_policy *sp = mapping->spolicy; + + if (!sp) + return; + + rcu_assign_pointer(mapping->spolicy, NULL); + + /* + * Presence of 'PERSIST flag means we're freeing the + * shared policy in the inode destruction path. No + * need for RCU synchronization. + */ + if (sp->sp_flags & SPOL_F_PERSIST) + __mpol_free_shared_policy(&sp->sp_rcu); + else + call_rcu(&sp->sp_rcu, __mpol_free_shared_policy); + +} + /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html