Shared Policy Infrastructure - dynamically alloc shared policies Remove shared policy structs from shmem and hugetlbfs inode info structs and dynamically allocate them as needed. Make shared policy pointer in address_space dependent on CONFIG_NUMA to avoid burdening configs that don't need/want NUMA support. Access [get/set] the shared_policy via wrappers that also depend on CONFIG_NUMA [to avoid excessive #ifdef in .c files]. Initialize shmem and hugetlbfs inode/address_space spolicy pointer to null, unless superblock [mount] specifies a non-default policy. Null shared policy pointer will cause 'get policy'--e.g., for page allocations--to fall back to task policy, if any, else to system default policy. Just like NULL vma policies. set_policy() ops must create shared_policy struct from a new kmem cache when a new policy is installed and no spolicy exists. mpol_shared_policy_init() replaced with mpol_shared_policy_new() to accomplish this. shmem must create/initialize a shared_policy when it allocates an inode if the tmpfs super-block/mount point specifies a non-default policy. mpol_free_shared_policy() must free the spolicy, if any, when inode is destroyed. NOTE: along with the previous patch in the series, this patch adds a single pointer to the generic address_space struct and thus to all inodes. Last I looked, this did not decrease the inodes/slab for x86_64 [and ia64 FWIW]. Other arches need to be checked. If this extra pointer is problematic, I believe we could overload the non-linear pointer and use as_flags to detect. 
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> fs/hugetlbfs/inode.c | 19 ++---- fs/inode.c | 1 include/linux/fs.h | 27 ++++++++ include/linux/hugetlb.h | 1 include/linux/shared_policy.h | 24 ++++--- include/linux/shmem_fs.h | 1 mm/mempolicy.c | 128 ++++++++++++++++++++++++++++-------------- mm/shmem.c | 49 ++++++++++------ 8 files changed, 168 insertions(+), 82 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/shared_policy.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h @@ -1,6 +1,7 @@ #ifndef _LINUX_SHARED_POLICY_H #define _LINUX_SHARED_POLICY_H 1 +#include <linux/fs.h> #include <linux/spinlock.h> #include <linux/rbtree.h> @@ -28,13 +29,15 @@ struct shared_policy { spinlock_t lock; /* protects rb tree */ }; -void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); -int mpol_set_shared_policy(struct shared_policy *, - struct vm_area_struct *, - struct mempolicy *); -void mpol_free_shared_policy(struct shared_policy *); -struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *, - unsigned long); +extern struct shared_policy *mpol_shared_policy_new( + struct address_space *mapping, + struct mempolicy *mpol); +extern int mpol_set_shared_policy(struct shared_policy *, + struct vm_area_struct *, + struct mempolicy *); +extern void mpol_free_shared_policy(struct address_space *); +extern struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *, + unsigned long); #else /* !NUMA */ @@ -47,12 +50,12 @@ static inline int mpol_set_shared_policy return -EINVAL; } -static inline void mpol_shared_policy_init(struct shared_policy *sp, - struct mempolicy *mpol) +static inline struct shared_policy * +mpol_shared_policy_new(struct address_space *mapping, struct mempolicy *mpol) { } -static inline void mpol_free_shared_policy(struct shared_policy *p) +static 
inline void mpol_free_shared_policy(struct shared_policy *sp) { } @@ -61,6 +64,7 @@ mpol_shared_policy_lookup(struct shared_ { return NULL; } + #endif #endif /* _LINUX_SHARED_POLICY_H */ Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c +++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c @@ -102,6 +102,7 @@ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ static struct kmem_cache *policy_cache; +static struct kmem_cache *sp_cache; static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not @@ -2137,52 +2138,86 @@ restart: } /** - * mpol_shared_policy_init - initialize shared policy for inode - * @sp: pointer to inode shared policy - * @mpol: struct mempolicy to install + * mpol_shared_policy_new - allocate and initialize a shared policy struct + * @mpol: struct mempolicy to install, if non-NULL == tmpfs mount point + * mempolicy. * - * Install non-NULL @mpol in inode's shared policy rb-tree. + * Allocate a new shared policy structure and install non-NULL @mpol. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. 
*/ -void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) +struct shared_policy *mpol_shared_policy_new(struct address_space *mapping, + struct mempolicy *mpol) { - int ret; - - sp->root = RB_ROOT; /* empty tree == default mempolicy */ - spin_lock_init(&sp->lock); + struct shared_policy *sp, *spx; + struct mempolicy *new = NULL; + int err = 0; if (mpol) { - struct vm_area_struct pvma; - struct mempolicy *new; NODEMASK_SCRATCH(scratch); - if (!scratch) - goto put_mpol; - /* contextualize the tmpfs mount point mempolicy */ + if (!scratch) { + sp = ERR_PTR(-ENOMEM); + err = !0; + goto put_free; + } + sp = mapping->spolicy; + /* + * Contextualize the tmpfs mount point mempolicy. Ensure that + * we have a good mempolicy before allocating new shared policy. + */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) - goto free_scratch; /* no valid nodemask intersection */ + err = IS_ERR(new); + if (err) + goto put_free; task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); + err = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); - if (ret) - goto put_new; +put_free: + mpol_put(mpol); /* drop our ref on sb mpol */ + NODEMASK_SCRATCH_FREE(scratch); /* scratch may be NULL */ + if (err) { + mpol_put(new); /* free bogus new mpol */ + return sp; + } + } - /* Create pseudo-vma that contains just the policy */ + sp = kmem_cache_alloc(sp_cache, GFP_KERNEL); + if (!sp) { + mpol_put(new); + return ERR_PTR(-ENOMEM); + } + sp->root = RB_ROOT; /* empty tree == default mempolicy */ + spin_lock_init(&sp->lock); + + if (new) { + /* + * Create pseudo-vma to specify policy range and + * install new mempolicy + */ + struct vm_area_struct pvma; memset(&pvma, 0, sizeof(struct vm_area_struct)); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ - mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ - -put_new: + err = mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ 
mpol_put(new); /* drop initial ref */ -free_scratch: - NODEMASK_SCRATCH_FREE(scratch); -put_mpol: - mpol_put(mpol); /* drop our incoming ref on sb mpol */ } + + /* + * resolve potential set/set race; handle 'set' error + */ + spin_lock(&mapping->i_mmap_lock); + spx = mapping->spolicy; + if (!spx && !err) + mapping->spolicy = spx = sp; + else + err = !0; + spin_unlock(&mapping->i_mmap_lock); + if (err) + kmem_cache_free(sp_cache, sp); + + return spx; } int mpol_set_shared_policy(struct shared_policy *sp, @@ -2211,28 +2246,35 @@ int mpol_set_shared_policy(struct shared /** * mpol_free_shared_policy() - Free a backing policy store on inode delete. - * @sp - shared policy structure to free + * @mapping - address_space struct containing pointer to shared policy to be freed. * * Frees the shared policy red-black tree, if any, before freeing the - * shared policy struct itself. + * shared policy struct itself, if any. */ -void mpol_free_shared_policy(struct shared_policy *sp) +void mpol_free_shared_policy(struct address_space *mapping) { + struct shared_policy *sp = mapping->spolicy; struct sp_node *n; struct rb_node *next; - if (!sp->root.rb_node) - return; - spin_lock(&sp->lock); - next = rb_first(&sp->root); - while (next) { - n = rb_entry(next, struct sp_node, nd); - next = rb_next(&n->nd); - rb_erase(&n->nd, &sp->root); - mpol_put(n->policy); - kmem_cache_free(sn_cache, n); + if (!sp) + return; + + mapping->spolicy = NULL; + + if (sp->root.rb_node) { + spin_lock(&sp->lock); + next = rb_first(&sp->root); + while (next) { + n = rb_entry(next, struct sp_node, nd); + next = rb_next(&n->nd); + rb_erase(&n->nd, &sp->root); + mpol_put(n->policy); + kmem_cache_free(sn_cache, n); + } + spin_unlock(&sp->lock); } - spin_unlock(&sp->lock); + kmem_cache_free(sp_cache, sp); } /* assumes fs == KERNEL_DS */ @@ -2246,6 +2288,10 @@ void __init numa_policy_init(void) sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); + sp_cache = kmem_cache_create("shared_policy", + sizeof(struct 
shared_policy), + 0, SLAB_PANIC, NULL); + sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); Index: linux-2.6.36-mmotm-101103-1217/mm/shmem.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/shmem.c +++ linux-2.6.36-mmotm-101103-1217/mm/shmem.c @@ -1259,7 +1259,8 @@ repeat: radix_tree_preload_end(); if (sgp != SGP_READ && !prealloc_page) { /* We don't care if this fails */ - prealloc_page = shmem_alloc_page(gfp, mapping->spolicy, idx); + prealloc_page = shmem_alloc_page(gfp, + mapping_shared_policy(mapping), idx); if (prealloc_page) { if (mem_cgroup_cache_charge(prealloc_page, current->mm, GFP_KERNEL)) { @@ -1292,8 +1293,8 @@ repeat: *type |= VM_FAULT_MAJOR; } spin_unlock(&info->lock); - swappage = shmem_swapin(swap, gfp, mapping->spolicy, - idx); + swappage = shmem_swapin(swap, gfp, + mapping_shared_policy(mapping), idx); if (!swappage) { spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); @@ -1420,7 +1421,8 @@ repeat: if (!prealloc_page) { spin_unlock(&info->lock); - filepage = shmem_alloc_page(gfp, mapping->spolicy, idx); + filepage = shmem_alloc_page(gfp, + mapping_shared_policy(mapping), idx); if (!filepage) { shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); @@ -1525,18 +1527,28 @@ static int shmem_fault(struct vm_area_st #ifdef CONFIG_NUMA static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) { - struct inode *i = vma->vm_file->f_path.dentry->d_inode; - return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); + struct address_space *mapping = vma->vm_file->f_mapping; + struct shared_policy *sp = mapping_shared_policy(mapping); + + if (!sp) { + sp = mpol_shared_policy_new(mapping, NULL); + if (IS_ERR(sp)) + return PTR_ERR(sp); + } + return mpol_set_shared_policy(sp, vma, new); } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) { - struct inode *i = 
vma->vm_file->f_path.dentry->d_inode; + struct address_space *mapping = vma->vm_file->f_mapping; + struct shared_policy *sp = mapping_shared_policy(mapping); unsigned long idx; + if (!sp) + return NULL; /* == default policy */ idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); + return mpol_shared_policy_lookup(sp, idx); } #endif @@ -1608,9 +1620,15 @@ static struct inode *shmem_get_inode(str inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_inode_operations; inode->i_fop = &shmem_file_operations; - inode->i_mapping->spolicy = &info->policy; - mpol_shared_policy_init(inode->i_mapping->spolicy, - shmem_get_sbmpol(sbinfo)); + if (sbinfo->mpol) { + struct address_space *mapping = + inode->i_mapping; + struct shared_policy *sp = + mpol_shared_policy_new(mapping, + shmem_get_sbmpol(sbinfo)); + if (!IS_ERR(sp)) + set_mapping_shared_policy(mapping, sp); + } break; case S_IFDIR: inc_nlink(inode); @@ -1621,12 +1639,9 @@ static struct inode *shmem_get_inode(str break; case S_IFLNK: /* - * Must not load anything in the rbtree, - * mpol_free_shared_policy will not be called. + * This case only exists so that we don't attempt + * to call init_special_inode() for sym links. 
*/ - inode->i_mapping->spolicy = &info->policy; - mpol_shared_policy_init(inode->i_mapping->spolicy, - NULL); break; } } else @@ -2422,7 +2437,7 @@ static void shmem_destroy_inode(struct i { if ((inode->i_mode & S_IFMT) == S_IFREG) { /* only struct inode is valid if it's an inline symlink */ - mpol_free_shared_policy(inode->i_mapping->spolicy); + mpol_free_shared_policy(inode->i_mapping); } kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } Index: linux-2.6.36-mmotm-101103-1217/fs/hugetlbfs/inode.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/hugetlbfs/inode.c +++ linux-2.6.36-mmotm-101103-1217/fs/hugetlbfs/inode.c @@ -448,14 +448,13 @@ static int hugetlbfs_setattr(struct dent return 0; } -static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, +static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, gid_t gid, int mode, dev_t dev) { struct inode *inode; inode = new_inode(sb); if (inode) { - struct hugetlbfs_inode_info *info; inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = uid; @@ -464,16 +463,9 @@ static struct inode *hugetlbfs_get_inode inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; INIT_LIST_HEAD(&inode->i_mapping->private_list); - info = HUGETLBFS_I(inode); /* - * The policy is initialized here even if we are creating a - * private inode because initialization simply creates an - * an empty rb tree and calls spin_lock_init(), later when we - * call mpol_free_shared_policy() it will just return because - * the rb tree will still be empty. 
+ * leave i_mapping->spolicy NULL [default policy] */ - inode->i_mapping->spolicy = &info->policy; - mpol_shared_policy_init(inode->i_mapping->spolicy, NULL); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -486,7 +478,10 @@ static struct inode *hugetlbfs_get_inode inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; - /* directory inodes start off with i_nlink == 2 (for "." entry) */ + /* + * directory inodes start off with i_nlink == 2 + * (for "." entry) + */ inc_nlink(inode); break; case S_IFLNK: @@ -667,7 +662,7 @@ static struct inode *hugetlbfs_alloc_ino static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); - mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + mpol_free_shared_policy(inode->i_mapping); kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode)); } Index: linux-2.6.36-mmotm-101103-1217/include/linux/hugetlb.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/hugetlb.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/hugetlb.h @@ -152,7 +152,6 @@ struct hugetlbfs_sb_info { struct hugetlbfs_inode_info { - struct shared_policy policy; struct inode vfs_inode; }; Index: linux-2.6.36-mmotm-101103-1217/include/linux/shmem_fs.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/shmem_fs.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/shmem_fs.h @@ -15,7 +15,6 @@ struct shmem_inode_info { unsigned long alloced; /* data pages alloced to file */ unsigned long swapped; /* subtotal assigned to swap */ unsigned long next_index; /* highest alloced index + 1 */ - struct shared_policy policy; /* NUMA memory alloc policy */ struct page *i_indirect; /* top indirect blocks page */ swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* first blocks */ struct list_head swaplist; /* chain of maybes on swap */ 
Index: linux-2.6.36-mmotm-101103-1217/fs/inode.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/fs/inode.c +++ linux-2.6.36-mmotm-101103-1217/fs/inode.c @@ -215,6 +215,7 @@ int inode_init_always(struct super_block mapping->backing_dev_info = bdi; } inode->i_private = NULL; + set_mapping_shared_policy(mapping, NULL); inode->i_mapping = mapping; #ifdef CONFIG_FS_POSIX_ACL inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED; Index: linux-2.6.36-mmotm-101103-1217/include/linux/fs.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/fs.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/fs.h @@ -647,7 +647,9 @@ struct address_space { struct list_head private_list; /* ditto */ struct address_space *assoc_mapping; /* ditto */ +#ifdef CONFIG_NUMA struct shared_policy *spolicy; +#endif } __attribute__((aligned(sizeof(long)))); /* * On most architectures that alignment is already the case; but @@ -655,6 +657,31 @@ struct address_space { * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ +#ifdef CONFIG_NUMA +static inline struct shared_policy * +mapping_shared_policy(struct address_space *mapping) +{ + return mapping->spolicy; +} + +static inline void set_mapping_shared_policy(struct address_space *mapping, + struct shared_policy *sp) +{ + mapping->spolicy = sp; +} + +#else +static inline struct shared_policy * +mapping_shared_policy(struct address_space *mapping) +{ + return NULL; +} + +static inline void set_mapping_shared_policy(struct address_space *mapping, + struct shared_policy *sp) +{ } +#endif + struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ struct inode * bd_inode; /* will die */ -- To unsubscribe from this list: send the line "unsubscribe linux-numa" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html