[PATCH/RFC 3/14] Shared Policy: allocate shared policies as needed

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Shared Policy Infrastructure - dynamically alloc shared policies

Remove shared policy structs from shmem and hugetlbfs inode
info structs and dynamically allocate them as needed.

Make shared policy pointer in address_space dependent on
CONFIG_NUMA to avoid burdening configs that don't need/want
NUMA support.  Access [get/set] the shared_policy via wrappers
that also depend on CONFIG_NUMA [to avoid excessive #ifdefs in
.c files].

Initialize shmem and hugetlbfs inode/address_space spolicy
pointer to null, unless superblock [mount] specifies a
non-default policy.  A NULL shared policy pointer will cause
'get policy'--e.g., for page allocations--to fall back to the
task policy, if any, else to the system default policy, just
like a NULL vma policy.

The set_policy() ops must allocate a shared_policy struct from a
new kmem cache when a policy is installed and no spolicy exists
yet.  mpol_shared_policy_init() is replaced with
mpol_shared_policy_new() to accomplish this.

shmem must create/initialize a shared_policy when it allocates
an inode if the tmpfs super-block/mount point specifies a
non-default policy.

mpol_free_shared_policy() must free the spolicy, if any, when
inode is destroyed.

NOTE:  along with the previous patch in the series, this patch
adds a single pointer to the generic address_space struct and
thus to all inodes.  Last I looked, this did not decrease the
number of inodes per slab for x86_64 [and ia64 FWIW].  Other
archs need to be checked.  If this extra pointer is problematic,
I believe we could overload the non-linear pointer and use
as_flags to detect which use is in effect.


Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx>

 fs/hugetlbfs/inode.c          |   19 ++----
 fs/inode.c                    |    1 
 include/linux/fs.h            |   27 ++++++++
 include/linux/hugetlb.h       |    1 
 include/linux/shared_policy.h |   24 ++++---
 include/linux/shmem_fs.h      |    1 
 mm/mempolicy.c                |  128 ++++++++++++++++++++++++++++--------------
 mm/shmem.c                    |   49 ++++++++++------
 8 files changed, 168 insertions(+), 82 deletions(-)

Index: linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/shared_policy.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/shared_policy.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_SHARED_POLICY_H
 #define _LINUX_SHARED_POLICY_H 1
 
+#include <linux/fs.h>
 #include <linux/spinlock.h>
 #include <linux/rbtree.h>
 
@@ -28,13 +29,15 @@ struct shared_policy {
 	spinlock_t lock;	/* protects rb tree */
 };
 
-void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
-int mpol_set_shared_policy(struct shared_policy *,
-				struct vm_area_struct *,
-				struct mempolicy *);
-void mpol_free_shared_policy(struct shared_policy *);
-struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *,
-					    unsigned long);
+extern struct shared_policy *mpol_shared_policy_new(
+					struct address_space *mapping,
+					struct mempolicy *mpol);
+extern int mpol_set_shared_policy(struct shared_policy *,
+					struct vm_area_struct *,
+					struct mempolicy *);
+extern void mpol_free_shared_policy(struct address_space *);
+extern struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *,
+					unsigned long);
 
 #else /* !NUMA */
 
@@ -47,12 +50,12 @@ static inline int mpol_set_shared_policy
 	return -EINVAL;
 }
 
-static inline void mpol_shared_policy_init(struct shared_policy *sp,
-						struct mempolicy *mpol)
+static inline struct shared_policy *
+mpol_shared_policy_new(struct address_space *mapping, struct mempolicy *mpol)
 {
 }
 
-static inline void mpol_free_shared_policy(struct shared_policy *p)
+static inline void mpol_free_shared_policy(struct shared_policy *sp)
 {
 }
 
@@ -61,6 +64,7 @@ mpol_shared_policy_lookup(struct shared_
 {
 	return NULL;
 }
+
 #endif
 
 #endif /* _LINUX_SHARED_POLICY_H */
Index: linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/mempolicy.c
+++ linux-2.6.36-mmotm-101103-1217/mm/mempolicy.c
@@ -102,6 +102,7 @@
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
 
 static struct kmem_cache *policy_cache;
+static struct kmem_cache *sp_cache;
 static struct kmem_cache *sn_cache;
 
 /* Highest zone. An specific allocation for a zone below that is not
@@ -2137,52 +2138,86 @@ restart:
 }
 
 /**
- * mpol_shared_policy_init - initialize shared policy for inode
- * @sp: pointer to inode shared policy
- * @mpol:  struct mempolicy to install
+ * mpol_shared_policy_new - allocate and initialize a shared policy struct
+ * @mpol:  struct mempolicy to install, if non-NULL == tmpfs mount point
+ * mempolicy.
  *
- * Install non-NULL @mpol in inode's shared policy rb-tree.
+ * Allocate a new shared policy structure and install non-NULL @mpol.
  * On entry, the current task has a reference on a non-NULL @mpol.
  * This must be released on exit.
  * This is called at get_inode() calls and we can use GFP_KERNEL.
  */
-void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
+struct shared_policy *mpol_shared_policy_new(struct address_space *mapping,
+						struct mempolicy *mpol)
 {
-	int ret;
-
-	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
-	spin_lock_init(&sp->lock);
+	struct shared_policy *sp, *spx;
+	struct mempolicy *new = NULL;
+	int err = 0;
 
 	if (mpol) {
-		struct vm_area_struct pvma;
-		struct mempolicy *new;
 		NODEMASK_SCRATCH(scratch);
 
-		if (!scratch)
-			goto put_mpol;
-		/* contextualize the tmpfs mount point mempolicy */
+		if (!scratch) {
+			sp = ERR_PTR(-ENOMEM);
+			err = !0;
+			goto put_free;
+		}
+		sp = mapping->spolicy;
+		/*
+		 * Contextualize the tmpfs mount point mempolicy.  Ensure that
+		 * we have a good mempolicy before allocating new shared policy.
+		 */
 		new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
-		if (IS_ERR(new))
-			goto free_scratch; /* no valid nodemask intersection */
+		err = IS_ERR(new);
+		if (err)
+			goto put_free;
 
 		task_lock(current);
-		ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
+		err = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
 		task_unlock(current);
-		if (ret)
-			goto put_new;
+put_free:
+		mpol_put(mpol);	/* drop our ref on sb mpol */
+		NODEMASK_SCRATCH_FREE(scratch);	/* scratch may be NULL */
+		if (err) {
+			mpol_put(new);	/* free bogus new mpol */
+			return sp;
+		}
+	}
 
-		/* Create pseudo-vma that contains just the policy */
+	sp = kmem_cache_alloc(sp_cache, GFP_KERNEL);
+	if (!sp) {
+		mpol_put(new);
+		return ERR_PTR(-ENOMEM);
+	}
+	sp->root = RB_ROOT;		/* empty tree == default mempolicy */
+	spin_lock_init(&sp->lock);
+
+	if (new) {
+		/*
+		 * Create pseudo-vma to specify policy range and
+		 * install new mempolicy
+		 */
+		struct vm_area_struct pvma;
 		memset(&pvma, 0, sizeof(struct vm_area_struct));
 		pvma.vm_end = TASK_SIZE;	/* policy covers entire file */
-		mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
-
-put_new:
+		err = mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
 		mpol_put(new);			/* drop initial ref */
-free_scratch:
-		NODEMASK_SCRATCH_FREE(scratch);
-put_mpol:
-		mpol_put(mpol);	/* drop our incoming ref on sb mpol */
 	}
+
+	/*
+	 * resolve potential set/set race; handle 'set' error
+	 */
+	spin_lock(&mapping->i_mmap_lock);
+	spx = mapping->spolicy;
+	if (!spx && !err)
+		mapping->spolicy = spx = sp;
+	else
+		err = !0;
+	spin_unlock(&mapping->i_mmap_lock);
+	if (err)
+		kmem_cache_free(sp_cache, sp);
+
+	return spx;
 }
 
 int mpol_set_shared_policy(struct shared_policy *sp,
@@ -2211,28 +2246,35 @@ int mpol_set_shared_policy(struct shared
 
 /**
  * mpol_free_shared_policy() - Free a backing policy store on inode delete.
- * @sp - shared policy structure to free
+ * @mapping - address_space struct containing pointer to shared policy to be freed.
  *
  * Frees the shared policy red-black tree, if any, before freeing the
- * shared policy struct itself.
+ * shared policy struct itself, if any.
  */
-void mpol_free_shared_policy(struct shared_policy *sp)
+void mpol_free_shared_policy(struct address_space *mapping)
 {
+	struct shared_policy *sp = mapping->spolicy;
 	struct sp_node *n;
 	struct rb_node *next;
 
-	if (!sp->root.rb_node)
-		return;
-	spin_lock(&sp->lock);
-	next = rb_first(&sp->root);
-	while (next) {
-		n = rb_entry(next, struct sp_node, nd);
-		next = rb_next(&n->nd);
-		rb_erase(&n->nd, &sp->root);
-		mpol_put(n->policy);
-		kmem_cache_free(sn_cache, n);
+	if (!sp)
+  		return;
+
+	mapping->spolicy = NULL;
+
+	if (sp->root.rb_node) {
+		spin_lock(&sp->lock);
+		next = rb_first(&sp->root);
+		while (next) {
+			n = rb_entry(next, struct sp_node, nd);
+			next = rb_next(&n->nd);
+			rb_erase(&n->nd, &sp->root);
+			mpol_put(n->policy);
+			kmem_cache_free(sn_cache, n);
+		}
+		spin_unlock(&sp->lock);
 	}
-	spin_unlock(&sp->lock);
+	kmem_cache_free(sp_cache, sp);
 }
 
 /* assumes fs == KERNEL_DS */
@@ -2246,6 +2288,10 @@ void __init numa_policy_init(void)
 					 sizeof(struct mempolicy),
 					 0, SLAB_PANIC, NULL);
 
+	sp_cache = kmem_cache_create("shared_policy",
+				     sizeof(struct shared_policy),
+				     0, SLAB_PANIC, NULL);
+
 	sn_cache = kmem_cache_create("shared_policy_node",
 				     sizeof(struct sp_node),
 				     0, SLAB_PANIC, NULL);
Index: linux-2.6.36-mmotm-101103-1217/mm/shmem.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/mm/shmem.c
+++ linux-2.6.36-mmotm-101103-1217/mm/shmem.c
@@ -1259,7 +1259,8 @@ repeat:
 		radix_tree_preload_end();
 		if (sgp != SGP_READ && !prealloc_page) {
 			/* We don't care if this fails */
-			prealloc_page = shmem_alloc_page(gfp, mapping->spolicy, idx);
+			prealloc_page = shmem_alloc_page(gfp,
+					mapping_shared_policy(mapping), idx);
 			if (prealloc_page) {
 				if (mem_cgroup_cache_charge(prealloc_page,
 						current->mm, GFP_KERNEL)) {
@@ -1292,8 +1293,8 @@ repeat:
 				*type |= VM_FAULT_MAJOR;
 			}
 			spin_unlock(&info->lock);
-			swappage = shmem_swapin(swap, gfp, mapping->spolicy,
-									idx);
+			swappage = shmem_swapin(swap, gfp,
+					mapping_shared_policy(mapping), idx);
 			if (!swappage) {
 				spin_lock(&info->lock);
 				entry = shmem_swp_alloc(info, idx, sgp);
@@ -1420,7 +1421,8 @@ repeat:
 
 			if (!prealloc_page) {
 				spin_unlock(&info->lock);
-				filepage = shmem_alloc_page(gfp, mapping->spolicy, idx);
+				filepage = shmem_alloc_page(gfp,
+						mapping_shared_policy(mapping), idx);
 				if (!filepage) {
 					shmem_unacct_blocks(info->flags, 1);
 					shmem_free_blocks(inode, 1);
@@ -1525,18 +1527,28 @@ static int shmem_fault(struct vm_area_st
 #ifdef CONFIG_NUMA
 static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 {
-	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
-	return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct shared_policy *sp = mapping_shared_policy(mapping);
+
+	if (!sp) {
+		sp = mpol_shared_policy_new(mapping, NULL);
+		if (IS_ERR(sp))
+			return PTR_ERR(sp);
+	}
+	return mpol_set_shared_policy(sp, vma, new);
 }
 
 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 					  unsigned long addr)
 {
-	struct inode *i = vma->vm_file->f_path.dentry->d_inode;
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct shared_policy *sp = mapping_shared_policy(mapping);
 	unsigned long idx;
 
+	if (!sp)
+		return NULL;	/* == default policy */
 	idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
-	return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
+	return mpol_shared_policy_lookup(sp, idx);
 }
 #endif
 
@@ -1608,9 +1620,15 @@ static struct inode *shmem_get_inode(str
 			inode->i_mapping->a_ops = &shmem_aops;
 			inode->i_op = &shmem_inode_operations;
 			inode->i_fop = &shmem_file_operations;
-			inode->i_mapping->spolicy = &info->policy;
-			mpol_shared_policy_init(inode->i_mapping->spolicy,
-						 shmem_get_sbmpol(sbinfo));
+			if (sbinfo->mpol) {
+				struct address_space *mapping =
+							 inode->i_mapping;
+				struct shared_policy *sp =
+						mpol_shared_policy_new(mapping,
+						     shmem_get_sbmpol(sbinfo));
+				if (!IS_ERR(sp))
+					set_mapping_shared_policy(mapping, sp);
+			}
 			break;
 		case S_IFDIR:
 			inc_nlink(inode);
@@ -1621,12 +1639,9 @@ static struct inode *shmem_get_inode(str
 			break;
 		case S_IFLNK:
 			/*
-			 * Must not load anything in the rbtree,
-			 * mpol_free_shared_policy will not be called.
+			 * This case only exists so that we don't attempt
+			 * to call init_special_inode() for sym links.
 			 */
-			inode->i_mapping->spolicy = &info->policy;
-			mpol_shared_policy_init(inode->i_mapping->spolicy,
-						NULL);
 			break;
 		}
 	} else
@@ -2422,7 +2437,7 @@ static void shmem_destroy_inode(struct i
 {
 	if ((inode->i_mode & S_IFMT) == S_IFREG) {
 		/* only struct inode is valid if it's an inline symlink */
-		mpol_free_shared_policy(inode->i_mapping->spolicy);
+		mpol_free_shared_policy(inode->i_mapping);
 	}
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
Index: linux-2.6.36-mmotm-101103-1217/fs/hugetlbfs/inode.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/fs/hugetlbfs/inode.c
+++ linux-2.6.36-mmotm-101103-1217/fs/hugetlbfs/inode.c
@@ -448,14 +448,13 @@ static int hugetlbfs_setattr(struct dent
 	return 0;
 }
 
-static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, 
+static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
 					gid_t gid, int mode, dev_t dev)
 {
 	struct inode *inode;
 
 	inode = new_inode(sb);
 	if (inode) {
-		struct hugetlbfs_inode_info *info;
 		inode->i_ino = get_next_ino();
 		inode->i_mode = mode;
 		inode->i_uid = uid;
@@ -464,16 +463,9 @@ static struct inode *hugetlbfs_get_inode
 		inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		INIT_LIST_HEAD(&inode->i_mapping->private_list);
-		info = HUGETLBFS_I(inode);
 		/*
-		 * The policy is initialized here even if we are creating a
-		 * private inode because initialization simply creates an
-		 * an empty rb tree and calls spin_lock_init(), later when we
-		 * call mpol_free_shared_policy() it will just return because
-		 * the rb tree will still be empty.
+		 * leave i_mapping->spolicy NULL [default policy]
 		 */
-		inode->i_mapping->spolicy = &info->policy;
-		mpol_shared_policy_init(inode->i_mapping->spolicy, NULL);
 		switch (mode & S_IFMT) {
 		default:
 			init_special_inode(inode, mode, dev);
@@ -486,7 +478,10 @@ static struct inode *hugetlbfs_get_inode
 			inode->i_op = &hugetlbfs_dir_inode_operations;
 			inode->i_fop = &simple_dir_operations;
 
-			/* directory inodes start off with i_nlink == 2 (for "." entry) */
+			/*
+			 * directory inodes start off with i_nlink == 2
+			 * (for "." entry)
+			 */
 			inc_nlink(inode);
 			break;
 		case S_IFLNK:
@@ -667,7 +662,7 @@ static struct inode *hugetlbfs_alloc_ino
 static void hugetlbfs_destroy_inode(struct inode *inode)
 {
 	hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb));
-	mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy);
+	mpol_free_shared_policy(inode->i_mapping);
 	kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
 }
 
Index: linux-2.6.36-mmotm-101103-1217/include/linux/hugetlb.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/hugetlb.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/hugetlb.h
@@ -152,7 +152,6 @@ struct hugetlbfs_sb_info {
 
 
 struct hugetlbfs_inode_info {
-	struct shared_policy policy;
 	struct inode vfs_inode;
 };
 
Index: linux-2.6.36-mmotm-101103-1217/include/linux/shmem_fs.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/shmem_fs.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/shmem_fs.h
@@ -15,7 +15,6 @@ struct shmem_inode_info {
 	unsigned long		alloced;	/* data pages alloced to file */
 	unsigned long		swapped;	/* subtotal assigned to swap */
 	unsigned long		next_index;	/* highest alloced index + 1 */
-	struct shared_policy	policy;		/* NUMA memory alloc policy */
 	struct page		*i_indirect;	/* top indirect blocks page */
 	swp_entry_t		i_direct[SHMEM_NR_DIRECT]; /* first blocks */
 	struct list_head	swaplist;	/* chain of maybes on swap */
Index: linux-2.6.36-mmotm-101103-1217/fs/inode.c
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/fs/inode.c
+++ linux-2.6.36-mmotm-101103-1217/fs/inode.c
@@ -215,6 +215,7 @@ int inode_init_always(struct super_block
 		mapping->backing_dev_info = bdi;
 	}
 	inode->i_private = NULL;
+	set_mapping_shared_policy(mapping, NULL);
 	inode->i_mapping = mapping;
 #ifdef CONFIG_FS_POSIX_ACL
 	inode->i_acl = inode->i_default_acl = ACL_NOT_CACHED;
Index: linux-2.6.36-mmotm-101103-1217/include/linux/fs.h
===================================================================
--- linux-2.6.36-mmotm-101103-1217.orig/include/linux/fs.h
+++ linux-2.6.36-mmotm-101103-1217/include/linux/fs.h
@@ -647,7 +647,9 @@ struct address_space {
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
 
+#ifdef CONFIG_NUMA
 	struct shared_policy	*spolicy;
+#endif
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -655,6 +657,31 @@ struct address_space {
 	 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
 	 */
 
+#ifdef CONFIG_NUMA
+static inline struct shared_policy *
+mapping_shared_policy(struct address_space *mapping)
+{
+	return mapping->spolicy;
+}
+
+static inline void set_mapping_shared_policy(struct address_space *mapping,
+						struct shared_policy *sp)
+{
+	mapping->spolicy = sp;
+}
+
+#else
+static inline struct shared_policy *
+mapping_shared_policy(struct address_space *mapping)
+{
+	return NULL;
+}
+
+static inline void set_mapping_shared_policy(struct address_space *mapping,
+						struct shared_policy *sp)
+{ }
+#endif
+
 struct block_device {
 	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
 	struct inode *		bd_inode;	/* will die */
--
To unsubscribe from this list: send the line "unsubscribe linux-numa" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]     [Devices]

  Powered by Linux