Re: [patch 23/35] fs: icache use per-CPU lists and locks for sb inode lists

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 19 Oct 2010, npiggin@xxxxxxxx wrote:
> Signed-off-by: Nick Piggin <npiggin@xxxxxxxxx>
> 
> ---
>  fs/drop_caches.c                 |    4 -
>  fs/fs-writeback.c                |   15 +++--
>  fs/inode.c                       |   99 ++++++++++++++++++++++++++++-----------
>  fs/notify/inode_mark.c           |    6 +-
>  fs/quota/dquot.c                 |    8 +--
>  fs/super.c                       |   16 +++++-
>  include/linux/fs.h               |   58 ++++++++++++++++++++++
>  include/linux/fsnotify_backend.h |    4 -
>  include/linux/writeback.h        |    1 
>  9 files changed, 164 insertions(+), 47 deletions(-)
> 
> Index: linux-2.6/fs/inode.c
> ===================================================================
> --- linux-2.6.orig/fs/inode.c	2010-10-19 14:18:59.000000000 +1100
> +++ linux-2.6/fs/inode.c	2010-10-19 14:19:23.000000000 +1100

[snip]

> @@ -718,13 +714,63 @@
>  	return tmp & I_HASHMASK;
>  }
>  
> +static inline int inode_list_cpu(struct inode *inode)
> +{
> +#ifdef CONFIG_SMP
> +	return inode->i_sb_list_cpu;
> +#else
> +	return smp_processor_id();
> +#endif
> +}
> +
> +/* helper for file_sb_list_add to reduce ifdefs */
> +static inline void __inode_sb_list_add(struct inode *inode, struct super_block *sb)
> +{
> +	struct list_head *list;
> +#ifdef CONFIG_SMP
> +	int cpu;
> +	cpu = smp_processor_id();
> +	inode->i_sb_list_cpu = cpu;
> +	list = per_cpu_ptr(sb->s_inodes, cpu);
> +#else
> +	list = &sb->s_inodes;
> +#endif
> +	list_add_rcu(&inode->i_sb_list, list);
> +}
> +
> +/**
> + * inode_sb_list_add - add an inode to the sb's file list
> + * @inode: inode to add
> + * @sb: sb to add it to
> + *
> + * Use this function to associate an with the superblock it belongs to.

                                       ^^^inode

> + */
> +static void inode_sb_list_add(struct inode *inode, struct super_block *sb)
> +{
> +	lg_local_lock(inode_list_lglock);
> +	__inode_sb_list_add(inode, sb);
> +	lg_local_unlock(inode_list_lglock);
> +}
> +
> +/**
> + * inode_sb_list_del - remove an inode from the sb's inode list
> + * @inode: inode to remove
> + * @sb: sb to remove it from
> + *
> + * Use this function to remove an inode from its superblock.
> + */
> +static void inode_sb_list_del(struct inode *inode)
> +{
> +	lg_local_lock_cpu(inode_list_lglock, inode_list_cpu(inode));
> +	list_del_rcu(&inode->i_sb_list);
> +	lg_local_unlock_cpu(inode_list_lglock, inode_list_cpu(inode));
> +}
> +
>  static inline void
>  __inode_add_to_lists(struct super_block *sb, struct inode_hash_bucket *b,
>  			struct inode *inode)
>  {
> -	spin_lock(&sb_inode_list_lock);
> -	list_add_rcu(&inode->i_sb_list, &sb->s_inodes);
> -	spin_unlock(&sb_inode_list_lock);
> +	inode_sb_list_add(inode, sb);
>  	if (b) {
>  		spin_lock_bucket(b);
>  		hlist_bl_add_head_rcu(&inode->i_hash, &b->head);
> @@ -1270,6 +1316,7 @@
>  			continue;
>  		if (!spin_trylock(&old->i_lock)) {
>  			spin_unlock_bucket(b);
> +			cpu_relax();

Doesn't this logically belong to a previous patch?

>  			goto repeat;
>  		}
>  		goto found_old;
> @@ -1453,9 +1500,7 @@
>  			inodes_stat.nr_unused--;
>  		spin_unlock(&wb_inode_list_lock);
>  	}
> -	spin_lock(&sb_inode_list_lock);
> -	list_del_rcu(&inode->i_sb_list);
> -	spin_unlock(&sb_inode_list_lock);
> +	inode_sb_list_del(inode);
>  	WARN_ON(inode->i_state & I_NEW);
>  	inode->i_state |= I_FREEING;
>  	spin_unlock(&inode->i_lock);
> @@ -1732,6 +1777,8 @@
>  					 init_once);
>  	register_shrinker(&icache_shrinker);
>  
> +	lg_lock_init(inode_list_lglock);
> +
>  	/* Hash may have been set up in inode_init_early */
>  	if (!hashdist)
>  		return;
> Index: linux-2.6/include/linux/fs.h
> ===================================================================
> --- linux-2.6.orig/include/linux/fs.h	2010-10-19 14:18:59.000000000 +1100
> +++ linux-2.6/include/linux/fs.h	2010-10-19 14:19:22.000000000 +1100
> @@ -374,6 +374,7 @@
>  #include <linux/cache.h>
>  #include <linux/kobject.h>
>  #include <linux/list.h>
> +#include <linux/rculist.h>
>  #include <linux/rculist_bl.h>
>  #include <linux/radix-tree.h>
>  #include <linux/prio_tree.h>
> @@ -733,6 +734,9 @@
>  		struct rcu_head		i_rcu;
>  	};
>  	unsigned long		i_ino;
> +#ifdef CONFIG_SMP
> +	int			i_sb_list_cpu;
> +#endif
>  	unsigned int		i_count;
>  	unsigned int		i_nlink;
>  	uid_t			i_uid;
> @@ -1344,11 +1348,12 @@
>  #endif
>  	const struct xattr_handler **s_xattr;
>  
> -	struct list_head	s_inodes;	/* all inodes */
>  	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
>  #ifdef CONFIG_SMP
> +	struct list_head __percpu *s_inodes;
>  	struct list_head __percpu *s_files;
>  #else
> +	struct list_head	s_inodes;	/* all inodes */
>  	struct list_head	s_files;
>  #endif
>  	/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
> @@ -2202,6 +2207,57 @@
>  	__insert_inode_hash(inode, inode->i_ino);
>  }
>  
> +#ifdef CONFIG_SMP
> +/*
> + * These macros iterate all inodes on all CPUs for a given superblock.
> + * rcu_read_lock must be held.
> + */
> +#define do_inode_list_for_each_entry_rcu(__sb, __inode)		\
> +{								\
> +	int i;							\
> +	for_each_possible_cpu(i) {				\
> +		struct list_head *list;				\
> +		list = per_cpu_ptr((__sb)->s_inodes, i);	\
> +		list_for_each_entry_rcu((__inode), list, i_sb_list)
> +
> +#define while_inode_list_for_each_entry_rcu			\
> +	}							\
> +}
> +
> +#define do_inode_list_for_each_entry_safe(__sb, __inode, __tmp)	\
> +{								\
> +	int i;							\
> +	for_each_possible_cpu(i) {				\
> +		struct list_head *list;				\
> +		list = per_cpu_ptr((__sb)->s_inodes, i);	\
> +		list_for_each_entry_safe((__inode), (__tmp), list, i_sb_list)
> +
> +#define while_inode_list_for_each_entry_safe			\
> +	}							\
> +}
> +
> +#else
> +
> +#define do_inode_list_for_each_entry_rcu(__sb, __inode)		\
> +{								\
> +	struct list_head *list;					\
> +	list = &(sb)->s_inodes;					\
> +	list_for_each_entry_rcu((__inode), list, i_sb_list)
> +
> +#define while_inode_list_for_each_entry_rcu			\
> +}
> +
> +#define do_inode_list_for_each_entry_safe(__sb, __inode, __tmp)	\
> +{								\
> +	struct list_head *list;					\
> +	list = &(sb)->s_inodes;					\
> +	list_for_each_entry_safe((__inode), (__tmp), list, i_sb_list)
> +
> +#define while_inode_list_for_each_entry_safe			\
> +}
> +
> +#endif
> +
>  #ifdef CONFIG_BLOCK
>  extern void submit_bio(int, struct bio *);
>  extern int bdev_read_only(struct block_device *);
> Index: linux-2.6/fs/super.c
> ===================================================================
> --- linux-2.6.orig/fs/super.c	2010-10-19 14:17:17.000000000 +1100
> +++ linux-2.6/fs/super.c	2010-10-19 14:18:59.000000000 +1100
> @@ -67,12 +67,25 @@
>  			for_each_possible_cpu(i)
>  				INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
>  		}
> +		s->s_inodes = alloc_percpu(struct list_head);
> +		if (!s->s_inodes) {
> +			free_percpu(s->s_files);
> +			security_sb_free(s);
> +			kfree(s);
> +			s = NULL;
> +			goto out;

Factor out error cleanups to separate out labels?

> +		} else {
> +			int i;
> +
> +			for_each_possible_cpu(i)
> +				INIT_LIST_HEAD(per_cpu_ptr(s->s_inodes, i));
> +		}
>  #else
>  		INIT_LIST_HEAD(&s->s_files);
> +		INIT_LIST_HEAD(&s->s_inodes);
>  #endif
>  		INIT_LIST_HEAD(&s->s_instances);
>  		INIT_HLIST_HEAD(&s->s_anon);
> -		INIT_LIST_HEAD(&s->s_inodes);
>  		INIT_LIST_HEAD(&s->s_dentry_lru);
>  		init_rwsem(&s->s_umount);
>  		mutex_init(&s->s_lock);
> @@ -124,6 +137,7 @@
>  static inline void destroy_super(struct super_block *s)
>  {
>  #ifdef CONFIG_SMP
> +	free_percpu(s->s_inodes);
>  	free_percpu(s->s_files);
>  #endif
>  	security_sb_free(s);
> Index: linux-2.6/fs/drop_caches.c
> ===================================================================
> --- linux-2.6.orig/fs/drop_caches.c	2010-10-19 14:18:59.000000000 +1100
> +++ linux-2.6/fs/drop_caches.c	2010-10-19 14:19:18.000000000 +1100
> @@ -17,7 +17,7 @@
>  	struct inode *inode, *toput_inode = NULL;
>  
>  	rcu_read_lock();
> -	list_for_each_entry_rcu(inode, &sb->s_inodes, i_sb_list) {
> +	do_inode_list_for_each_entry_rcu(sb, inode) {
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)
>  				|| inode->i_mapping->nrpages == 0) {
> @@ -31,7 +31,7 @@
>  		iput(toput_inode);
>  		toput_inode = inode;
>  		rcu_read_lock();
> -	}
> +	} while_inode_list_for_each_entry_rcu
>  	rcu_read_unlock();
>  	iput(toput_inode);
>  }
> Index: linux-2.6/fs/fs-writeback.c
> ===================================================================
> --- linux-2.6.orig/fs/fs-writeback.c	2010-10-19 14:18:59.000000000 +1100
> +++ linux-2.6/fs/fs-writeback.c	2010-10-19 14:19:22.000000000 +1100
> @@ -1074,7 +1074,7 @@
>  	 * we still have to wait for that writeout.
>  	 */
>  	rcu_read_lock();
> -	list_for_each_entry_rcu(inode, &sb->s_inodes, i_sb_list) {
> +	do_inode_list_for_each_entry_rcu(sb, inode) {
>  		struct address_space *mapping;
>  
>  		spin_lock(&inode->i_lock);
> @@ -1093,11 +1093,12 @@
>  		spin_unlock(&inode->i_lock);
>  		rcu_read_unlock();
>  		/*
> -		 * We hold a reference to 'inode' so it couldn't have been
> -		 * removed from s_inodes list while we dropped the i_lock.  We
> -		 * cannot iput the inode now as we can be holding the last
> -		 * reference and we cannot iput it under spinlock. So we keep
> -		 * the reference and iput it later.
> +		 * We hold a reference to 'inode' so it couldn't have
> +		 * been removed from s_inodes list while we dropped the
> +		 * i_lock.  We cannot iput the inode now as we can be
> +		 * holding the last reference and we cannot iput it
> +		 * under spinlock. So we keep the reference and iput it
> +		 * later.
>  		 */
>  		iput(old_inode);
>  		old_inode = inode;
> @@ -1107,7 +1108,7 @@
>  		cond_resched();
>  
>  		rcu_read_lock();
> -	}
> +	} while_inode_list_for_each_entry_rcu
>  	rcu_read_unlock();
>  	iput(old_inode);
>  }
> Index: linux-2.6/fs/notify/inode_mark.c
> ===================================================================
> --- linux-2.6.orig/fs/notify/inode_mark.c	2010-10-19 14:18:59.000000000 +1100
> +++ linux-2.6/fs/notify/inode_mark.c	2010-10-19 14:19:18.000000000 +1100
> @@ -236,11 +236,11 @@
>   * and with the sb going away, no new inodes will appear or be referenced
>   * from other paths.
>   */
> -void fsnotify_unmount_inodes(struct list_head *list)
> +void fsnotify_unmount_inodes(struct super_block *sb)
>  {
>  	struct inode *inode, *next_i, *need_iput = NULL;
>  
> -	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
> +	do_inode_list_for_each_entry_safe(sb, inode, next_i) {
>  		struct inode *need_iput_tmp;
>  
>  		spin_lock(&inode->i_lock);
> @@ -295,5 +295,5 @@
>  		fsnotify_inode_delete(inode);
>  
>  		iput(inode);
> -	}
> +	} while_inode_list_for_each_entry_safe
>  }
> Index: linux-2.6/fs/quota/dquot.c
> ===================================================================
> --- linux-2.6.orig/fs/quota/dquot.c	2010-10-19 14:18:59.000000000 +1100
> +++ linux-2.6/fs/quota/dquot.c	2010-10-19 14:19:18.000000000 +1100
> @@ -898,7 +898,7 @@
>  #endif
>  
>  	rcu_read_lock();
> -	list_for_each_entry_rcu(inode, &sb->s_inodes, i_sb_list) {
> +	do_inode_list_for_each_entry_rcu(sb, inode) {
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
>  			spin_unlock(&inode->i_lock);
> @@ -930,7 +930,7 @@
>  		 * lock. So we keep the reference and iput it later. */
>  		old_inode = inode;
>  		rcu_read_lock();
> -	}
> +	} while_inode_list_for_each_entry_rcu
>  	rcu_read_unlock();
>  	iput(old_inode);
>  
> @@ -1013,7 +1013,7 @@
>  	int reserved = 0;
>  
>  	rcu_read_lock();
> -	list_for_each_entry_rcu(inode, &sb->s_inodes, i_sb_list) {
> +	do_inode_list_for_each_entry_rcu(sb, inode) {
>  		/*
>  		 *  We have to scan also I_NEW inodes because they can already
>  		 *  have quota pointer initialized. Luckily, we need to touch
> @@ -1025,7 +1025,7 @@
>  				reserved = 1;
>  			remove_inode_dquot_ref(inode, type, tofree_head);
>  		}
> -	}
> +	} while_inode_list_for_each_entry_rcu
>  	rcu_read_unlock();
>  #ifdef CONFIG_QUOTA_DEBUG
>  	if (reserved) {
> Index: linux-2.6/include/linux/fsnotify_backend.h
> ===================================================================
> --- linux-2.6.orig/include/linux/fsnotify_backend.h	2010-10-19 14:17:17.000000000 +1100
> +++ linux-2.6/include/linux/fsnotify_backend.h	2010-10-19 14:18:59.000000000 +1100
> @@ -402,7 +402,7 @@
>  extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
>  extern void fsnotify_get_mark(struct fsnotify_mark *mark);
>  extern void fsnotify_put_mark(struct fsnotify_mark *mark);
> -extern void fsnotify_unmount_inodes(struct list_head *list);
> +extern void fsnotify_unmount_inodes(struct super_block *sb);
>  
>  /* put here because inotify does some weird stuff when destroying watches */
>  extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
> @@ -443,7 +443,7 @@
>  	return 0;
>  }
>  
> -static inline void fsnotify_unmount_inodes(struct list_head *list)
> +static inline void fsnotify_unmount_inodes(struct super_block *sb)
>  {}
>  
>  #endif	/* CONFIG_FSNOTIFY */
> Index: linux-2.6/include/linux/writeback.h
> ===================================================================
> --- linux-2.6.orig/include/linux/writeback.h	2010-10-19 14:18:59.000000000 +1100
> +++ linux-2.6/include/linux/writeback.h	2010-10-19 14:19:21.000000000 +1100
> @@ -9,7 +9,6 @@
>  
>  struct backing_dev_info;
>  
> -extern spinlock_t sb_inode_list_lock;
>  extern spinlock_t wb_inode_list_lock;
>  extern struct list_head inode_unused;
>  
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux