This patch refactors the following superblock inode list (sb->s_inodes) iteration functions in the VFS: 1. iterate_bdevs() 2. drop_pagecache_sb() 3. wait_sb_inodes() 4. evict_inodes() 5. invalidate_inodes() 6. fsnotify_unmount_inodes() 7. add_dquot_ref() 8. remove_dquot_ref() The per-inode processing code of the above functions is extracted out into inline functions to ease their conversion to use the per-cpu list. There is no functional change. Signed-off-by: Waiman Long <Waiman.Long@xxxxxxx> --- fs/block_dev.c | 59 +++++++++++--------- fs/drop_caches.c | 39 ++++++++----- fs/fs-writeback.c | 73 +++++++++++++----------- fs/inode.c | 108 ++++++++++++++++++++--------------- fs/notify/inode_mark.c | 146 ++++++++++++++++++++++++++---------------------- fs/quota/dquot.c | 105 ++++++++++++++++++++-------------- 6 files changed, 298 insertions(+), 232 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 39b3a17..6eaeedf 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1862,38 +1862,45 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) } EXPORT_SYMBOL(__invalidate_device); +static inline void +__iterate_bdev(spinlock_t *lock, struct inode *inode, struct inode **old_inode, + void (*func)(struct block_device *, void *), void *arg) +{ + struct address_space *mapping = inode->i_mapping; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || + mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); + return; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lock); + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * pcpu_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * pcpu_lock. So we keep the reference and iput it later. 
+ */ + iput(*old_inode); + *old_inode = inode; + + func(I_BDEV(inode), arg); + + spin_lock(lock); +} + void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) { struct inode *inode, *old_inode = NULL; spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { - struct address_space *mapping = inode->i_mapping; - - spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || - mapping->nrpages == 0) { - spin_unlock(&inode->i_lock); - continue; - } - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&blockdev_superblock->s_inode_list_lock); - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. - */ - iput(old_inode); - old_inode = inode; + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) + __iterate_bdev(&blockdev_superblock->s_inode_list_lock, + inode, &old_inode, func, arg); - func(I_BDEV(inode), arg); - - spin_lock(&blockdev_superblock->s_inode_list_lock); - } spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d72d52b..d3449d5 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -13,28 +13,35 @@ /* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches; +static inline void __drop_pagecache_sb(spinlock_t *lock, struct inode *inode, + struct inode **toput_inode) +{ + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (inode->i_mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); + return; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lock); + + invalidate_mapping_pages(inode->i_mapping, 0, -1); + 
iput(*toput_inode); + *toput_inode = inode; + + spin_lock(lock); +} + static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0)) { - spin_unlock(&inode->i_lock); - continue; - } - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) + __drop_pagecache_sb(&sb->s_inode_list_lock, inode, + &toput_inode); - invalidate_mapping_pages(inode->i_mapping, 0, -1); - iput(toput_inode); - toput_inode = inode; - - spin_lock(&sb->s_inode_list_lock); - } spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 6915c95..5ad6eda 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2095,6 +2095,43 @@ out_unlock_inode: } EXPORT_SYMBOL(__mark_inode_dirty); +static inline void __wait_sb_inode(spinlock_t *lock, struct inode *inode, + struct inode **old_inode) +{ + struct address_space *mapping = inode->i_mapping; + + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + (mapping->nrpages == 0)) { + spin_unlock(&inode->i_lock); + return; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lock); + + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * pcpu_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * pcpu_lock. So we keep the reference and iput it later. + */ + iput(*old_inode); + *old_inode = inode; + + /* + * We keep the error status of individual mapping so that + * applications can catch the writeback error using fsync(2). + * See filemap_fdatawait_keep_errors() for details. 
+ */ + filemap_fdatawait_keep_errors(mapping); + + cond_resched(); + + spin_lock(lock); +} + /* * The @s_sync_lock is used to serialise concurrent sync operations * to avoid lock contention problems with concurrent wait_sb_inodes() calls. @@ -2124,41 +2161,9 @@ static void wait_sb_inodes(struct super_block *sb) * In which case, the inode may not be on the dirty list, but * we still have to wait for that writeout. */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping = inode->i_mapping; - - spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (mapping->nrpages == 0)) { - spin_unlock(&inode->i_lock); - continue; - } - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) + __wait_sb_inode(&sb->s_inode_list_lock, inode, &old_inode); - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. - */ - iput(old_inode); - old_inode = inode; - - /* - * We keep the error status of individual mapping so that - * applications can catch the writeback error using fsync(2). - * See filemap_fdatawait_keep_errors() for details. 
- */ - filemap_fdatawait_keep_errors(mapping); - - cond_resched(); - - spin_lock(&sb->s_inode_list_lock); - } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); mutex_unlock(&sb->s_sync_lock); diff --git a/fs/inode.c b/fs/inode.c index 9f62db3..6dd609e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -579,6 +579,37 @@ static void dispose_list(struct list_head *head) } } +static inline int __evict_inode(spinlock_t *lock, struct inode *inode, + struct list_head *dispose) +{ + if (atomic_read(&inode->i_count)) + return 0; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { + spin_unlock(&inode->i_lock); + return 0; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, dispose); + + /* + * We can have a ton of inodes to evict at unmount time given + * enough memory, check to see if we need to go to sleep for a + * bit so we don't livelock. + */ + if (need_resched()) { + spin_unlock(lock); + cond_resched(); + dispose_list(dispose); + return 1; /* Redo it again */ + } + return 0; +} + /** * evict_inodes - evict all evictable inodes for a superblock * @sb: superblock to operate on @@ -596,35 +627,39 @@ void evict_inodes(struct super_block *sb) again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - if (atomic_read(&inode->i_count)) - continue; + if (__evict_inode(&sb->s_inode_list_lock, inode, &dispose)) + goto again; + } + spin_unlock(&sb->s_inode_list_lock); - spin_lock(&inode->i_lock); - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - spin_unlock(&inode->i_lock); - continue; - } + dispose_list(&dispose); +} - inode->i_state |= I_FREEING; - inode_lru_list_del(inode); +static inline void __invalidate_inode(struct inode *inode, bool kill_dirty, + struct list_head *dispose, int *busy) +{ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); 
- list_add(&inode->i_lru, &dispose); + return; + } - /* - * We can have a ton of inodes to evict at unmount time given - * enough memory, check to see if we need to go to sleep for a - * bit so we don't livelock. - */ - if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); - cond_resched(); - dispose_list(&dispose); - goto again; - } + if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { + spin_unlock(&inode->i_lock); + *busy = 1; + return; } - spin_unlock(&sb->s_inode_list_lock); - dispose_list(&dispose); + if (atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + *busy = 1; + return; + } + + inode->i_state |= I_FREEING; + inode_lru_list_del(inode); + spin_unlock(&inode->i_lock); + list_add(&inode->i_lru, dispose); } /** @@ -644,28 +679,9 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) LIST_HEAD(dispose); spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - spin_lock(&inode->i_lock); - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - spin_unlock(&inode->i_lock); - continue; - } - if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { - spin_unlock(&inode->i_lock); - busy = 1; - continue; - } - if (atomic_read(&inode->i_count)) { - spin_unlock(&inode->i_lock); - busy = 1; - continue; - } + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) + __invalidate_inode(inode, kill_dirty, &dispose, &busy); - inode->i_state |= I_FREEING; - inode_lru_list_del(inode); - spin_unlock(&inode->i_lock); - list_add(&inode->i_lru, &dispose); - } spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 741077d..ec52dcb 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -141,86 +141,98 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, return ret; } -/** - * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @sb: superblock being unmounted. 
- * - * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. - */ -void fsnotify_unmount_inodes(struct super_block *sb) +static inline void +__fsnotify_unmount_inode(spinlock_t *lock, struct inode *inode, + struct list_head *head, struct inode **pnext, + struct inode **need_iput) { - struct inode *inode, *next_i, *need_iput = NULL; - - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) { - struct inode *need_iput_tmp; + struct inode *need_iput_tmp; + struct inode *next_i = *pnext; - /* - * We cannot __iget() an inode in state I_FREEING, - * I_WILL_FREE, or I_NEW which is fine because by that point - * the inode cannot have any associated watches. - */ - spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { - spin_unlock(&inode->i_lock); - continue; - } + /* + * We cannot __iget() an inode in state I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + return; + } - /* - * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually - * evict all inodes with zero i_count from icache which is - * unnecessarily violent and may in fact be illegal to do. - */ - if (!atomic_read(&inode->i_count)) { - spin_unlock(&inode->i_lock); - continue; - } + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. 
+ */ + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + return; + } - need_iput_tmp = need_iput; - need_iput = NULL; + need_iput_tmp = *need_iput; + *need_iput = NULL; - /* In case fsnotify_inode_delete() drops a reference. */ - if (inode != need_iput_tmp) - __iget(inode); - else - need_iput_tmp = NULL; - spin_unlock(&inode->i_lock); + /* In case fsnotify_inode_delete() drops a reference. */ + if (inode != need_iput_tmp) + __iget(inode); + else + need_iput_tmp = NULL; + spin_unlock(&inode->i_lock); - /* In case the dropping of a reference would nuke next_i. */ - while (&next_i->i_sb_list != &sb->s_inodes) { - spin_lock(&next_i->i_lock); - if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && - atomic_read(&next_i->i_count)) { - __iget(next_i); - need_iput = next_i; - spin_unlock(&next_i->i_lock); - break; - } + /* In case the dropping of a reference would nuke next_i. */ + while (&next_i->i_sb_list != head) { + spin_lock(&next_i->i_lock); + if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && + atomic_read(&next_i->i_count)) { + __iget(next_i); + *need_iput = next_i; spin_unlock(&next_i->i_lock); - next_i = list_next_entry(next_i, i_sb_list); + break; } + spin_unlock(&next_i->i_lock); + next_i = list_next_entry(next_i, i_sb_list); + } + *pnext = next_i; - /* - * We can safely drop s_inode_list_lock here because either - * we actually hold references on both inode and next_i or - * end of list. Also no new inodes will be added since the - * umount has begun. - */ - spin_unlock(&sb->s_inode_list_lock); + /* + * We can safely drop pcpu_lock here because either + * we actually hold references on both inode and next_i or + * end of list. Also no new inodes will be added since the + * umount has begun. 
+ */ + spin_unlock(lock); - if (need_iput_tmp) - iput(need_iput_tmp); + if (need_iput_tmp) + iput(need_iput_tmp); - /* for each watch, send FS_UNMOUNT and then remove it */ - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - fsnotify_inode_delete(inode); + fsnotify_inode_delete(inode); - iput(inode); + iput(inode); + + spin_lock(lock); +} + +/** + * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @sb: superblock being unmounted. + * + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN + * block. + */ +void fsnotify_unmount_inodes(struct super_block *sb) +{ + struct inode *inode, *next_i, *need_iput = NULL; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) + __fsnotify_unmount_inode(&sb->s_inode_list_lock, inode, + &sb->s_inodes, &next_i, &need_iput); - spin_lock(&sb->s_inode_list_lock); - } spin_unlock(&sb->s_inode_list_lock); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 3c3b81b..143183b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -920,6 +920,42 @@ static int dqinit_needed(struct inode *inode, int type) return 0; } +static inline void +__add_dquot_ref(spinlock_t *lock, struct inode *inode, int type, +#ifdef CONFIG_QUOTA_DEBUG + int *reserved, +#endif + struct inode **old_inode) +{ + spin_lock(&inode->i_lock); + if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || + !atomic_read(&inode->i_writecount) || + !dqinit_needed(inode, type)) { + spin_unlock(&inode->i_lock); + return; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lock); + +#ifdef CONFIG_QUOTA_DEBUG + if (unlikely(inode_get_rsv_space(inode) > 0)) + *reserved = 1; +#endif + iput(*old_inode); + __dquot_initialize(inode, type); + + /* + * 
We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * pcpu_lock. We cannot iput the inode now as we can be + * holding the last reference and we cannot iput it under + * pcpu_lock. So we keep the reference and iput it later. + */ + *old_inode = inode; + spin_lock(lock); +} + /* This routine is guarded by dqonoff_mutex mutex */ static void add_dquot_ref(struct super_block *sb, int type) { @@ -929,36 +965,12 @@ static void add_dquot_ref(struct super_block *sb, int type) #endif spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - spin_lock(&inode->i_lock); - if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - !atomic_read(&inode->i_writecount) || - !dqinit_needed(inode, type)) { - spin_unlock(&inode->i_lock); - continue; - } - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) + __add_dquot_ref(&sb->s_inode_list_lock, inode, type, #ifdef CONFIG_QUOTA_DEBUG - if (unlikely(inode_get_rsv_space(inode) > 0)) - reserved = 1; + &reserved, #endif - iput(old_inode); - __dquot_initialize(inode, type); - - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock. We cannot iput the inode now as we can be - * holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. - */ - old_inode = inode; - spin_lock(&sb->s_inode_list_lock); - } + &old_inode); spin_unlock(&sb->s_inode_list_lock); iput(old_inode); @@ -1022,6 +1034,25 @@ static void put_dquot_list(struct list_head *tofree_head) } } +static inline void +__remove_dquot_ref(struct inode *inode, int type, + struct list_head *tofree_head, int *reserved) +{ + /* + * We have to scan also I_NEW inodes because they can already + * have quota pointer initialized. 
Luckily, we need to touch + * only quota pointers and these have separate locking + * (dq_data_lock). + */ + spin_lock(&dq_data_lock); + if (!IS_NOQUOTA(inode)) { + if (unlikely(inode_get_rsv_space(inode) > 0)) + *reserved = 1; + remove_inode_dquot_ref(inode, type, tofree_head); + } + spin_unlock(&dq_data_lock); +} + static void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head) { @@ -1029,21 +1060,9 @@ static void remove_dquot_ref(struct super_block *sb, int type, int reserved = 0; spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - /* - * We have to scan also I_NEW inodes because they can already - * have quota pointer initialized. Luckily, we need to touch - * only quota pointers and these have separate locking - * (dq_data_lock). - */ - spin_lock(&dq_data_lock); - if (!IS_NOQUOTA(inode)) { - if (unlikely(inode_get_rsv_space(inode) > 0)) - reserved = 1; - remove_inode_dquot_ref(inode, type, tofree_head); - } - spin_unlock(&dq_data_lock); - } + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) + __remove_dquot_ref(inode, type, tofree_head, &reserved); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html