On Fri, 11 Jul 2014, Andi Kleen wrote: > Date: Fri, 11 Jul 2014 13:23:39 -0700 > From: Andi Kleen <andi@xxxxxxxxxxxxxx> > To: linux-fsdevel@xxxxxxxxxxxxxxx > Cc: linux-kernel@xxxxxxxxxxxxxxx, Andi Kleen <ak@xxxxxxxxxxxxxxx>, > Al Viro <viro@xxxxxxxxxxxxxxxxxx>, Miklos Szeredi <mszeredi@xxxxxxx> > Subject: [PATCH] vfs: Turn the unlinked inode counter into a percpu counter > > From: Andi Kleen <ak@xxxxxxxxxxxxxxx> > > The unlinked open inodes super block counter that was added some time ago > unfortunately becomes a very hot cache line in some delete heavy > high IOPs or tmpfs workloads with enough cores, when files > are frequently removed. With TSX it also causes plenty of aborts. > > Turn the atomic into a percpu_counter. It's nearly perfectly > suited for a percpu counter because it is only checked > in very slow paths, like remount. > > Original commit: > > commit 7ada4db88634429f4da690ad1c4eb73c93085f0c > Author: Miklos Szeredi <mszeredi@xxxxxxx> > Date: Mon Nov 21 12:11:32 2011 +0100 > > vfs: count unlinked inodes > > Add a new counter to the superblock that keeps track of unlinked but > not yet deleted inodes. > > Cc: Al Viro <viro@xxxxxxxxxxxxxxxxxx> > Cc: Miklos Szeredi <mszeredi@xxxxxxx> > Signed-off-by: Andi Kleen <ak@xxxxxxxxxxxxxxx> > --- > fs/ext2/super.c | 2 +- > fs/inode.c | 14 ++++++-------- > fs/namespace.c | 4 ++-- > fs/super.c | 3 +++ > include/linux/fs.h | 2 +- > 5 files changed, 13 insertions(+), 12 deletions(-) > > diff --git a/fs/ext2/super.c b/fs/ext2/super.c > index 3750031..ac0ad9a 100644 > --- a/fs/ext2/super.c > +++ b/fs/ext2/super.c > @@ -1218,7 +1218,7 @@ static int ext2_freeze(struct super_block *sb) > * because we have unattached inodes and thus filesystem is not fully > * consistent. > */ > - if (atomic_long_read(&sb->s_remove_count)) { > + if (percpu_counter_sum(&sb->s_remove_counters)) { > ext2_sync_fs(sb, 1); > return 0; > } > diff --git a/fs/inode.c b/fs/inode.c > index f96d2a6..4820c35 100644 > --- a/fs/inode.c > +++ b/fs/inode.c > @@ -234,10 +234,8 @@ void __destroy_inode(struct inode *inode) > BUG_ON(inode_has_buffers(inode)); > security_inode_free(inode); > fsnotify_inode_delete(inode); > - if (!inode->i_nlink) { > - WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); This is not mentioned in the description, but I guess it makes sense since it'd be quite expensive. -Lukas > - atomic_long_dec(&inode->i_sb->s_remove_count); > - } > + if (!inode->i_nlink) > + percpu_counter_dec(&inode->i_sb->s_remove_counters); > > #ifdef CONFIG_FS_POSIX_ACL > if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED) > @@ -281,7 +279,7 @@ void drop_nlink(struct inode *inode) > WARN_ON(inode->i_nlink == 0); > inode->__i_nlink--; > if (!inode->i_nlink) > - atomic_long_inc(&inode->i_sb->s_remove_count); > + percpu_counter_inc(&inode->i_sb->s_remove_counters); > } > EXPORT_SYMBOL(drop_nlink); > > @@ -297,7 +295,7 @@ void clear_nlink(struct inode *inode) > { > if (inode->i_nlink) { > inode->__i_nlink = 0; > - atomic_long_inc(&inode->i_sb->s_remove_count); > + percpu_counter_inc(&inode->i_sb->s_remove_counters); > } > } > EXPORT_SYMBOL(clear_nlink); > @@ -317,7 +315,7 @@ void set_nlink(struct inode *inode, unsigned int nlink) > } else { > /* Yes, some filesystems do change nlink from zero to one */ > if (inode->i_nlink == 0) > - atomic_long_dec(&inode->i_sb->s_remove_count); > + percpu_counter_dec(&inode->i_sb->s_remove_counters); > > inode->__i_nlink = nlink; > } > @@ -336,7 +334,7 @@ void inc_nlink(struct inode *inode) > { > if (unlikely(inode->i_nlink == 0)) { > WARN_ON(!(inode->i_state & I_LINKABLE)); > - atomic_long_dec(&inode->i_sb->s_remove_count); > + percpu_counter_dec(&inode->i_sb->s_remove_counters); > } > > inode->__i_nlink++; > diff --git a/fs/namespace.c b/fs/namespace.c > index 182bc41..20d33d9 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -535,7 +535,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) > int err = 0; > > /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ > - if (atomic_long_read(&sb->s_remove_count)) > + if (percpu_counter_sum(&sb->s_remove_counters) != 0) > return -EBUSY; > > lock_mount_hash(); > @@ -549,7 +549,7 @@ int sb_prepare_remount_readonly(struct super_block *sb) > } > } > } > - if (!err && atomic_long_read(&sb->s_remove_count)) > + if (!err && percpu_counter_sum(&sb->s_remove_counters) != 0) > err = -EBUSY; > > if (!err) { > diff --git a/fs/super.c b/fs/super.c > index 48377f7..b7a8a45 100644 > --- a/fs/super.c > +++ b/fs/super.c > @@ -142,6 +142,7 @@ static void destroy_super(struct super_block *s) > list_lru_destroy(&s->s_inode_lru); > for (i = 0; i < SB_FREEZE_LEVELS; i++) > percpu_counter_destroy(&s->s_writers.counter[i]); > + percpu_counter_destroy(&s->s_remove_counters); > security_sb_free(s); > WARN_ON(!list_empty(&s->s_mounts)); > kfree(s->s_subtype); > @@ -171,6 +172,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) > if (security_sb_alloc(s)) > goto fail; > > + if (percpu_counter_init(&s->s_remove_counters, 0) < 0) > + goto fail; > for (i = 0; i < SB_FREEZE_LEVELS; i++) { > if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) > goto fail; > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 8780312..0374552 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1240,7 +1240,7 @@ struct super_block { > struct shrinker s_shrink; /* per-sb shrinker handle */ > > /* Number of inodes with nlink == 0 but still referenced */ > - atomic_long_t s_remove_count; > + struct percpu_counter s_remove_counters; > > /* Being remounted read-only */ > int s_readonly_remount; > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html