On Tue, Apr 06, 2021 at 10:33:43PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Because scalability of the global inode_hash_lock really, really
> sucks and prevents me from doing scalability characterisation and
> analysis of bcachefs algorithms.
> 
> Profiles of a 32-way concurrent create of 51.2m inodes with fsmark
> on a couple of different filesystems on a 5.10 kernel:
> 
> -   52.13%     0.04%  [kernel]  [k] ext4_create
>    - 52.09% ext4_create
>       - 41.03% __ext4_new_inode
>          - 29.92% insert_inode_locked
>             - 25.35% _raw_spin_lock
>                - do_raw_spin_lock
>                   - 24.97% __pv_queued_spin_lock_slowpath
> 
> 
> -   72.33%     0.02%  [kernel]  [k] do_filp_open
>    - 72.31% do_filp_open
>       - 72.28% path_openat
>          - 57.03% bch2_create
>             - 56.46% __bch2_create
>                - 40.43% inode_insert5
>                   - 36.07% _raw_spin_lock
>                      - do_raw_spin_lock
>                           35.86% __pv_queued_spin_lock_slowpath
>                     4.02% find_inode
> 
> btrfs was tested but it is limited by internal lock contention at
> >=2 threads on this workload, so never hammers the inode cache lock
> hard enough for this change to matter to its performance.
> 
> However, both bcachefs and ext4 demonstrate poor scaling at >=8
> threads on concurrent lookup or create workloads.
> 
> Hence convert the inode hash table to an RCU-aware hash-bl table just
> like the dentry cache. Note that we need to store a pointer to the
> hlist_bl_head the inode has been added to in the inode so that when
> it comes to unhash the inode we know what list to lock. We need to
> do this because, unlike the dentry cache, the hash value that is
> used to hash the inode is not generated from the inode itself. i.e.
> filesystems can provide this themselves so we have to either store
> the hashval or the hlist head pointer in the inode to be able to
> find the right list head for removal...
> 
> Concurrent create with varying thread count (files/s):
> 
>                  ext4                    bcachefs
>  threads         vanilla   patched       vanilla   patched
>  2               117k      112k          80k       85k
>  4               185k      190k          133k      145k
>  8               303k      346k          185k      255k
>  16              389k      465k          190k      420k
>  32              360k      437k          142k      481k
> 
> CPU usage for both bcachefs and ext4 at 16 and 32 threads has been
> halved on the patched kernel, while performance has increased
> marginally on ext4 and massively on bcachefs. Internal filesystem
> algorithms now limit performance on these workloads, not the global
> inode_hash_lock.
> 
> Profile of the workloads on the patched kernels:
> 
> -   35.94%     0.07%  [kernel]  [k] ext4_create
>    - 35.87% ext4_create
>       - 20.45% __ext4_new_inode
>         ...
>            3.36% insert_inode_locked
> 
> -   78.43% do_filp_open
>    - 78.36% path_openat
>       - 53.95% bch2_create
>          - 47.99% __bch2_create
>            ....
>             - 7.57% inode_insert5
>                  6.94% find_inode
> 
> Spinlock contention is largely gone from the inode hash operations
> and the filesystems are limited by contention in their internal
> algorithms.
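
The key structural change for anyone skimming this: the inode now
remembers which hash chain it is on (the new i_hash_head pointer), so
removal can take that chain's bit spinlock directly instead of a global
lock. Roughly, the pattern the patch implements looks like the sketch
below - the i_lock ordering, I_FREEING/I_CREATING handling and the
fake-hash case are elided, and the sketch_* names are mine, not the
patch's:

#include <linux/fs.h>
#include <linux/list_bl.h>
#include <linux/rculist_bl.h>

static void sketch_hash_insert(struct inode *inode, struct hlist_bl_head *b)
{
	hlist_bl_lock(b);			/* bit spinlock on this chain only */
	hlist_bl_add_head_rcu(&inode->i_hash, b);
	inode->i_hash_head = b;			/* remember which chain we are on */
	hlist_bl_unlock(b);
}

static void sketch_hash_remove(struct inode *inode)
{
	struct hlist_bl_head *b = inode->i_hash_head;

	while (b) {
		hlist_bl_lock(b);
		if (b != inode->i_hash_head) {
			/* Raced with an unhash/rehash; chase the new chain. */
			hlist_bl_unlock(b);
			b = inode->i_hash_head;
			continue;
		}
		hlist_bl_del_rcu(&inode->i_hash);
		inode->i_hash.pprev = NULL;	/* reads as unhashed from now on */
		inode->i_hash_head = NULL;
		hlist_bl_unlock(b);
		break;
	}
}

Since an hlist_bl_head is a single pointer whose low bit doubles as the
lock, the hash table array itself stays the same size as before; only
struct inode grows by one pointer.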
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>

Reviewed-and-tested-by: Kent Overstreet <kent.overstreet@xxxxxxxxx>

> ---
>  fs/inode.c         | 200 ++++++++++++++++++++++++++++-----------------
>  include/linux/fs.h |   9 +-
>  2 files changed, 132 insertions(+), 77 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index b8d9eb3454dc..867af386177b 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -57,8 +57,7 @@
>  
>  static unsigned int i_hash_mask __read_mostly;
>  static unsigned int i_hash_shift __read_mostly;
> -static struct hlist_head *inode_hashtable __read_mostly;
> -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
> +static struct hlist_bl_head *inode_hashtable __read_mostly;
>  
>  static unsigned long hash(struct super_block *sb, unsigned long hashval)
>  {
> @@ -70,7 +69,7 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
>  	return tmp & i_hash_mask;
>  }
>  
> -static inline struct hlist_head *i_hash_head(struct super_block *sb,
> +static inline struct hlist_bl_head *i_hash_head(struct super_block *sb,
>  		unsigned int hashval)
>  {
>  	return inode_hashtable + hash(sb, hashval);
> @@ -407,7 +406,7 @@ EXPORT_SYMBOL(address_space_init_once);
>  void inode_init_once(struct inode *inode)
>  {
>  	memset(inode, 0, sizeof(*inode));
> -	INIT_HLIST_NODE(&inode->i_hash);
> +	INIT_HLIST_BL_NODE(&inode->i_hash);
>  	INIT_LIST_HEAD(&inode->i_devices);
>  	INIT_LIST_HEAD(&inode->i_io_list);
>  	INIT_LIST_HEAD(&inode->i_wb_list);
> @@ -491,6 +490,17 @@ static inline void inode_sb_list_del(struct inode *inode)
>  	}
>  }
>  
> +/*
> + * Ensure that we store the hash head in the inode when we insert the inode into
> + * the hlist_bl_head...
> + */
> +static inline void
> +__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b)
> +{
> +	hlist_bl_add_head_rcu(&inode->i_hash, b);
> +	inode->i_hash_head = b;
> +}
> +
>  /**
>   * __insert_inode_hash - hash an inode
>   * @inode: unhashed inode
> @@ -501,13 +511,13 @@ static inline void inode_sb_list_del(struct inode *inode)
>   */
>  void __insert_inode_hash(struct inode *inode, unsigned long hashval)
>  {
> -	struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
>  
> -	spin_lock(&inode_hash_lock);
> +	hlist_bl_lock(b);
>  	spin_lock(&inode->i_lock);
> -	hlist_add_head_rcu(&inode->i_hash, b);
> +	__insert_inode_hash_head(inode, b);
>  	spin_unlock(&inode->i_lock);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  }
>  EXPORT_SYMBOL(__insert_inode_hash);
>  
> @@ -519,11 +529,44 @@ EXPORT_SYMBOL(__insert_inode_hash);
>   */
>  void __remove_inode_hash(struct inode *inode)
>  {
> -	spin_lock(&inode_hash_lock);
> -	spin_lock(&inode->i_lock);
> -	hlist_del_init_rcu(&inode->i_hash);
> -	spin_unlock(&inode->i_lock);
> -	spin_unlock(&inode_hash_lock);
> +	struct hlist_bl_head *b = inode->i_hash_head;
> +
> +	/*
> +	 * There are some callers that come through here without synchronisation
> +	 * and potentially with multiple references to the inode. Hence we have
> +	 * to handle the case that we might race with a remove and insert to a
> +	 * different list. Coda, in particular, seems to have a userspace API
> +	 * that can directly trigger "unhash/rehash to different list" behaviour
> +	 * without any serialisation at all.
> +	 *
> +	 * Hence we have to handle the situation where the inode->i_hash_head
> +	 * might point to a different list than what we expect, indicating that
> +	 * we raced with another unhash and potentially a new insertion. This
> +	 * means we have to retest the head once we have everything locked up
> +	 * and loop again if it doesn't match.
> +	 */
> +	while (b) {
> +		hlist_bl_lock(b);
> +		spin_lock(&inode->i_lock);
> +		if (b != inode->i_hash_head) {
> +			hlist_bl_unlock(b);
> +			b = inode->i_hash_head;
> +			spin_unlock(&inode->i_lock);
> +			continue;
> +		}
> +		/*
> +		 * Need to set the pprev pointer to NULL after list removal so
> +		 * that both RCU traversals and hlist_bl_unhashed() work
> +		 * correctly at this point.
> +		 */
> +		hlist_bl_del_rcu(&inode->i_hash);
> +		inode->i_hash.pprev = NULL;
> +		inode->i_hash_head = NULL;
> +		spin_unlock(&inode->i_lock);
> +		hlist_bl_unlock(b);
> +		break;
> +	}
> +
>  }
>  EXPORT_SYMBOL(__remove_inode_hash);
>  
> @@ -813,26 +856,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
>  	return freed;
>  }
>  
> -static void __wait_on_freeing_inode(struct inode *inode);
> +static void __wait_on_freeing_inode(struct hlist_bl_head *b,
> +					struct inode *inode);
>  /*
>   * Called with the inode lock held.
>   */
>  static struct inode *find_inode(struct super_block *sb,
> -				struct hlist_head *head,
> +				struct hlist_bl_head *b,
>  				int (*test)(struct inode *, void *),
>  				void *data)
>  {
> +	struct hlist_bl_node *node;
>  	struct inode *inode = NULL;
>  
>  repeat:
> -	hlist_for_each_entry(inode, head, i_hash) {
> +	hlist_bl_for_each_entry(inode, node, b, i_hash) {
>  		if (inode->i_sb != sb)
>  			continue;
>  		if (!test(inode, data))
>  			continue;
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
> -			__wait_on_freeing_inode(inode);
> +			__wait_on_freeing_inode(b, inode);
>  			goto repeat;
>  		}
>  		if (unlikely(inode->i_state & I_CREATING)) {
> @@ -851,19 +896,20 @@ static struct inode *find_inode(struct super_block *sb,
>   * iget_locked for details.
>   */
>  static struct inode *find_inode_fast(struct super_block *sb,
> -				struct hlist_head *head, unsigned long ino)
> +				struct hlist_bl_head *b, unsigned long ino)
>  {
> +	struct hlist_bl_node *node;
>  	struct inode *inode = NULL;
>  
>  repeat:
> -	hlist_for_each_entry(inode, head, i_hash) {
> +	hlist_bl_for_each_entry(inode, node, b, i_hash) {
>  		if (inode->i_ino != ino)
>  			continue;
>  		if (inode->i_sb != sb)
>  			continue;
>  		spin_lock(&inode->i_lock);
>  		if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
> -			__wait_on_freeing_inode(inode);
> +			__wait_on_freeing_inode(b, inode);
>  			goto repeat;
>  		}
>  		if (unlikely(inode->i_state & I_CREATING)) {
> @@ -1072,26 +1118,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
>   * return it locked, hashed, and with the I_NEW flag set. The file system gets
>   * to fill it in before unlocking it via unlock_new_inode().
>   *
> - * Note both @test and @set are called with the inode_hash_lock held, so can't
> - * sleep.
> + * Note both @test and @set are called with the inode hash chain lock held,
> + * so can't sleep.
>   */
>  struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
>  			    int (*test)(struct inode *, void *),
>  			    int (*set)(struct inode *, void *), void *data)
>  {
> -	struct hlist_head *head = i_hash_head(inode->i_sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
>  	struct inode *old;
>  	bool creating = inode->i_state & I_CREATING;
>  
> again:
> -	spin_lock(&inode_hash_lock);
> -	old = find_inode(inode->i_sb, head, test, data);
> +	hlist_bl_lock(b);
> +	old = find_inode(inode->i_sb, b, test, data);
>  	if (unlikely(old)) {
>  		/*
>  		 * Uhhuh, somebody else created the same inode under us.
>  		 * Use the old inode instead of the preallocated one.
>  		 */
> -		spin_unlock(&inode_hash_lock);
> +		hlist_bl_unlock(b);
>  		if (IS_ERR(old))
>  			return NULL;
>  		wait_on_inode(old);
> @@ -1113,12 +1159,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
>  	 */
>  	spin_lock(&inode->i_lock);
>  	inode->i_state |= I_NEW;
> -	hlist_add_head_rcu(&inode->i_hash, head);
> +	__insert_inode_hash_head(inode, b);
>  	spin_unlock(&inode->i_lock);
>  	if (!creating)
>  		inode_sb_list_add(inode);
> unlock:
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  
>  	return inode;
>  }
> @@ -1179,12 +1225,12 @@ EXPORT_SYMBOL(iget5_locked);
>   */
>  struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  {
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
>  	struct inode *inode;
> again:
> -	spin_lock(&inode_hash_lock);
> -	inode = find_inode_fast(sb, head, ino);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_lock(b);
> +	inode = find_inode_fast(sb, b, ino);
> +	hlist_bl_unlock(b);
>  	if (inode) {
>  		if (IS_ERR(inode))
>  			return NULL;
> @@ -1200,17 +1246,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  	if (inode) {
>  		struct inode *old;
>  
> -		spin_lock(&inode_hash_lock);
> +		hlist_bl_lock(b);
>  		/* We released the lock, so.. */
> -		old = find_inode_fast(sb, head, ino);
> +		old = find_inode_fast(sb, b, ino);
>  		if (!old) {
>  			inode->i_ino = ino;
>  			spin_lock(&inode->i_lock);
>  			inode->i_state = I_NEW;
> -			hlist_add_head_rcu(&inode->i_hash, head);
> +			__insert_inode_hash_head(inode, b);
>  			spin_unlock(&inode->i_lock);
>  			inode_sb_list_add(inode);
> -			spin_unlock(&inode_hash_lock);
> +			hlist_bl_unlock(b);
>  
>  			/* Return the locked inode with I_NEW set, the
>  			 * caller is responsible for filling in the contents
> @@ -1223,7 +1269,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  		 * us. Use the old inode instead of the one we just
>  		 * allocated.
>  		 */
> -		spin_unlock(&inode_hash_lock);
> +		hlist_bl_unlock(b);
>  		destroy_inode(inode);
>  		if (IS_ERR(old))
>  			return NULL;
> @@ -1247,10 +1293,11 @@ EXPORT_SYMBOL(iget_locked);
>   */
>  static int test_inode_iunique(struct super_block *sb, unsigned long ino)
>  {
> -	struct hlist_head *b = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
> +	struct hlist_bl_node *node;
>  	struct inode *inode;
>  
> -	hlist_for_each_entry_rcu(inode, b, i_hash) {
> +	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
>  		if (inode->i_ino == ino && inode->i_sb == sb)
>  			return 0;
>  	}
> @@ -1334,12 +1381,12 @@ EXPORT_SYMBOL(igrab);
>  struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
>  		int (*test)(struct inode *, void *), void *data)
>  {
> -	struct hlist_head *head = i_hash_head(sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(sb, hashval);
>  	struct inode *inode;
>  
> -	spin_lock(&inode_hash_lock);
> -	inode = find_inode(sb, head, test, data);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_lock(b);
> +	inode = find_inode(sb, b, test, data);
> +	hlist_bl_unlock(b);
>  
>  	return IS_ERR(inode) ? NULL : inode;
>  }
> @@ -1389,12 +1436,12 @@ EXPORT_SYMBOL(ilookup5);
>   */
>  struct inode *ilookup(struct super_block *sb, unsigned long ino)
>  {
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
>  	struct inode *inode;
> again:
> -	spin_lock(&inode_hash_lock);
> -	inode = find_inode_fast(sb, head, ino);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_lock(b);
> +	inode = find_inode_fast(sb, b, ino);
> +	hlist_bl_unlock(b);
>  
>  	if (inode) {
>  		if (IS_ERR(inode))
> @@ -1438,12 +1485,13 @@ struct inode *find_inode_nowait(struct super_block *sb,
>  					     void *),
>  				void *data)
>  {
> -	struct hlist_head *head = i_hash_head(sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(sb, hashval);
> +	struct hlist_bl_node *node;
>  	struct inode *inode, *ret_inode = NULL;
>  	int mval;
>  
> -	spin_lock(&inode_hash_lock);
> -	hlist_for_each_entry(inode, head, i_hash) {
> +	hlist_bl_lock(b);
> +	hlist_bl_for_each_entry(inode, node, b, i_hash) {
>  		if (inode->i_sb != sb)
>  			continue;
>  		mval = match(inode, hashval, data);
> @@ -1454,7 +1502,7 @@ struct inode *find_inode_nowait(struct super_block *sb,
>  			goto out;
>  	}
> out:
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  	return ret_inode;
>  }
>  EXPORT_SYMBOL(find_inode_nowait);
> @@ -1483,13 +1531,14 @@ EXPORT_SYMBOL(find_inode_nowait);
>  struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
>  			     int (*test)(struct inode *, void *), void *data)
>  {
> -	struct hlist_head *head = i_hash_head(sb, hashval);
> +	struct hlist_bl_head *b = i_hash_head(sb, hashval);
> +	struct hlist_bl_node *node;
>  	struct inode *inode;
>  
>  	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
>  			 "suspicious find_inode_rcu() usage");
>  
> -	hlist_for_each_entry_rcu(inode, head, i_hash) {
> +	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
>  		if (inode->i_sb == sb &&
>  		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
>  		    test(inode, data))
> @@ -1521,13 +1570,14 @@ EXPORT_SYMBOL(find_inode_rcu);
>  struct inode *find_inode_by_ino_rcu(struct super_block *sb,
>  				    unsigned long ino)
>  {
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
> +	struct hlist_bl_node *node;
>  	struct inode *inode;
>  
>  	RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
>  			 "suspicious find_inode_by_ino_rcu() usage");
>  
> -	hlist_for_each_entry_rcu(inode, head, i_hash) {
> +	hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
>  		if (inode->i_ino == ino &&
>  		    inode->i_sb == sb &&
>  		    !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
> @@ -1541,39 +1591,42 @@ int insert_inode_locked(struct inode *inode)
>  {
>  	struct super_block *sb = inode->i_sb;
>  	ino_t ino = inode->i_ino;
> -	struct hlist_head *head = i_hash_head(sb, ino);
> +	struct hlist_bl_head *b = i_hash_head(sb, ino);
>  
>  	while (1) {
> -		struct inode *old = NULL;
> -		spin_lock(&inode_hash_lock);
> -		hlist_for_each_entry(old, head, i_hash) {
> -			if (old->i_ino != ino)
> +		struct hlist_bl_node *node;
> +		struct inode *old = NULL, *t;
> +
> +		hlist_bl_lock(b);
> +		hlist_bl_for_each_entry(t, node, b, i_hash) {
> +			if (t->i_ino != ino)
>  				continue;
> -			if (old->i_sb != sb)
> +			if (t->i_sb != sb)
>  				continue;
> -			spin_lock(&old->i_lock);
> -			if (old->i_state & (I_FREEING|I_WILL_FREE)) {
> -				spin_unlock(&old->i_lock);
> +			spin_lock(&t->i_lock);
> +			if (t->i_state & (I_FREEING|I_WILL_FREE)) {
> +				spin_unlock(&t->i_lock);
>  				continue;
>  			}
> +			old = t;
>  			break;
>  		}
>  		if (likely(!old)) {
>  			spin_lock(&inode->i_lock);
>  			inode->i_state |= I_NEW | I_CREATING;
> -			hlist_add_head_rcu(&inode->i_hash, head);
> +			__insert_inode_hash_head(inode, b);
>  			spin_unlock(&inode->i_lock);
> -			spin_unlock(&inode_hash_lock);
> +			hlist_bl_unlock(b);
>  			return 0;
>  		}
>  		if (unlikely(old->i_state & I_CREATING)) {
>  			spin_unlock(&old->i_lock);
> -			spin_unlock(&inode_hash_lock);
> +			hlist_bl_unlock(b);
>  			return -EBUSY;
>  		}
>  		__iget(old);
>  		spin_unlock(&old->i_lock);
> -		spin_unlock(&inode_hash_lock);
> +		hlist_bl_unlock(b);
>  		wait_on_inode(old);
>  		if (unlikely(!inode_unhashed(old))) {
>  			iput(old);
> @@ -2046,17 +2099,18 @@ EXPORT_SYMBOL(inode_needs_sync);
>   * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
>   * will DTRT.
>   */
> -static void __wait_on_freeing_inode(struct inode *inode)
> +static void __wait_on_freeing_inode(struct hlist_bl_head *b,
> +					struct inode *inode)
>  {
>  	wait_queue_head_t *wq;
>  	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
>  	wq = bit_waitqueue(&inode->i_state, __I_NEW);
>  	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
>  	spin_unlock(&inode->i_lock);
> -	spin_unlock(&inode_hash_lock);
> +	hlist_bl_unlock(b);
>  	schedule();
>  	finish_wait(wq, &wait.wq_entry);
> -	spin_lock(&inode_hash_lock);
> +	hlist_bl_lock(b);
>  }
>  
>  static __initdata unsigned long ihash_entries;
> @@ -2082,7 +2136,7 @@ void __init inode_init_early(void)
>  
>  	inode_hashtable =
>  		alloc_large_system_hash("Inode-cache",
> -					sizeof(struct hlist_head),
> +					sizeof(struct hlist_bl_head),
>  					ihash_entries,
>  					14,
>  					HASH_EARLY | HASH_ZERO,
> @@ -2108,7 +2162,7 @@ void __init inode_init(void)
>  
>  	inode_hashtable =
>  		alloc_large_system_hash("Inode-cache",
> -					sizeof(struct hlist_head),
> +					sizeof(struct hlist_bl_head),
>  					ihash_entries,
>  					14,
>  					HASH_ZERO,
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index ec8f3ddf4a6a..89c9e49a71a6 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -664,7 +664,8 @@ struct inode {
>  	unsigned long		dirtied_when;	/* jiffies of first dirtying */
>  	unsigned long		dirtied_time_when;
>  
> -	struct hlist_node	i_hash;
> +	struct hlist_bl_node	i_hash;
> +	struct hlist_bl_head	*i_hash_head;
>  	struct list_head	i_io_list;	/* backing dev IO list */
>  #ifdef CONFIG_CGROUP_WRITEBACK
>  	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */
> @@ -730,7 +731,7 @@ static inline unsigned int i_blocksize(const struct inode *node)
>  
>  static inline int inode_unhashed(struct inode *inode)
>  {
> -	return hlist_unhashed(&inode->i_hash);
> +	return hlist_bl_unhashed(&inode->i_hash);
>  }
>  
>  /*
> @@ -741,7 +742,7 @@ static inline int inode_unhashed(struct inode *inode)
>   */
>  static inline void inode_fake_hash(struct inode *inode)
>  {
> -	hlist_add_fake(&inode->i_hash);
> +	hlist_bl_add_fake(&inode->i_hash);
>  }
>  
>  /*
> @@ -3065,7 +3066,7 @@ static inline void insert_inode_hash(struct inode *inode)
>  extern void __remove_inode_hash(struct inode *);
>  static inline void remove_inode_hash(struct inode *inode)
>  {
> -	if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash))
> +	if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash))
>  		__remove_inode_hash(inode);
>  }
>  
> -- 
> 2.31.0
> 
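
Worth noting for other reviewers: nothing filesystem-facing changes
here. iget_locked(), iget5_locked(), insert_inode_locked(), ilookup()
and friends keep their prototypes; only the locking behind them moves
from the global inode_hash_lock to the per-chain bit locks. A typical
caller still looks like this (illustrative sketch only; examplefs_iget()
is a made-up name, not from any real filesystem):

#include <linux/err.h>
#include <linux/fs.h>

static struct inode *examplefs_iget(struct super_block *sb, unsigned long ino)
{
	struct inode *inode = iget_locked(sb, ino);

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;	/* already cached and initialised */

	/* ... read the on-disk inode and fill in @inode here ... */

	unlock_new_inode(inode);
	return inode;
}

That is why the patch only touches fs/inode.c and include/linux/fs.h,
and why the gains show up for ext4 and bcachefs without any filesystem
changes.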