Stock kernel scales worse than FreeBSD when doing a 20-way stat(2) on the same tmpfs-backed file. According to perf top: 38.09% lockref_put_return 26.08% lockref_get_not_dead 25.60% __d_lookup_rcu 0.89% clear_bhb_loop __d_lookup_rcu is participating in cacheline ping pong due to the embedded name sharing a cacheline with lockref. Moving it out resolves the problem: 41.50% lockref_put_return 41.03% lockref_get_not_dead 1.54% clear_bhb_loop benchmark (will-it-scale, Sapphire Rapids, tmpfs, ops/s): FreeBSD:7219334 before: 5038006 after: 7842883 (+55%) One minor remark: the 'after' result is unstable, fluctuating in the range ~7.8 mln to ~9 mln during different runs. Signed-off-by: Mateusz Guzik <mjguzik@xxxxxxxxx> --- include/linux/dcache.h | 7 ++++++- lib/lockref.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bf53e3894aae..326dbccc3736 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -89,13 +89,18 @@ struct dentry { struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ + /* --- cacheline 1 boundary (64 bytes) was 32 bytes ago --- */ /* Ref lookup also touches following */ - struct lockref d_lockref; /* per-dentry lock and refcount */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */ + /* --- cacheline 2 boundary (128 bytes) --- */ + struct lockref d_lockref; /* per-dentry lock and refcount + * keep separate from RCU lookup area if + * possible! + */ union { struct list_head d_lru; /* LRU list */ diff --git a/lib/lockref.c b/lib/lockref.c index 596b521bc1f1..c1e2736a7bac 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -45,7 +45,7 @@ static inline bool lockref_trywait_unlocked(struct lockref *lockref) { struct lockref old; - int retry = 100; + int retry = 256; for (;;) { cpu_relax(); -- 2.43.0