On Mon, Jun 26, 2023 at 03:04:53PM -0300, Marcelo Tosatti wrote:
> Upon closer investigation, it was found that in current codebase, lookup_bh_lru
> is slower than __find_get_block_slow:
>
>  114 ns per __find_get_block
>   68 ns per __find_get_block_slow
>
> So remove the per-CPU buffer_head caching.
>
> Test program:
>
> #define NRLOOPS 200000
> static int __init example_init(void)
> {
>         ktime_t s, e;
>         s64 delta;
>         int i, suc;
>
>         bdev = blkdev_get_by_path("/dev/loop0", FMODE_READ, NULL);
>         if (IS_ERR(bdev)) {
>                 printk(KERN_ERR "failed to load /dev/loop0\n");
>                 return -ENODEV;
>         }
>
>         suc = 0;
>         delta = 0;
>         for (i=0; i < NRLOOPS; i++) {
>                 struct buffer_head *bh;
>
>                 s = ktime_get();
>                 bh = __find_get_block(bdev, 1, 512);
>                 e = ktime_get();
>                 if (bh) {
>                         suc++;
>                         __brelse(bh);
>                 }
>                 delta = delta + ktime_to_ns(ktime_sub(e, s));
>
>         }
>         printk(KERN_ERR "%lld ns per __find_get_block (suc=%d)\n", delta/NRLOOPS, suc);
>
>         suc = 0;
>         delta = 0;
>         for (i=0; i < NRLOOPS; i++) {
>                 struct buffer_head *bh;
>
>                 s = ktime_get();
>                 bh = __find_get_block_slow(bdev, 1);
>                 e = ktime_get();
>                 if (bh) {
>                         suc++;
>                         __brelse(bh);
>                 }
>                 delta = delta + ktime_to_ns(ktime_sub(e, s));
>         }
>         printk(KERN_ERR "%lld ns per __find_get_block_slow (suc=%d)\n", delta/NRLOOPS, suc);

It occurs to me that this is close to being the best-case scenario for
page-cache lookup as well as for lru lookup.  Can you re-run it with
block 4UL * 1024 * 1024 * 1024 instead of block 1?
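
For concreteness, a rough (untested) sketch of the re-run I have in mind is
below.  It is just the test module quoted above with the block number
changed; the __find_get_block_slow loop is trimmed for brevity and would get
the same change, and example_rerun_init/blocknr are only illustrative names:

#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/ktime.h>

#define NRLOOPS 200000

static int __init example_rerun_init(void)
{
        struct block_device *bdev;
        struct buffer_head *bh;
        ktime_t s, e;
        s64 delta = 0;
        int i, suc = 0;
        /*
         * Block number from the suggestion above.  At the 512-byte block
         * size used here that starts ~2TB into the device, so the loop
         * device must be at least that big (and the block cached) for suc
         * to be non-zero; a miss still exercises the lookup path.
         */
        sector_t blocknr = 4UL * 1024 * 1024 * 1024;

        bdev = blkdev_get_by_path("/dev/loop0", FMODE_READ, NULL);
        if (IS_ERR(bdev)) {
                printk(KERN_ERR "failed to load /dev/loop0\n");
                return -ENODEV;
        }

        for (i = 0; i < NRLOOPS; i++) {
                s = ktime_get();
                bh = __find_get_block(bdev, blocknr, 512);
                e = ktime_get();
                if (bh) {
                        suc++;
                        __brelse(bh);
                }
                delta += ktime_to_ns(ktime_sub(e, s));
        }
        printk(KERN_ERR "%lld ns per __find_get_block (suc=%d)\n",
               delta / NRLOOPS, suc);

        return 0;
}
module_init(example_rerun_init);
MODULE_LICENSE("GPL");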