To keep this thread alive and get some direction on the next steps, I made some changes that let me do **buffered reads** with fio on brd with a logical block size > 4k.

Along with your patches (this patch and the brd patches), I added the following diff:

diff --git a/fs/mpage.c b/fs/mpage.c
index 242e213ee064..2e0c066d72d3 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -161,7 +161,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	struct folio *folio = args->folio;
 	struct inode *inode = folio->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
-	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+	const unsigned blocks_per_page = folio_size(folio) >> blkbits;
 	const unsigned blocksize = 1 << blkbits;
 	struct buffer_head *map_bh = &args->map_bh;
 	sector_t block_in_file;
diff --git a/mm/readahead.c b/mm/readahead.c
index 47afbca1d122..2e42b5127f4c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -210,7 +210,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 	unsigned long index = readahead_index(ractl);
 	gfp_t gfp_mask = readahead_gfp_mask(mapping);
 	unsigned long i;
-
+	int order = 0;
 	/*
 	 * Partway through the readahead operation, we will have added
 	 * locked pages to the page cache, but will not yet have submitted
@@ -223,6 +223,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 	 */
 	unsigned int nofs = memalloc_nofs_save();

+	if (mapping->host->i_blkbits > PAGE_SHIFT)
+		order = mapping->host->i_blkbits - PAGE_SHIFT;
+
 	filemap_invalidate_lock_shared(mapping);
 	/*
 	 * Preallocate as many pages as we will need.
@@ -245,7 +248,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 			continue;
 		}

-		folio = filemap_alloc_folio(gfp_mask, 0);
+		folio = filemap_alloc_folio(gfp_mask, order);
 		if (!folio)
 			break;
 		if (filemap_add_folio(mapping, folio, index + i,
@@ -259,7 +262,7 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
 		if (i == nr_to_read - lookahead_size)
 			folio_set_readahead(folio);
 		ractl->_workingset |= folio_test_workingset(folio);
-		ractl->_nr_pages++;
+		ractl->_nr_pages += folio_nr_pages(folio);
 	}

And with that (drum roll):

root@debian:~# cat /sys/block/ram0/queue/logical_block_size
8192
root@debian:~# fio -bs=8k -iodepth=8 -rw=read -ioengine=io_uring -size=200M -name=io_uring_1 -filename=/dev/ram0
io_uring_1: (g=0): rw=read, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=io_uring, iodepth=8
fio-3.33
Starting 1 process
io_uring_1: (groupid=0, jobs=1): err= 0: pid=450: Thu Apr 20 11:34:10 2023
  read: IOPS=94.8k, BW=741MiB/s (777MB/s)(40.0MiB/54msec)
<snip>
Run status group 0 (all jobs):
   READ: bw=741MiB/s (777MB/s), 741MiB/s-741MiB/s (777MB/s-777MB/s), io=40.0MiB (41.9MB), run=54-54msec

Disk stats (read/write):
  ram0: ios=0/0, merge=0/0, ticks=0/0, in_queue=0, util=0.00%

**Questions on the future work**:

As willy pointed out, we have to do this `order = mapping->host->i_blkbits - PAGE_SHIFT` computation in many places. As a next step, should we pursue what willy suggested: encapsulating the order in mapping->flags? [1]

[1] https://lore.kernel.org/lkml/ZDty+PQfHkrGBojn@xxxxxxxxxxxxxxxxxxxx/