On Wed, Jun 14, 2023 at 05:06:14PM +0200, Hannes Reinecke wrote: > On 6/14/23 15:53, Matthew Wilcox wrote: > > On Wed, Jun 14, 2023 at 03:17:25PM +0200, Hannes Reinecke wrote: > > > Turns out that was quite easy to fix (just remove the check in > > > set_blocksize()), but now I get this: > > > > > > SGI XFS with ACLs, security attributes, quota, no debug enabled > > > XFS (ram0): File system with blocksize 16384 bytes. Only pagesize (4096) or > > > less will currently work. > > > > What happens if you just remove this hunk: > > > > +++ b/fs/xfs/xfs_super.c > > @@ -1583,18 +1583,6 @@ xfs_fs_fill_super( > > goto out_free_sb; > > } > > > > - /* > > - * Until this is fixed only page-sized or smaller data blocks work. > > - */ > > - if (mp->m_sb.sb_blocksize > PAGE_SIZE) { > > - xfs_warn(mp, > > - "File system with blocksize %d bytes. " > > - "Only pagesize (%ld) or less will currently work.", > > - mp->m_sb.sb_blocksize, PAGE_SIZE); > > - error = -ENOSYS; > > - goto out_free_sb; > > - } > > - > > /* Ensure this filesystem fits in the page cache limits */ > > if (xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_dblocks) || > > xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_rblocks)) { > > Whee! That works! > > Rebased things with your memcpy_{to,from}_folio() patches, disabled that > chunk, and: > > # mount /dev/ram0 /mnt What is the output of mkfs.xfs? > XFS (ram0): Mounting V5 Filesystem 5cd71ab5-2d11-4c18-97dd-71708f40e551 > XFS (ram0): Ending clean mount > xfs filesystem being mounted at /mnt supports timestamps until 2038-01-19 > (0x7fffffff) > # umount /mnt > XFS (ram0): Unmounting Filesystem 5cd71ab5-2d11-4c18-97dd-71708f40e551 Nope. Not here. Debug kernel builds assert fail at mount time with: XFS: Assertion failed: PAGE_SHIFT >= sbp->sb_blocklog, file: fs/xfs/xfs_mount.c, line: 133 Because we do a check to ensure that the entire filesystem address range can be indexed by the page cache. I suspect this is actually a stale, left over check from the days we used the page cache for indexing cached metadata, but with that sorted.... It fails here (8GB ram disk): #mkfs.xfs -f -b size=64k /dev/ram0 meta-data=/dev/ram0 isize=512 agcount=4, agsize=32000 blks = sectsz=512 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=0 = reflink=1 bigtime=1 inobtcount=1 nrext64=0 data = bsize=65536 blocks=128000, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=65536 ascii-ci=0, ftype=1 log =internal log bsize=65536 blocks=1024, version=2 = sectsz=512 sunit=0 blks, lazy-count=1 realtime =none extsz=65536 blocks=0, rtextents=0 # mount /dev/ram0 /mnt/test [ 34.047433] XFS (ram0): Mounting V5 Filesystem 074579ae-9c33-447a-a336-8ea99cda87c3 [ 34.053962] BUG: Bad rss-counter state mm:00000000b41e2cf6 type:MM_FILEPAGES val:11 [ 34.054451] general protection fault, probably for non-canonical address 0x4002888237d00000: 0000 [#1] PREEMPT SMP [ 34.057426] psi: task underflow! cpu=8 t=2 tasks=[0 0 0 0] clear=4 set=0 [ 34.065011] CPU: 2 PID: 3689 Comm: mount Not tainted 6.4.0-rc6-dgc+ #1832 [ 34.068647] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 34.073236] RIP: 0010:__submit_bio+0x9e/0x110 [ 34.075124] Code: c3 e8 d6 c5 88 ff 48 8b 43 08 48 89 df 4c 8b 60 10 49 8b 44 24 60 ff 10 49 8b 5c 24 68 e8 3a 92 88 ff 48 8b 43 10 a8 03 75 56 <65> 48 8 [ 34.084879] RSP: 0018:ffffc900045c3a10 EFLAGS: 00010246 [ 34.087501] RAX: 4003000000000000 RBX: ffff8885c1568000 RCX: 0000000000000080 [ 34.090455] RDX: ffff888805d8a900 RSI: 0000000000000286 RDI: ffffc900045c3ad8 [ 34.093419] RBP: ffffc900045c3a20 R08: 0000000000000000 R09: 0000000000000000 [ 34.096381] R10: 0000000000000000 R11: 0000000000000000 R12: ffff88880124b000 [ 34.099340] R13: ffff8885c1568000 R14: ffff888100620000 R15: 0000000000fa0000 [ 34.102285] FS: 00007f1b86428840(0000) GS:ffff888237d00000(0000) knlGS:0000000000000000 [ 34.105410] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 34.107577] CR2: 00007f1b86364000 CR3: 000000010482d003 CR4: 0000000000060ee0 [ 34.110114] Call Trace: [ 34.111031] <TASK> [ 34.111782] ? show_regs+0x61/0x70 [ 34.112945] ? die_addr+0x37/0x90 [ 34.114080] ? exc_general_protection+0x19e/0x3b0 [ 34.115619] ? asm_exc_general_protection+0x27/0x30 [ 34.117162] ? __submit_bio+0x9e/0x110 [ 34.118355] submit_bio_noacct_nocheck+0xf3/0x330 [ 34.119819] submit_bio_noacct+0x196/0x490 [ 34.121042] submit_bio+0x58/0x60 [ 34.122045] submit_bio_wait+0x70/0xd0 [ 34.123178] xfs_rw_bdev+0x188/0x1b0 [ 34.124255] xlog_do_io+0x95/0x170 [ 34.125283] xlog_bwrite+0x14/0x20 [ 34.126310] xlog_write_log_records+0x179/0x260 [ 34.127637] xlog_clear_stale_blocks+0xa5/0x1c0 [ 34.128917] xlog_find_tail+0x372/0x3b0 [ 34.130011] xlog_recover+0x2f/0x190 [ 34.131041] xfs_log_mount+0x1b8/0x350 [ 34.132055] xfs_mountfs+0x451/0x9a0 [ 34.133019] xfs_fs_fill_super+0x4d9/0x920 [ 34.134113] get_tree_bdev+0x16e/0x270 [ 34.135130] ? xfs_open_devices+0x230/0x230 [ 34.136184] xfs_fs_get_tree+0x15/0x20 [ 34.137144] vfs_get_tree+0x24/0xd0 [ 34.138041] path_mount+0x2fd/0xae0 [ 34.138955] ? putname+0x53/0x60 [ 34.139749] __x64_sys_mount+0x108/0x140 [ 34.140698] do_syscall_64+0x34/0x80 [ 34.141574] entry_SYSCALL_64_after_hwframe+0x63/0xcd Hmmmm - that's logical sector aligned/sized IO that is failing like this. The fs is using 64kB block size, 512 byte sector size. So I went looking. # blockdev --report /dev/ram0 RO RA SSZ BSZ StartSec Size Device rw 256 512 4096 0 8388608000 /dev/ram0 # Huh. sector size is fine, block size for the device isn't. # cat /sys/block/ram0/queue/physical_block_size 65536 # Yup, brd definitely picked up that it is supposed to be using 64kB blocks. # blockdev --setbsz 65536 /dev/ram0 blockdev: ioctl error on BLKBSZSET: Invalid argument # Huh. <dig dig dig> int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; ..... Yeah, ok. The block device doesn't support 64kB block sizes. Lucky that XFs uses this as it's sector size: # mkfs.xfs -f -b size=64k -s size=16k /dev/ram0 .... # mount /dev/ram0 /mnt/test [ 692.564375] XFS (ram0): Cannot set_blocksize to 16384 on device ram0 <mount fails> # Now expected. I wonder if the problem is 512 byte sector sizes.... # mkfs.xfs -f -s size=4k -b size=64k /dev/ram0 meta-data=/dev/ram0 isize=512 agcount=4, agsize=32000 blks = sectsz=4096 attr=2, projid32bit=1 = crc=1 finobt=1, sparse=1, rmapbt=0 = reflink=1 bigtime=1 inobtcount=1 nrext64=0 data = bsize=65536 blocks=128000, imaxpct=25 = sunit=0 swidth=0 blks naming =version 2 bsize=65536 ascii-ci=0, ftype=1 log =internal log bsize=65536 blocks=1024, version=2 = sectsz=4096 sunit=1 blks, lazy-count=1 realtime =none extsz=65536 blocks=0, rtextents=0 # mount /dev/ram0 /mnt/test [ 835.711473] XFS (ram0): Mounting V5 Filesystem 72743c95-1264-43cd-8867-1f2b2e30ba24 [ 835.722700] XFS (ram0): Ending clean mount # Okay, there we go. The patchset appears to have some kind of problem with the filesystem using the minimum logical sector size of 512 bytes on this modified driver. Setting sector size == PAGE_SIZE allows the filesystem to mount, but brd should not break if logical sector aligned/sized IO is done. $ sudo xfs_io -f -d -c "pwrite -b 1M 0 1M" -c "pread -v 0 1M" /mnt/test/foo wrote 1048576/1048576 bytes at offset 0 1 MiB, 1 ops; 0.0004 sec (2.266 GiB/sec and 2320.1856 ops/sec) ..... 000ffff0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ read 1048576/1048576 bytes at offset 0 1 MiB, 16 ops; 0.9233 sec (1.083 MiB/sec and 17.3284 ops/sec) $ Ok, direct IO works just fine. $ xfs_io -c "pwrite -S 0xaa -b 1M 0 1M" -c "pread 0 1M -v" /mnt/test/foo ..... 000ffff0: aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa aa ................ read 1048576/1048576 bytes at offset 0 1 MiB, 16 ops; 0.7035 sec (1.421 MiB/sec and 22.7428 ops/sec) $ Ok, well aligned buffered IO looks like it works. Right, let's step it up and do some more complex stuff. Let's run a basic fsx pass on the filesystem: $ sudo ~/src/xfstests-dev/ltp/fsx -d /mnt/test/baz Seed set to 1 main: filesystem does not support dedupe range, disabling! main: filesystem does not support exchange range, disabling! truncating to largest ever: 0x3aea7 2 trunc from 0x0 to 0x3aea7 3 copy from 0x1a3d6 to 0x26608, (0xc232 bytes) at 0x2ea8c Segmentation fault $ And there's the boom, only 3 operations into the test. This is kinda what I expected - getting fsx to run for billions of ops without failure might take a while. Huh, why did it say FIDEDUPERANGE was not supported - that's weird, something is broken there, maybe the fsx test. [ 1787.365339] ------------[ cut here ]------------ [ 1787.368623] kernel BUG at include/linux/pagemap.h:1248! [ 1787.371488] invalid opcode: 0000 [#1] PREEMPT SMP [ 1787.374814] CPU: 10 PID: 5153 Comm: fsx Not tainted 6.4.0-rc6-dgc+ #1832 [ 1787.377240] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-debian-1.16.2-1 04/01/2014 [ 1787.383061] RIP: 0010:read_pages+0x11d/0x230 [ 1787.385268] Code: 4c 89 e7 e8 c5 14 ff ff f0 41 ff 4c 24 34 0f 85 55 ff ff ff 4c 89 e7 e8 61 1d 00 00 8b 73 24 8b 43 20 39 f0 0f 83 4d ff ff ff <0f> 0b 0 [ 1787.395078] RSP: 0018:ffffc90004113918 EFLAGS: 00010283 [ 1787.396636] RAX: 0000000000000001 RBX: ffffc90004113ab0 RCX: 0000000000000000 [ 1787.400357] RDX: 0000000000001000 RSI: 0000000000000010 RDI: ffffea0017421000 [ 1787.403989] RBP: ffffc90004113960 R08: 0000000000001000 R09: 0000000000000000 [ 1787.407915] R10: ffff8885d084a000 R11: ffffc900041137d8 R12: ffffffff822b2e60 [ 1787.411472] R13: 000000000000001b R14: ffffea0017421000 R15: ffff8885c0cc8318 [ 1787.415342] FS: 00007f96c86fcc40(0000) GS:ffff8885fed00000(0000) knlGS:0000000000000000 [ 1787.418404] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1787.420717] CR2: 000055ecf4289908 CR3: 00000005feceb001 CR4: 0000000000060ee0 [ 1787.423110] Call Trace: [ 1787.424214] <TASK> [ 1787.425606] ? show_regs+0x61/0x70 [ 1787.426568] ? die+0x37/0x90 [ 1787.427584] ? do_trap+0xec/0x100 [ 1787.428959] ? do_error_trap+0x6c/0x90 [ 1787.430672] ? read_pages+0x11d/0x230 [ 1787.432259] ? exc_invalid_op+0x52/0x70 [ 1787.433312] ? read_pages+0x11d/0x230 [ 1787.434544] ? asm_exc_invalid_op+0x1b/0x20 [ 1787.436038] ? read_pages+0x11d/0x230 [ 1787.437442] ? read_pages+0x5c/0x230 [ 1787.438943] page_cache_ra_unbounded+0x128/0x1b0 [ 1787.440419] do_page_cache_ra+0x6c/0x70 [ 1787.441765] ondemand_readahead+0x31f/0x350 [ 1787.443426] page_cache_sync_ra+0x49/0x50 [ 1787.445070] filemap_get_pages+0x10e/0x680 [ 1787.446259] ? xfs_ilock+0xc1/0x220 [ 1787.447426] filemap_read+0xed/0x380 [ 1787.448632] ? kmem_cache_free+0x1f5/0x480 [ 1787.449926] ? xfs_log_ticket_put+0x2f/0x60 [ 1787.451152] ? xfs_inode_item_release+0x2e/0xa0 [ 1787.453128] generic_file_read_iter+0xdb/0x160 [ 1787.454527] xfs_file_buffered_read+0x54/0xd0 [ 1787.455894] xfs_file_read_iter+0x74/0xe0 [ 1787.457544] generic_file_splice_read+0x8c/0x150 [ 1787.460094] do_splice_to+0x85/0xb0 [ 1787.461285] splice_direct_to_actor+0xb3/0x210 [ 1787.462336] ? pipe_to_sendpage+0xa0/0xa0 [ 1787.463287] do_splice_direct+0x92/0xd0 [ 1787.464203] vfs_copy_file_range+0x2af/0x560 [ 1787.465229] __do_sys_copy_file_range+0xe3/0x1f0 [ 1787.466429] __x64_sys_copy_file_range+0x24/0x30 [ 1787.468053] do_syscall_64+0x34/0x80 [ 1787.469392] entry_SYSCALL_64_after_hwframe+0x63/0xcd static inline struct folio *__readahead_folio(struct readahead_control *ractl) { struct folio *folio; >>>>> BUG_ON(ractl->_batch_count > ractl->_nr_pages); ractl->_nr_pages -= ractl->_batch_count; ractl->_index += ractl->_batch_count; .... So something is going wrong in the readahead path from a splice operation from copy_file_range(). ..... Wait, what? Why is it splicing rather than doing a remap operation? 'cp --reflink=always bar bar2' appears to work fine, so it's unexpected that it's copying data rather than cloning extents. Something is going wrong there... ..... Ok, that's enough time spent on this right now. The BS > PS stuff in this patchset doesn't allow filesystems to work correctly, and the reasons for things going wrong are not obvious. I suspect that this is going to take quite some work just to muscle through all these whacky corner cases - fsx will find a lot of them; you'll need to work through them until it runs without fail for a couple of billion ops. The patch I was using is below. Cheers, Dave. -- Dave Chinner david@xxxxxxxxxxxxx xfs: support block size > page size From: Dave Chinner <dchinner@xxxxxxxxxx> Everything is supposed to work, so turn on the BOOM. Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> --- drivers/block/brd.c | 2 +- fs/xfs/xfs_mount.c | 4 +++- fs/xfs/xfs_super.c | 12 ------------ 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index a9f3c6591e75..e6e4f31bfcf5 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -314,7 +314,7 @@ static int max_part = 1; module_param(max_part, int, 0444); MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices"); -static unsigned int rd_blksize = PAGE_SIZE; +static unsigned int rd_blksize = 65536; module_param(rd_blksize, uint, 0444); MODULE_PARM_DESC(rd_blksize, "Blocksize of each RAM disk in bytes."); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fb87ffb48f7f..921acd02787c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -130,10 +130,12 @@ xfs_sb_validate_fsb_count( xfs_sb_t *sbp, uint64_t nblocks) { - ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); ASSERT(sbp->sb_blocklog >= BBSHIFT); /* Limited by ULONG_MAX of page cache index */ + if (sbp->sb_blocklog > PAGE_SHIFT && + (nblocks << (sbp->sb_blocklog - PAGE_SHIFT) > ULONG_MAX)) + return -EFBIG; if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) return -EFBIG; return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 4120bd1cba90..3c2fc203a5c0 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1583,18 +1583,6 @@ xfs_fs_fill_super( goto out_free_sb; } - /* - * Until this is fixed only page-sized or smaller data blocks work. - */ - if (mp->m_sb.sb_blocksize > PAGE_SIZE) { - xfs_warn(mp, - "File system with blocksize %d bytes. " - "Only pagesize (%ld) or less will currently work.", - mp->m_sb.sb_blocksize, PAGE_SIZE); - error = -ENOSYS; - goto out_free_sb; - } - /* Ensure this filesystem fits in the page cache limits */ if (xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_dblocks) || xfs_sb_validate_fsb_count(&mp->m_sb, mp->m_sb.sb_rblocks)) {