On 2022/06/07 15:09, Christoph Hellwig wrote: > On Fri, Jun 03, 2022 at 08:49:39PM +0900, Damien Le Moal wrote: >> If a read operation (e.g. a readahead) is issued to a sequential zone >> file with an offset exactly equal to the current file size, the iomap >> type will be set to IOMAP_UNWRITTEN, which will prevent an IO, but the >> iomap length is always calculated as 0. This causes a WARN_ON() in >> iomap_iter(): > > Is there a testsuite somewhere with a reproducer? Yes, test case 0325 of the test suite that comes with zonefs-tools: git@xxxxxxxxxx:westerndigitalcorporation/zonefs-tools.git The tests are under the "tests" directory and running: zonefs-tests-nullblk.sh -t 0325 triggers the problem 100% of the time. > >> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c >> index 123464d2145a..64f4ceb6f579 100644 >> --- a/fs/zonefs/super.c >> +++ b/fs/zonefs/super.c >> @@ -144,7 +144,7 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, >> iomap->type = IOMAP_MAPPED; >> if (flags & IOMAP_WRITE) >> length = zi->i_max_size - offset; >> - else >> + else if (offset < isize) >> length = min(length, isize - offset); > > So you still report an IOMAP_UNWRITTEN extent for the whole size of > the requst past EOF? Looking at XFS we do return the whole requested > length, but do return it as HOLE. Maybe we need to clarify the behavior > here and document it. Yes, I checked xfs and saw that. But in zonefs case, since the file blocks are always preallocated, blocks after the write pointer are indeed UNWRITTEN. I did check that iomap read processing treats UNWRITTEN and HOLE similarly, that is, ignore the value of length and stop issuing IOs when either type is seen. But I may have missed something. Note that initially I wrote the patch below to fix the problem. But it is very large and should probably be a cleanup for the next cycle. It separates the begin op for read and write, which makes things easier to understand. >From 1e1024daff9158f36fe01328c4be83db940d4309 Mon Sep 17 00:00:00 2001 From: Damien Le Moal <damien.lemoal@xxxxxxxxxxxxxxxxxx> Date: Mon, 23 May 2022 16:29:10 +0900 Subject: [PATCH] zonefs: fix iomap_begin operation Signed-off-by: Damien Le Moal <damien.lemoal@xxxxxxxxxxxxxxxxxx> --- fs/zonefs/super.c | 103 +++++++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 37 deletions(-) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 123464d2145a..29c609aede65 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -110,15 +110,48 @@ static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) } } -static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, - unsigned int flags, struct iomap *iomap, - struct iomap *srcmap) +static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) { struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; - loff_t isize; + loff_t isize = i_size_read(inode); + + /* + * All blocks are always mapped below EOF. If reading past EOF, + * act as if there is a hole up to the file maximum size. + */ + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); + if (iomap->offset >= isize) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + iomap->length = min(length, isize - iomap->offset); + } + + trace_zonefs_iomap_begin(inode, iomap); + + return 0; +} + +static const struct iomap_ops zonefs_read_iomap_ops = { + .iomap_begin = zonefs_read_iomap_begin, +}; + +static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct super_block *sb = inode->i_sb; + loff_t remaining, isize = i_size_read(inode); - /* All I/Os should always be within the file maximum size */ + /* All write I/Os should always be within the file maximum size */ if (WARN_ON_ONCE(offset + length > zi->i_max_size)) return -EIO; @@ -128,56 +161,51 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, * operation. */ if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ && - (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT))) + !(flags & IOMAP_DIRECT))) return -EIO; /* - * For conventional zones, all blocks are always mapped. For sequential - * zones, all blocks after always mapped below the inode size (zone - * write pointer) and unwriten beyond. + * For conventional zones, since the inode size is fixed, all blocks + * are always mapped. For sequential zones, all blocks after always + * mapped below the inode size (zone write pointer) and unwriten beyond. */ - mutex_lock(&zi->i_truncate_mutex); - isize = i_size_read(inode); - if (offset >= isize) - iomap->type = IOMAP_UNWRITTEN; - else - iomap->type = IOMAP_MAPPED; - if (flags & IOMAP_WRITE) - length = zi->i_max_size - offset; - else - length = min(length, isize - offset); - mutex_unlock(&zi->i_truncate_mutex); - - iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); - iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset; iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + if (iomap->offset >= isize) { + iomap->type = IOMAP_UNWRITTEN; + remaining = zi->i_max_size - iomap->offset; + } else { + iomap->type = IOMAP_MAPPED; + remaining = isize - iomap->offset; + } + iomap->length = min(length, remaining); trace_zonefs_iomap_begin(inode, iomap); return 0; } -static const struct iomap_ops zonefs_iomap_ops = { - .iomap_begin = zonefs_iomap_begin, +static const struct iomap_ops zonefs_write_iomap_ops = { + .iomap_begin = zonefs_write_iomap_begin, }; static int zonefs_read_folio(struct file *unused, struct folio *folio) { - return iomap_read_folio(folio, &zonefs_iomap_ops); + return iomap_read_folio(folio, &zonefs_read_iomap_ops); } static void zonefs_readahead(struct readahead_control *rac) { - iomap_readahead(rac, &zonefs_iomap_ops); + iomap_readahead(rac, &zonefs_read_iomap_ops); } /* * Map blocks for page writeback. This is used only on conventional zone files, * which implies that the page range can only be within the fixed inode size. */ -static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc, - struct inode *inode, loff_t offset) +static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) { struct zonefs_inode_info *zi = ZONEFS_I(inode); @@ -191,12 +219,12 @@ static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc, offset < wpc->iomap.offset + wpc->iomap.length) return 0; - return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset, - IOMAP_WRITE, &wpc->iomap, NULL); + return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset, + IOMAP_WRITE, &wpc->iomap, NULL); } static const struct iomap_writeback_ops zonefs_writeback_ops = { - .map_blocks = zonefs_map_blocks, + .map_blocks = zonefs_write_map_blocks, }; static int zonefs_writepage(struct page *page, struct writeback_control *wbc) @@ -226,7 +254,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis, return -EINVAL; } - return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops); + return iomap_swapfile_activate(sis, swap_file, span, + &zonefs_read_iomap_ops); } static const struct address_space_operations zonefs_file_aops = { @@ -647,7 +676,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); @@ -899,7 +928,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) if (append) ret = zonefs_file_dio_append(iocb, from); else - ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, + ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, &zonefs_write_dio_ops, 0, NULL, 0); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { @@ -948,7 +977,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, if (ret <= 0) goto inode_unlock; - ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops); + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); if (ret > 0) iocb->ki_pos += ret; else if (ret == -EIO) @@ -1041,7 +1070,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) goto inode_unlock; } file_accessed(iocb->ki_filp); - ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, + ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, &zonefs_read_dio_ops, 0, NULL, 0); } else { ret = generic_file_read_iter(iocb, to); -- 2.36.1 -- Damien Le Moal Western Digital Research