Re: [PATCH 3/3] zonefs: fix zonefs_iomap_begin() for reads

Damien Le Moal <damien.lemoal@xxxxxxxxxxxxxxxxxx> · Tue, 7 Jun 2022 15:53:35 +0900

On 2022/06/07 15:09, Christoph Hellwig wrote:
> On Fri, Jun 03, 2022 at 08:49:39PM +0900, Damien Le Moal wrote:
>> If a read operation (e.g. a readahead) is issued to a sequential zone
>> file with an offset exactly equal to the current file size, the iomap
>> type will be set to IOMAP_UNWRITTEN, which will prevent an IO, but the
>> iomap length is always calculated as 0. This causes a WARN_ON() in
>> iomap_iter():
> 
> Is there a testsuite somewhere with a reproducer?

Yes, test case 0325 of the test suite that comes with zonefs-tools:

git@xxxxxxxxxx:westerndigitalcorporation/zonefs-tools.git

The tests are under the "tests" directory and running:

zonefs-tests-nullblk.sh -t 0325

triggers the problem 100% of the time.

> 
>> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
>> index 123464d2145a..64f4ceb6f579 100644
>> --- a/fs/zonefs/super.c
>> +++ b/fs/zonefs/super.c
>> @@ -144,7 +144,7 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>>  		iomap->type = IOMAP_MAPPED;
>>  	if (flags & IOMAP_WRITE)
>>  		length = zi->i_max_size - offset;
>> -	else
>> +	else if (offset < isize)
>>  		length = min(length, isize - offset);
> 
> So you still report an IOMAP_UNWRITTEN extent for the whole size of
> the requst past EOF?  Looking at XFS we do return the whole requested
> length, but do return it as HOLE.  Maybe we need to clarify the behavior
> here and document it.

Yes, I checked xfs and saw that. But in zonefs case, since the file blocks are
always preallocated, blocks after the write pointer are indeed UNWRITTEN. I did
check that iomap read processing treats UNWRITTEN and HOLE similarly, that is,
ignore the value of length and stop issuing IOs when either type is seen. But I
may have missed something.

Note that initially I wrote the patch below to fix the problem. But it is very
large and should probably be a cleanup for the next cycle. It separates the
begin op for read and write, which makes things easier to understand.



>From 1e1024daff9158f36fe01328c4be83db940d4309 Mon Sep 17 00:00:00 2001
From: Damien Le Moal <damien.lemoal@xxxxxxxxxxxxxxxxxx>
Date: Mon, 23 May 2022 16:29:10 +0900
Subject: [PATCH] zonefs: fix iomap_begin operation


Signed-off-by: Damien Le Moal <damien.lemoal@xxxxxxxxxxxxxxxxxx>
---
 fs/zonefs/super.c | 103 +++++++++++++++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 37 deletions(-)

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 123464d2145a..29c609aede65 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -110,15 +110,48 @@ static inline void zonefs_i_size_write(struct inode
*inode, loff_t isize)
 	}
 }

-static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-			      unsigned int flags, struct iomap *iomap,
-			      struct iomap *srcmap)
+static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
+				   loff_t length, unsigned int flags,
+				   struct iomap *iomap, struct iomap *srcmap)
 {
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);
 	struct super_block *sb = inode->i_sb;
-	loff_t isize;
+	loff_t isize = i_size_read(inode);
+
+	/*
+	 * All blocks are always mapped below EOF. If reading past EOF,
+	 * act as if there is a hole up to the file maximum size.
+	 */
+	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+	if (iomap->offset >= isize) {
+		iomap->type = IOMAP_HOLE;
+		iomap->addr = IOMAP_NULL_ADDR;
+		iomap->length = length;
+	} else {
+		iomap->type = IOMAP_MAPPED;
+		iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+		iomap->length = min(length, isize - iomap->offset);
+	}
+
+	trace_zonefs_iomap_begin(inode, iomap);
+
+	return 0;
+}
+
+static const struct iomap_ops zonefs_read_iomap_ops = {
+	.iomap_begin	= zonefs_read_iomap_begin,
+};
+
+static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+				    loff_t length, unsigned int flags,
+				    struct iomap *iomap, struct iomap *srcmap)
+{
+	struct zonefs_inode_info *zi = ZONEFS_I(inode);
+	struct super_block *sb = inode->i_sb;
+	loff_t remaining, isize = i_size_read(inode);

-	/* All I/Os should always be within the file maximum size */
+	/* All write I/Os should always be within the file maximum size */
 	if (WARN_ON_ONCE(offset + length > zi->i_max_size))
 		return -EIO;

@@ -128,56 +161,51 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t
offset, loff_t length,
 	 * operation.
 	 */
 	if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
-			 (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)))
+			 !(flags & IOMAP_DIRECT)))
 		return -EIO;

 	/*
-	 * For conventional zones, all blocks are always mapped. For sequential
-	 * zones, all blocks after always mapped below the inode size (zone
-	 * write pointer) and unwriten beyond.
+	 * For conventional zones, since the inode size is fixed, all blocks
+	 * are always mapped. For sequential zones, all blocks after always
+	 * mapped below the inode size (zone write pointer) and unwriten beyond.
 	 */
-	mutex_lock(&zi->i_truncate_mutex);
-	isize = i_size_read(inode);
-	if (offset >= isize)
-		iomap->type = IOMAP_UNWRITTEN;
-	else
-		iomap->type = IOMAP_MAPPED;
-	if (flags & IOMAP_WRITE)
-		length = zi->i_max_size - offset;
-	else
-		length = min(length, isize - offset);
-	mutex_unlock(&zi->i_truncate_mutex);
-
-	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
-	iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset;
 	iomap->bdev = inode->i_sb->s_bdev;
+	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
 	iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+	if (iomap->offset >= isize) {
+		iomap->type = IOMAP_UNWRITTEN;
+		remaining = zi->i_max_size - iomap->offset;
+	} else {
+		iomap->type = IOMAP_MAPPED;
+		remaining = isize - iomap->offset;
+	}
+	iomap->length = min(length, remaining);

 	trace_zonefs_iomap_begin(inode, iomap);

 	return 0;
 }

-static const struct iomap_ops zonefs_iomap_ops = {
-	.iomap_begin	= zonefs_iomap_begin,
+static const struct iomap_ops zonefs_write_iomap_ops = {
+	.iomap_begin	= zonefs_write_iomap_begin,
 };

 static int zonefs_read_folio(struct file *unused, struct folio *folio)
 {
-	return iomap_read_folio(folio, &zonefs_iomap_ops);
+	return iomap_read_folio(folio, &zonefs_read_iomap_ops);
 }

 static void zonefs_readahead(struct readahead_control *rac)
 {
-	iomap_readahead(rac, &zonefs_iomap_ops);
+	iomap_readahead(rac, &zonefs_read_iomap_ops);
 }

 /*
  * Map blocks for page writeback. This is used only on conventional zone files,
  * which implies that the page range can only be within the fixed inode size.
  */
-static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
-			     struct inode *inode, loff_t offset)
+static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+				   struct inode *inode, loff_t offset)
 {
 	struct zonefs_inode_info *zi = ZONEFS_I(inode);

@@ -191,12 +219,12 @@ static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
 	    offset < wpc->iomap.offset + wpc->iomap.length)
 		return 0;

-	return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset,
-				  IOMAP_WRITE, &wpc->iomap, NULL);
+	return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
+					IOMAP_WRITE, &wpc->iomap, NULL);
 }

 static const struct iomap_writeback_ops zonefs_writeback_ops = {
-	.map_blocks		= zonefs_map_blocks,
+	.map_blocks		= zonefs_write_map_blocks,
 };

 static int zonefs_writepage(struct page *page, struct writeback_control *wbc)
@@ -226,7 +254,8 @@ static int zonefs_swap_activate(struct swap_info_struct *sis,
 		return -EINVAL;
 	}

-	return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
+	return iomap_swapfile_activate(sis, swap_file, span,
+				       &zonefs_read_iomap_ops);
 }

 static const struct address_space_operations zonefs_file_aops = {
@@ -647,7 +676,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct
vm_fault *vmf)

 	/* Serialize against truncates */
 	filemap_invalidate_lock_shared(inode->i_mapping);
-	ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
+	ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
 	filemap_invalidate_unlock_shared(inode->i_mapping);

 	sb_end_pagefault(inode->i_sb);
@@ -899,7 +928,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb,
struct iov_iter *from)
 	if (append)
 		ret = zonefs_file_dio_append(iocb, from);
 	else
-		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
+		ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
 				   &zonefs_write_dio_ops, 0, NULL, 0);
 	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
 	    (ret > 0 || ret == -EIOCBQUEUED)) {
@@ -948,7 +977,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
 	if (ret <= 0)
 		goto inode_unlock;

-	ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
+	ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
 	if (ret > 0)
 		iocb->ki_pos += ret;
 	else if (ret == -EIO)
@@ -1041,7 +1070,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb,
struct iov_iter *to)
 			goto inode_unlock;
 		}
 		file_accessed(iocb->ki_filp);
-		ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
+		ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
 				   &zonefs_read_dio_ops, 0, NULL, 0);
 	} else {
 		ret = generic_file_read_iter(iocb, to);
-- 
2.36.1



-- 
Damien Le Moal
Western Digital Research