Eric has noticed that after pagecache read rework, generic/418 is occasionally failing for ext4 when blocksize < pagesize. In fact, the pagecache rework just made hard to hit race in ext4 more likely. The problem is that since ext4 conversion of direct IO writes to iomap framework (commit 378f32bab371), we update inode size after direct IO write only after invalidating page cache. Thus if buffered read sneaks at unfortunate moment like: CPU1 - write at offset 1k CPU2 - read from offset 0 iomap_dio_rw(..., IOMAP_DIO_FORCE_WAIT); ext4_readpage(); ext4_handle_inode_extension() the read will zero out tail of the page as it still sees smaller inode size and thus page cache becomes inconsistent with on-disk contents with all the consequences. Fix the problem by moving inode size update into end_io handler which gets called before the page cache is invalidated. Reported-by: Eric Whitney <enwlinux@xxxxxxxxx> Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure") Signed-off-by: Jan Kara <jack@xxxxxxx> --- fs/ext4/file.c | 185 ++++++++++++++++++++++++++----------------------- 1 file changed, 99 insertions(+), 86 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2505313d96b0..ec59ea078f53 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -280,11 +280,67 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, return ret; } +static int ext4_prepare_dio_extend(struct inode *inode) +{ + handle_t *handle; + int ret; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ext4_fc_start_update(inode); + ret = ext4_orphan_add(handle, inode); + ext4_fc_stop_update(inode); + if (ret) { + ext4_journal_stop(handle); + return ret; + } + ext4_journal_stop(handle); + return 0; +} + +static void ext4_cleanup_dio_extend(struct inode *inode, loff_t offset, + ssize_t written, size_t count) +{ + handle_t *handle; + u8 blkbits = inode->i_blkbits; + ext4_lblk_t written_blk, end_blk; + + written_blk = ALIGN(offset + written, 1 << blkbits); + end_blk = ALIGN(offset + count, 1 << blkbits); + if (written_blk < end_blk && ext4_can_truncate(inode)) { + ext4_truncate_failed_write(inode); + /* + * If the truncate operation failed early, then the inode may + * still be on the orphan list. In that case, we need to try + * remove the inode from the in-memory linked list. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + return; + } + + /* + * We need to ensure that the inode is removed from the orphan + * list if it has been added prematurely, due to writeback of + * delalloc blocks. + */ + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ext4_orphan_del(NULL, inode); + return; + } + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + } +} + static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, ssize_t written, size_t count) { handle_t *handle; - bool truncate = false; u8 blkbits = inode->i_blkbits; ext4_lblk_t written_blk, end_blk; int ret; @@ -298,73 +354,31 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, * as much as we intended. */ WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); - if (offset + count <= EXT4_I(inode)->i_disksize) { - /* - * We need to ensure that the inode is removed from the orphan - * list if it has been added prematurely, due to writeback of - * delalloc blocks. - */ - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - - if (IS_ERR(handle)) { - ext4_orphan_del(NULL, inode); - return PTR_ERR(handle); - } - - ext4_orphan_del(handle, inode); - ext4_journal_stop(handle); - } - + if (offset + count <= EXT4_I(inode)->i_disksize) return written; - } - - if (written < 0) - goto truncate; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - written = PTR_ERR(handle); - goto truncate; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); if (ext4_update_inode_size(inode, offset + written)) { ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { - written = ret; ext4_journal_stop(handle); - goto truncate; + return ret; } } /* - * We may need to truncate allocated but not written blocks beyond EOF. + * If there cannot be allocated but unused blocks beyond EOF, remove + * the inode from the orphan list as we are done with it. */ written_blk = ALIGN(offset + written, 1 << blkbits); end_blk = ALIGN(offset + count, 1 << blkbits); - if (written_blk < end_blk && ext4_can_truncate(inode)) - truncate = true; - - /* - * Remove the inode from the orphan list if it has been extended and - * everything went OK. - */ - if (!truncate && inode->i_nlink) + if (written_blk == end_blk && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); - if (truncate) { -truncate: - ext4_truncate_failed_write(inode); - /* - * If the truncate operation failed early, then the inode may - * still be on the orphan list. In that case, we need to try - * remove the inode from the in-memory linked list. - */ - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - } - return written; } @@ -385,10 +399,28 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, return 0; } +static int ext4_dio_extending_write_end_io(struct kiocb *iocb, ssize_t size, + ssize_t orig_size, int error, + unsigned int flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + int ret; + + ret = ext4_dio_write_end_io(iocb, size, orig_size, error, flags); + if (ret < 0) + return ret; + return ext4_handle_inode_extension(inode, iocb->ki_pos, size, + orig_size); +} + static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; +static const struct iomap_dio_ops ext4_dio_extending_write_ops = { + .end_io = ext4_dio_extending_write_end_io, +}; + /* * The intention here is to start with shared lock acquired then see if any * condition requires an exclusive inode lock. If yes, then we restart the @@ -455,11 +487,11 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; - handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; + const struct iomap_dio_ops *dio_ops = &ext4_dio_write_ops; bool extend = false, unaligned_io = false; bool ilock_shared = true; @@ -530,33 +562,21 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_dio_wait(inode); if (extend) { - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); + ret = ext4_prepare_dio_extend(inode); + if (ret) goto out; - } - - ext4_fc_start_update(inode); - ret = ext4_orphan_add(handle, inode); - ext4_fc_stop_update(inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - - ext4_journal_stop(handle); } if (ilock_shared) iomap_ops = &ext4_iomap_overwrite_ops; - ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0); - if (ret == -ENOTBLK) - ret = 0; - if (extend) - ret = ext4_handle_inode_extension(inode, offset, ret, count); + dio_ops = &ext4_dio_extending_write_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, dio_ops, + (extend || unaligned_io) ? IOMAP_DIO_FORCE_WAIT : 0); + if (ret == -ENOTBLK) + ret = 0; + ext4_cleanup_dio_extend(inode, offset, ret, count); out: if (ilock_shared) inode_unlock_shared(inode); @@ -599,7 +619,6 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; size_t count; loff_t offset; - handle_t *handle; bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); @@ -618,26 +637,20 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) count = iov_iter_count(from); if (offset + count > EXT4_I(inode)->i_disksize) { - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); + ret = ext4_prepare_dio_extend(inode); + if (ret < 0) goto out; - } - extend = true; - ext4_journal_stop(handle); } ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); - if (extend) - ret = ext4_handle_inode_extension(inode, offset, ret, count); + if (extend) { + if (ret > 0) + ret = ext4_handle_inode_extension(inode, offset, ret, + count); + ext4_cleanup_dio_extend(inode, offset, ret, count); + } out: inode_unlock(inode); if (ret > 0) -- 2.26.2