On Wed 14-04-21 11:59:35, Jan Kara wrote: > Eric has noticed that after pagecache read rework, generic/418 is > occasionally failing for ext4 when blocksize < pagesize. In fact, the > pagecache rework just made hard to hit race in ext4 more likely. The > problem is that since ext4 conversion of direct IO writes to iomap > framework (commit 378f32bab371), we update inode size after direct IO > write only after invalidating page cache. Thus if buffered read sneaks > at unfortunate moment like: > > CPU1 - write at offset 1k CPU2 - read from offset 0 > iomap_dio_rw(..., IOMAP_DIO_FORCE_WAIT); > ext4_readpage(); > ext4_handle_inode_extension() > > the read will zero out tail of the page as it still sees smaller inode > size and thus page cache becomes inconsistent with on-disk contents with > all the consequences. > > Fix the problem by moving inode size update into end_io handler which > gets called before the page cache is invalidated. > > Reported-by: Eric Whitney <enwlinux@xxxxxxxxx> > Fixes: 378f32bab371 ("ext4: introduce direct I/O write using iomap infrastructure") > CC: stable@xxxxxxxxxxxxxxx > Signed-off-by: Jan Kara <jack@xxxxxxx> Dave just suggested a better alternative than this in a different thread. I'll submit it once testing completes... Honza > --- > fs/ext4/file.c | 194 +++++++++++++++++++++++++------------------------ > 1 file changed, 98 insertions(+), 96 deletions(-) > > Eric, can you please try whether this patch fixes the failures you are > occasionally seeing? > > diff --git a/fs/ext4/file.c b/fs/ext4/file.c > index 194f5d00fa32..2b84fb48bde6 100644 > --- a/fs/ext4/file.c > +++ b/fs/ext4/file.c > @@ -280,13 +280,66 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, > return ret; > } > > -static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, > - ssize_t written, size_t count) > +static int ext4_prepare_dio_extend(struct inode *inode) > +{ > + handle_t *handle; > + int ret; > + > + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); > + if (IS_ERR(handle)) > + return PTR_ERR(handle); > + > + ext4_fc_start_update(inode); > + ret = ext4_orphan_add(handle, inode); > + ext4_fc_stop_update(inode); > + if (ret) { > + ext4_journal_stop(handle); > + return ret; > + } > + ext4_journal_stop(handle); > + return 0; > +} > + > +static void ext4_cleanup_dio_extend(struct inode *inode, loff_t offset, > + ssize_t written, size_t count) > { > handle_t *handle; > - bool truncate = false; > u8 blkbits = inode->i_blkbits; > ext4_lblk_t written_blk, end_blk; > + > + /* Error? Nothing was written... */ > + if (written < 0) > + written = 0; > + > + written_blk = ALIGN(offset + written, 1 << blkbits); > + end_blk = ALIGN(offset + count, 1 << blkbits); > + if (written_blk < end_blk && ext4_can_truncate(inode)) { > + ext4_truncate_failed_write(inode); > + /* > + * If the truncate operation failed early, then the inode may > + * still be on the orphan list. In that case, we need to try > + * remove the inode from the in-memory linked list. > + */ > + if (inode->i_nlink) > + ext4_orphan_del(NULL, inode); > + return; > + } > + > + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { > + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); > + if (IS_ERR(handle)) { > + ext4_orphan_del(NULL, inode); > + return; > + } > + ext4_orphan_del(handle, inode); > + ext4_journal_stop(handle); > + } > +} > + > +static int ext4_handle_inode_extension(struct inode *inode, loff_t offset, > + ssize_t written) > +{ > + handle_t *handle; > int ret; > > /* > @@ -298,74 +351,23 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, > * as much as we intended. > */ > WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); > - if (offset + count <= EXT4_I(inode)->i_disksize) { > - /* > - * We need to ensure that the inode is removed from the orphan > - * list if it has been added prematurely, due to writeback of > - * delalloc blocks. > - */ > - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { > - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); > - > - if (IS_ERR(handle)) { > - ext4_orphan_del(NULL, inode); > - return PTR_ERR(handle); > - } > - > - ext4_orphan_del(handle, inode); > - ext4_journal_stop(handle); > - } > - > - return written; > - } > - > - if (written < 0) > - goto truncate; > + if (offset + written <= EXT4_I(inode)->i_disksize) > + return 0; > > handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); > - if (IS_ERR(handle)) { > - written = PTR_ERR(handle); > - goto truncate; > - } > + if (IS_ERR(handle)) > + return PTR_ERR(handle); > > if (ext4_update_inode_size(inode, offset + written)) { > ret = ext4_mark_inode_dirty(handle, inode); > if (unlikely(ret)) { > - written = ret; > ext4_journal_stop(handle); > - goto truncate; > + return ret; > } > } > - > - /* > - * We may need to truncate allocated but not written blocks beyond EOF. > - */ > - written_blk = ALIGN(offset + written, 1 << blkbits); > - end_blk = ALIGN(offset + count, 1 << blkbits); > - if (written_blk < end_blk && ext4_can_truncate(inode)) > - truncate = true; > - > - /* > - * Remove the inode from the orphan list if it has been extended and > - * everything went OK. > - */ > - if (!truncate && inode->i_nlink) > - ext4_orphan_del(handle, inode); > ext4_journal_stop(handle); > > - if (truncate) { > -truncate: > - ext4_truncate_failed_write(inode); > - /* > - * If the truncate operation failed early, then the inode may > - * still be on the orphan list. In that case, we need to try > - * remove the inode from the in-memory linked list. > - */ > - if (inode->i_nlink) > - ext4_orphan_del(NULL, inode); > - } > - > - return written; > + return 0; > } > > static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, > @@ -380,14 +382,29 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, > if (size && flags & IOMAP_DIO_UNWRITTEN) > return ext4_convert_unwritten_extents(NULL, inode, > offset, size); > - > return 0; > } > > +static int ext4_dio_extending_write_end_io(struct kiocb *iocb, ssize_t size, > + int error, unsigned int flags) > +{ > + struct inode *inode = file_inode(iocb->ki_filp); > + int ret; > + > + ret = ext4_dio_write_end_io(iocb, size, error, flags); > + if (ret < 0) > + return ret; > + return ext4_handle_inode_extension(inode, iocb->ki_pos, size); > +} > + > static const struct iomap_dio_ops ext4_dio_write_ops = { > .end_io = ext4_dio_write_end_io, > }; > > +static const struct iomap_dio_ops ext4_dio_extending_write_ops = { > + .end_io = ext4_dio_extending_write_end_io, > +}; > + > /* > * The intention here is to start with shared lock acquired then see if any > * condition requires an exclusive inode lock. If yes, then we restart the > @@ -454,11 +471,11 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, > static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) > { > ssize_t ret; > - handle_t *handle; > struct inode *inode = file_inode(iocb->ki_filp); > loff_t offset = iocb->ki_pos; > size_t count = iov_iter_count(from); > const struct iomap_ops *iomap_ops = &ext4_iomap_ops; > + const struct iomap_dio_ops *dio_ops = &ext4_dio_write_ops; > bool extend = false, unaligned_io = false; > bool ilock_shared = true; > > @@ -529,33 +546,21 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) > inode_dio_wait(inode); > > if (extend) { > - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); > - if (IS_ERR(handle)) { > - ret = PTR_ERR(handle); > - goto out; > - } > - > - ext4_fc_start_update(inode); > - ret = ext4_orphan_add(handle, inode); > - ext4_fc_stop_update(inode); > - if (ret) { > - ext4_journal_stop(handle); > + ret = ext4_prepare_dio_extend(inode); > + if (ret) > goto out; > - } > - > - ext4_journal_stop(handle); > + dio_ops = &ext4_dio_extending_write_ops; > } > > if (ilock_shared) > iomap_ops = &ext4_iomap_overwrite_ops; > - ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, > - (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0); > + > + ret = iomap_dio_rw(iocb, from, iomap_ops, dio_ops, > + (extend || unaligned_io) ? IOMAP_DIO_FORCE_WAIT : 0); > if (ret == -ENOTBLK) > ret = 0; > - > if (extend) > - ret = ext4_handle_inode_extension(inode, offset, ret, count); > - > + ext4_cleanup_dio_extend(inode, offset, ret, count); > out: > if (ilock_shared) > inode_unlock_shared(inode); > @@ -598,7 +603,6 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) > ssize_t ret; > size_t count; > loff_t offset; > - handle_t *handle; > bool extend = false; > struct inode *inode = file_inode(iocb->ki_filp); > > @@ -617,26 +621,24 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) > count = iov_iter_count(from); > > if (offset + count > EXT4_I(inode)->i_disksize) { > - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); > - if (IS_ERR(handle)) { > - ret = PTR_ERR(handle); > - goto out; > - } > - > - ret = ext4_orphan_add(handle, inode); > - if (ret) { > - ext4_journal_stop(handle); > + ret = ext4_prepare_dio_extend(inode); > + if (ret < 0) > goto out; > - } > - > extend = true; > - ext4_journal_stop(handle); > } > > ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); > > - if (extend) > - ret = ext4_handle_inode_extension(inode, offset, ret, count); > + if (extend) { > + if (ret > 0) { > + int ret2; > + > + ret2 = ext4_handle_inode_extension(inode, offset, ret); > + if (ret2 < 0) > + ret = ret2; > + } > + ext4_cleanup_dio_extend(inode, offset, ret, count); > + } > out: > inode_unlock(inode); > if (ret > 0) > -- > 2.26.2 > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR