---
fs/ext4/file.c | 219 +++++++++++++++++++++++++++++++++---------------
fs/ext4/inode.c | 57 ++++++++++---
2 files changed, 198 insertions(+), 78 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 8e586198f6e6..bf22425a6a6f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,6 +29,7 @@
#include <linux/pagevec.h>
#include <linux/uio.h>
#include <linux/mman.h>
+#include <linux/backing-dev.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
@@ -217,6 +218,14 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
if (ret <= 0)
return ret;
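+ /*
+ * Strip setuid/setgid bits and file capabilities, and update the
+ * file's timestamps, before any data is written, just as the
+ * generic write path would.
+ */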
+ ret = file_remove_privs(iocb->ki_filp);
+ if (ret)
+ return ret;
+
+ ret = file_update_time(iocb->ki_filp);
+ if (ret)
+ return ret;
+
if (unlikely(IS_IMMUTABLE(inode)))
return -EPERM;
@@ -234,6 +243,34 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
return iov_iter_count(from);
}
+static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ ssize_t ret;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EOPNOTSUPP;
+
+ inode_lock(inode);
+
+ ret = ext4_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+
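+ /* We can write back this queue in page reclaim */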
+ current->backing_dev_info = inode_to_bdi(inode);
+ ret = generic_perform_write(iocb->ki_filp, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+out:
+ inode_unlock(inode);
+ if (likely(ret > 0)) {
+ iocb->ki_pos += ret;
+ ret = generic_write_sync(iocb, ret);
+ }
+ return ret;
+}
+
static int ext4_handle_inode_extension(struct inode *inode, loff_t offset,
ssize_t len, size_t count)
{
@@ -311,6 +348,118 @@ static int ext4_handle_failed_inode_extension(struct inode *inode, loff_t size)
return ret;
}
+/*
+ * For a write that extends the inode size, ext4_dio_write_iter() will
+ * wait for the write to complete. Consequently, operations performed
+ * within this function are still covered by the inode_lock().
+ */
+static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error,
+ unsigned int flags)
+{
+ int ret = 0;
+ loff_t offset = iocb->ki_pos;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ if (error) {
+ ret = ext4_handle_failed_inode_extension(inode, offset + size);
+ return ret ? ret : error;
+ }
+
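+ /*
+ * The write went through unwritten extents; convert them to
+ * written so the data becomes visible to readers.
+ */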
+ if (flags & IOMAP_DIO_UNWRITTEN) {
+ ret = ext4_convert_unwritten_extents(NULL, inode,
+ offset, size);
+ if (ret)
+ return ret;
+ }
+
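+ /*
+ * The write ran past the previous i_size; complete the size
+ * update while the issuer still holds the inode_lock().
+ */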
+ if (offset + size > i_size_read(inode)) {
+ ret = ext4_handle_inode_extension(inode, offset, size, 0);
+ if (ret)
+ return ret;
+ }
+ return ret;
+}
+
+static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ ssize_t ret;
+ loff_t offset = iocb->ki_pos;
+ size_t count = iov_iter_count(from);
+ struct inode *inode = file_inode(iocb->ki_filp);
+ bool extend = false, overwrite = false, unaligned_aio = false;
+
+ if (!inode_trylock(inode)) {
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+ inode_lock(inode);
+ }
+
+ if (!ext4_dio_checks(inode)) {
+ inode_unlock(inode);
+ /*
+ * Fall back to buffered IO if the operation being
+ * performed on the inode is not supported by direct IO.
+ */
+ return ext4_buffered_write_iter(iocb, from);
+ }
+
+ ret = ext4_write_checks(iocb, from);
+ if (ret <= 0) {
+ inode_unlock(inode);
+ return ret;
+ }
+
+ /*
+ * Unaligned direct AIO must be serialized against other
+ * unaligned AIO as the zeroing of partial blocks by two
+ * competing unaligned AIOs can result in data corruption.
+ */
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ !is_sync_kiocb(iocb) && ext4_unaligned_aio(inode, from, offset)) {
+ unaligned_aio = true;
+ inode_dio_wait(inode);
+ }
+
+ /*
+ * Determine whether the IO operation will overwrite allocated
+ * and initialized blocks. If so, check to see whether it is
+ * possible to take the dioread_nolock path.
+ */
+ if (!unaligned_aio && ext4_overwrite_io(inode, offset, count) &&
+ ext4_should_dioread_nolock(inode)) {
+ overwrite = true;
+ downgrade_write(&inode->i_rwsem);
+ }
+
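+ /*
+ * The write may extend the file. Bring i_disksize up to
+ * i_size before issuing the IO and note the extension so
+ * that we wait for the DIO to complete under the
+ * inode_lock().
+ */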
+ if (offset + count > i_size_read(inode) ||
+ offset + count > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, inode->i_size);
+ extend = true;
+ }
+
+ ret = iomap_dio_rw(iocb, from, &ext4_iomap_ops, ext4_dio_write_end_io);
+
+ /*
+ * Unaligned direct AIO must be the only IO in flight, as an
+ * overlapping aligned IO issued afterwards might otherwise
+ * result in data corruption. We also need to wait in the
+ * case where the inode is being extended so that the inode
+ * extension routines in ext4_dio_write_end_io() are still
+ * covered by the inode_lock().
+ */
+ if (ret == -EIOCBQUEUED && (unaligned_aio || extend))
+ inode_dio_wait(inode);
+
+ if (overwrite)
+ inode_unlock_shared(inode);
+ else
+ inode_unlock(inode);
+
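+ /*
+ * Any bytes left in the iter after a short direct write
+ * (for example, when the mapping fell back midway with
+ * -ENOTBLK) are completed through the buffered path.
+ */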
+ if (ret >= 0 && iov_iter_count(from))
+ return ext4_buffered_write_iter(iocb, from);
+ return ret;
+}
+
#ifdef CONFIG_FS_DAX
static ssize_t
ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -325,15 +474,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
return -EAGAIN;
inode_lock(inode);
}
+
ret = ext4_write_checks(iocb, from);
if (ret <= 0)
goto out;
- ret = file_remove_privs(iocb->ki_filp);
- if (ret)
- goto out;
- ret = file_update_time(iocb->ki_filp);
- if (ret)
- goto out;
offset = iocb->ki_pos;
ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops);
@@ -359,73 +503,16 @@ static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
- int o_direct = iocb->ki_flags & IOCB_DIRECT;
- int unaligned_aio = 0;
- int overwrite = 0;
- ssize_t ret;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
#ifdef CONFIG_FS_DAX
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
- if (!o_direct && (iocb->ki_flags & IOCB_NOWAIT))
- return -EOPNOTSUPP;
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
- inode_lock(inode);
- }
-
- ret = ext4_write_checks(iocb, from);
- if (ret <= 0)
- goto out;
-
- /*
- * Unaligned direct AIO must be serialized among each other as zeroing
- * of partial blocks of two competing unaligned AIOs can result in data
- * corruption.
- */
- if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
- !is_sync_kiocb(iocb) &&
- ext4_unaligned_aio(inode, from, iocb->ki_pos)) {
- unaligned_aio = 1;
- ext4_unwritten_wait(inode);
- }
-
- iocb->private = &overwrite;
- /* Check whether we do a DIO overwrite or not */
- if (o_direct && !unaligned_aio) {
- if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) {
- if (ext4_should_dioread_nolock(inode))
- overwrite = 1;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
- }
- }
-
- ret = __generic_file_write_iter(iocb, from);
- /*
- * Unaligned direct AIO must be the only IO in flight. Otherwise
- * overlapping aligned IO after unaligned might result in data
- * corruption.
- */
- if (ret == -EIOCBQUEUED && unaligned_aio)
- ext4_unwritten_wait(inode);
- inode_unlock(inode);
-
- if (ret > 0)
- ret = generic_write_sync(iocb, ret);
-
- return ret;
-
-out:
- inode_unlock(inode);
- return ret;
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return ext4_dio_write_iter(iocb, from);
+ return ext4_buffered_write_iter(iocb, from);
}
#ifdef CONFIG_FS_DAX
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index efb184928e51..f52ad3065236 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3513,11 +3513,13 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
}
}
} else if (flags & IOMAP_WRITE) {
- int dio_credits;
handle_t *handle;
- int retries = 0;
+ int dio_credits, retries = 0, m_flags = 0;
- /* Trim mapping request to maximum we can map at once for DIO */
+ /*
+ * Trim mapping request to maximum we can map at once
+ * for DIO.
+ */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
@@ -3533,8 +3535,30 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
if (IS_ERR(handle))
return PTR_ERR(handle);
- ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_CREATE_ZERO);
+ /*
+ * DAX and direct IO are the only two operations that
+ * are currently supported with IOMAP_WRITE.
+ */
+ WARN_ON(!IS_DAX(inode) && !(flags & IOMAP_DIRECT));
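+ /*
+ * DAX zeroes new blocks at allocation time, so written
+ * extents are safe. A write starting entirely beyond
+ * i_size may also allocate written blocks, as blocks past
+ * EOF are never exposed and get truncated after a crash.
+ * Everything else on extent-mapped inodes must go through
+ * unwritten extents, converted once the IO completes.
+ */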
+ if (IS_DAX(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+ else if (round_down(offset, i_blocksize(inode)) >=
+ i_size_read(inode))
+ m_flags = EXT4_GET_BLOCKS_CREATE;
+ else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
+
+ ret = ext4_map_blocks(handle, inode, &map, m_flags);
+
+ /*
+ * We cannot fill holes in indirect tree-based inodes as
+ * that could expose stale data in the case of a crash.
+ * Use the magic error code to fall back to buffered IO.
+ */
+ if (!m_flags && !ret)
+ ret = -ENOTBLK;
+
if (ret < 0) {
ext4_journal_stop(handle);
if (ret == -ENOSPC &&
@@ -3544,13 +3568,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
}
/*
- * If we added blocks beyond i_size, we need to make sure they
- * will get truncated if we crash before updating i_size in
- * ext4_iomap_end(). For faults we don't need to do that (and
- * even cannot because for orphan list operations inode_lock is
- * required) - if we happen to instantiate block beyond i_size,
- * it is because we race with truncate which has already added
- * the inode to the orphan list.
+ * If we added blocks beyond i_size, we need to make
+ * sure they will get truncated if we crash before
+ * updating the i_size. For faults we don't need to do
+ * that (and even cannot because for orphan list
+ * operations inode_lock is required) - if we happen
+ * to instantiate block beyond i_size, it is because
+ * we race with truncate which has already added the
+ * inode to the orphan list.
*/
if (!(flags & IOMAP_FAULT) && first_block + map.m_len >
(i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) {
@@ -3612,6 +3637,14 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
+ /*
+ * Check to see whether an error occurred while writing data
+ * out to allocated blocks. If so, return the magic error code
+ * so that we fall back to buffered IO and reuse the blocks
+ * that were allocated in preparation for the direct IO write.
+ */
+ if (flags & IOMAP_DIRECT && written == 0)
+ return -ENOTBLK;
return 0;
}