From: Baokun Li <libaokun1@xxxxxxxxxx> The data_err=abort was initially introduced to address users' worries about data corruption spreading unnoticed. With direct writes, we can rely on return values to confirm successful writes to disk. But with buffered writes, a successful return only means the data has been written to memory. Users have no way of knowing if the data has actually written it to disk unless they use fsync (which impacts performance and can sometimes miss errors). The current data_err=abort implementation relies on the ordered data list, but past changes have inadvertently altered its behavior. For example, if an extent is unwritten, we do not add the inode to the ordered data list. Therefore, jbd2 will not wait for the data write-back of that inode to complete and check for errors in the inode mapping. Moreover, the checks performed by jbd2 can also miss errors. Now, all buffered writes eventually call ext4_end_bio(), where I/O errors are checked. Therefore, we can check for the data_err=abort mode at this point and abort the journal in a kworker (due to the interrupt context). Therefore, when data_err=abort is enabled, the journal is aborted in ext4_end_io_end() when an I/O error is detected in ext4_end_bio() to make users who are concerned about the contents of the file happy. Suggested-by: Jan Kara <jack@xxxxxxx> Link: https://patch.msgid.link/c7ab26f3-85ad-4b31-b132-0afb0e07bf79@xxxxxxxxxx Signed-off-by: Baokun Li <libaokun1@xxxxxxxxxx> Reviewed-by: Zhang Yi <yi.zhang@xxxxxxxxxx> --- fs/ext4/ext4.h | 2 ++ fs/ext4/page-io.c | 48 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9da0e32af02a..e0b536df8872 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -281,6 +281,8 @@ struct ext4_system_blocks { #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_FAILED 0x0002 +#define EXT4_IO_END_NEED_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) + struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ loff_t offset; /* offset in the file */ diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 6054ec27fb48..5dbd3ff188a3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) } /* - * Check a range of space and convert unwritten extents to written. Note that + * On successful IO, check a range of space and convert unwritten extents to + * written. On IO failure, check if journal abort is needed. Note that * we are protected from truncate touching same part of extent tree by the * fact that truncate code waits for all DIO to finish (thus exclusion from * direct IO is achieved) and also waits for PageWriteback bits. Thus we @@ -175,6 +176,7 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; handle_t *handle = io_end->handle; + struct super_block *sb = inode->i_sb; int ret = 0; ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," @@ -190,11 +192,15 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) ret = -EIO; if (handle) jbd2_journal_free_reserved(handle); + + if (test_opt(sb, DATA_ERR_ABORT)) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret); } else { ret = ext4_convert_unwritten_io_end_vec(handle, io_end); } - if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { - ext4_msg(inode->i_sb, KERN_EMERG, + if (ret < 0 && !ext4_forced_shutdown(sb) && + io_end->flag & EXT4_IO_END_UNWRITTEN) { + ext4_msg(sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); @@ -228,6 +234,16 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) #endif } +static bool ext4_io_end_need_completion(ext4_io_end_t *io_end) +{ + if (io_end->flag & EXT4_IO_END_UNWRITTEN) + return true; + if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && + io_end->flag & EXT4_IO_END_FAILED) + return true; + return false; +} + /* Add the io_end to per-inode completed end_io list. */ static void ext4_add_complete_io(ext4_io_end_t *io_end) { @@ -236,9 +252,11 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - /* Only reserved conversions from writeback should enter here */ - WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - WARN_ON(!io_end->handle && sbi->s_journal); + /* Only reserved conversions or pending IO errors will enter here. */ + WARN_ON(!(io_end->flag & EXT4_IO_END_NEED_COMPLETION)); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && + !io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; if (list_empty(&ei->i_rsv_conversion_list)) @@ -263,7 +281,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, while (!list_empty(&unwritten)) { io_end = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + BUG_ON(!(io_end->flag & EXT4_IO_END_NEED_COMPLETION)); list_del_init(&io_end->list); err = ext4_end_io_end(io_end); @@ -274,7 +292,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode, } /* - * work on completed IO, to convert unwritten extents to extents + * Used to convert unwritten extents to written extents upon IO completion, + * or used to abort the journal upon IO errors. */ void ext4_end_io_rsv_work(struct work_struct *work) { @@ -299,19 +318,20 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || - list_empty(&io_end->list_vec)) { - ext4_release_io_end(io_end); + if (io_end->flag & EXT4_IO_END_FAILED || + (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec))) { + ext4_add_complete_io(io_end); return; } - ext4_add_complete_io(io_end); + ext4_release_io_end(io_end); } } int ext4_put_io_end(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) + if (ext4_io_end_need_completion(io_end)) return ext4_end_io_end(io_end); ext4_release_io_end(io_end); @@ -355,7 +375,7 @@ static void ext4_end_bio(struct bio *bio) blk_status_to_errno(bio->bi_status)); } - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (ext4_io_end_need_completion(io_end)) { /* * Link bio into list hanging from io_end. We have to do it * atomically as bio completions can be racing against each -- 2.39.2