If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get flushed after the write completion. Instead, it's flushed *before* the I/O is sent to the disk (in __generic_file_aio_write). This patch attempts to fix that problem by marking an I/O as requiring a cache flush in endio processing. I'll send a follow-on patch to the generic write code to get rid of the bogus generic_write_sync call when EIOCBQUEUED is returned. From: Jeff Moyer <jmoyer@xxxxxxxxxx> Signed-off-by: Jeff Moyer <jmoyer@xxxxxxxxxx> [darrick.wong@xxxxxxxxxx: Rework original patch to reflect a subsequent ext4 reorganization] Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --- fs/ext4/ext4.h | 9 +++++ fs/ext4/file.c | 2 + fs/ext4/inode.c | 6 +++ fs/ext4/page-io.c | 92 +++++++++++++++++++++++++++++++++++++++++++++-------- fs/ext4/super.c | 13 +++++++ 5 files changed, 106 insertions(+), 16 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c20de1..f5a0b89 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -186,6 +186,7 @@ struct mpage_da_data { #define EXT4_IO_END_ERROR 0x0002 #define EXT4_IO_END_QUEUED 0x0004 #define EXT4_IO_END_DIRECT 0x0008 +#define EXT4_IO_END_NEEDS_SYNC 0x0010 struct ext4_io_page { struct page *p_page; @@ -1279,6 +1280,9 @@ struct ext4_sb_info { /* workqueue for dio unwritten */ struct workqueue_struct *dio_unwritten_wq; + /* workqueue for aio+dio+o_sync disk cache flushing */ + struct workqueue_struct *aio_dio_flush_wq; + /* timer for periodic error stats printing */ struct timer_list s_err_report; @@ -1335,6 +1339,11 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode, } } +static inline int ext4_io_end_deferred(ext4_io_end_t *i) +{ + return i->flag & (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_NEEDS_SYNC); +} + static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode) { return inode->i_private; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index bf3966b..dd34b81 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -155,7 +155,7 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov, ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 || ret == -EIOCBQUEUED) { + if (ret > 0) { ssize_t err; err = generic_write_sync(file, pos, ret); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3c243b..fc4f05e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2893,8 +2893,12 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, iocb->private = NULL; + /* AIO+DIO+O_SYNC I/Os need a cache flush after completion */ + if (is_async && (IS_SYNC(inode) || (iocb->ki_filp->f_flags & O_DSYNC))) + io_end->flag |= EXT4_IO_END_NEEDS_SYNC; + /* if not aio dio with unwritten extents, just free io and return */ - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + if (!ext4_io_end_deferred(io_end)) { ext4_free_io_end(io_end); out: if (is_async) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 68e896e..cda013a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -84,6 +84,50 @@ void ext4_free_io_end(ext4_io_end_t *io) kmem_cache_free(io_end_cachep, io); } +/* + * This function is called in the completion path for AIO O_SYNC|O_DIRECT + * writes, and also in the fsync path. The purpose is to ensure that the + * disk caches for the journal and data devices are flushed. + */ +static int ext4_end_io_do_flush(ext4_io_end_t *io) +{ + struct inode *inode = io->inode; + tid_t commit_tid; + bool needs_barrier = false; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + int barriers_enabled = test_opt(inode->i_sb, BARRIER); + int ret = 0; + + if (!barriers_enabled) + return 0; + + /* + * If we are running in nojournal mode, just flush the disk + * cache and return. + */ + if (!journal) + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL); + + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + goto out; + } + + commit_tid = io->iocb->ki_filp->f_flags & __O_SYNC ? + EXT4_I(inode)->i_sync_tid : EXT4_I(inode)->i_datasync_tid; + if (!jbd2_trans_will_send_data_barrier(journal, commit_tid)) + needs_barrier = true; + + jbd2_log_start_commit(journal, commit_tid); + ret = jbd2_log_wait_commit(journal, commit_tid); + + if (!ret && needs_barrier) + ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_NOIO, NULL); + +out: + return ret; +} + /* check a range of space and convert unwritten extents to written. */ static int ext4_end_io(ext4_io_end_t *io) { @@ -96,21 +140,37 @@ static int ext4_end_io(ext4_io_end_t *io) "list->prev 0x%p\n", io, inode->i_ino, io->list.next, io->list.prev); - ret = ext4_convert_unwritten_extents(inode, offset, size); - if (ret < 0) { - ext4_msg(inode->i_sb, KERN_EMERG, - "failed to convert unwritten extents to written " - "extents -- potential data loss! " - "(inode %lu, offset %llu, size %zd, error %d)", - inode->i_ino, offset, size, ret); + if (io->flag & EXT4_IO_END_UNWRITTEN) { + ret = ext4_convert_unwritten_extents(inode, offset, size); + if (ret < 0) { + ext4_msg(inode->i_sb, KERN_EMERG, + "failed to convert unwritten extents to " + "written extents -- potential data loss! " + "(inode %lu, offset %llu, size %zd, error %d)", + inode->i_ino, offset, size, ret); + goto endio; + } } + + /* + * This function has two callers. The first is the end_io_work + * routine just below, which is an asynchronous completion context. + * The second is in the fsync path. For the latter path, we can't + * return from here until the job is done. Hence, + * ext4_end_io_do_flush is blocking. + */ + if (io->flag & EXT4_IO_END_NEEDS_SYNC) + ret = ext4_end_io_do_flush(io); + +endio: if (io->iocb) aio_complete(io->iocb, io->result, 0); if (io->flag & EXT4_IO_END_DIRECT) inode_dio_done(inode); /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) + if (io->flag & EXT4_IO_END_UNWRITTEN && + atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) wake_up_all(ext4_ioend_wq(io->inode)); return ret; } @@ -149,8 +209,11 @@ void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; + BUG_ON(!ext4_io_end_deferred(io_end)); + if (io_end->flag & EXT4_IO_END_UNWRITTEN) + wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; + else + wq = EXT4_SB(io_end->inode->i_sb)->aio_dio_flush_wq; spin_lock_irqsave(&ei->i_completed_io_lock, flags); if (list_empty(&ei->i_completed_io_list)) { @@ -180,7 +243,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, while (!list_empty(&unwritten)) { io = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); + BUG_ON(!ext4_io_end_deferred(io)); list_del_init(&io->list); err = ext4_end_io(io); @@ -192,7 +255,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode, spin_lock_irqsave(&ei->i_completed_io_lock, flags); while (!list_empty(&complete)) { io = list_entry(complete.next, ext4_io_end_t, list); - io->flag &= ~EXT4_IO_END_UNWRITTEN; + io->flag &= ~(EXT4_IO_END_UNWRITTEN | + EXT4_IO_END_NEEDS_SYNC); /* end_io context can not be destroyed now because it still * used by queued worker. Worker thread will destroy it later */ if (io->flag & EXT4_IO_END_QUEUED) @@ -204,7 +268,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, * flag, and destroy it's end_io if it was converted already */ if (work_io) { work_io->flag &= ~EXT4_IO_END_QUEUED; - if (!(work_io->flag & EXT4_IO_END_UNWRITTEN)) + if (!ext4_io_end_deferred(work_io)) list_add_tail(&work_io->list, &to_free); } spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); @@ -319,7 +383,7 @@ static void ext4_end_bio(struct bio *bio, int error) bi_sector >> (inode->i_blkbits - 9)); } - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + if (!ext4_io_end_deferred(io_end)) { ext4_free_io_end(io_end); return; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 80928f7..c657c4d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -849,7 +849,9 @@ static void ext4_put_super(struct super_block *sb) dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); flush_workqueue(sbi->dio_unwritten_wq); + flush_workqueue(sbi->aio_dio_flush_wq); destroy_workqueue(sbi->dio_unwritten_wq); + destroy_workqueue(sbi->aio_dio_flush_wq); if (sbi->s_journal) { err = jbd2_journal_destroy(sbi->s_journal); @@ -3913,6 +3915,14 @@ no_journal: goto failed_mount_wq; } + EXT4_SB(sb)->aio_dio_flush_wq = + alloc_workqueue("ext4-aio-dio-flush", + WQ_MEM_RECLAIM | WQ_UNBOUND, 1); + if (!EXT4_SB(sb)->aio_dio_flush_wq) { + pr_err("EXT4-fs: failed to create flush workqueue\n"); + goto failed_flush_wq; + } + /* * The jbd2_journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. @@ -4045,6 +4055,8 @@ failed_mount4a: sb->s_root = NULL; failed_mount4: ext4_msg(sb, KERN_ERR, "mount failed"); + destroy_workqueue(EXT4_SB(sb)->aio_dio_flush_wq); +failed_flush_wq: destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); failed_mount_wq: if (sbi->s_journal) { @@ -4494,6 +4506,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->dio_unwritten_wq); + flush_workqueue(sbi->aio_dio_flush_wq); /* * Writeback quota in non-journalled quota case - journalled quota has * no dirty dquots _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs