If a file is opened with O_SYNC|O_DIRECT, the drive cache does not get flushed after the write completion for AIOs. This patch attempts to fix that problem by marking an I/O as requiring a cache flush in endio processing, and then issuing the cache flush after any unwritten extent conversion is done. From: Jeff Moyer <jmoyer@xxxxxxxxxx> Signed-off-by: Jeff Moyer <jmoyer@xxxxxxxxxx> [darrick.wong@xxxxxxxxxx: Rework patch to use per-mount workqueues] Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --- fs/xfs/xfs_aops.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_aops.h | 1 + fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_super.c | 8 ++++++++ 4 files changed, 61 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e57e2da..9cebbb7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -173,6 +173,24 @@ xfs_setfilesize( } /* + * In the case of synchronous, AIO, O_DIRECT writes, we need to flush + * the disk cache when the I/O is complete. + */ +STATIC bool +xfs_ioend_needs_cache_flush( + struct xfs_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + + if (!(mp->m_flags & XFS_MOUNT_BARRIER)) + return false; + + return IS_SYNC(ioend->io_inode) || + (ioend->io_iocb->ki_filp->f_flags & O_DSYNC); +} + +/* * Schedule IO completion handling on the final put of an ioend. * * If there is no work to do we might as well call it a day and free the @@ -189,11 +207,30 @@ xfs_finish_ioend( queue_work(mp->m_unwritten_workqueue, &ioend->io_work); else if (ioend->io_append_trans) queue_work(mp->m_data_workqueue, &ioend->io_work); + else if (ioend->io_needs_fsync) + queue_work(mp->m_aio_blkdev_flush_wq, &ioend->io_work); else xfs_destroy_ioend(ioend); } } +STATIC int +xfs_ioend_force_cache_flush( + xfs_ioend_t *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + int err = 0; + int datasync; + + datasync = !IS_SYNC(ioend->io_inode) && + !(ioend->io_iocb->ki_filp->f_flags & __O_SYNC); + err = do_xfs_file_fsync(ip, mp, datasync); + xfs_destroy_ioend(ioend); + /* do_xfs_file_fsync returns -errno. our caller expects positive. */ + return -err; +} + /* * IO write completion. */ @@ -250,12 +287,22 @@ xfs_end_io( error = xfs_setfilesize(ioend); if (error) ioend->io_error = -error; + } else if (ioend->io_needs_fsync) { + error = xfs_ioend_force_cache_flush(ioend); + if (error && ioend->io_result > 0) + ioend->io_error = -error; + ioend->io_needs_fsync = 0; } else { ASSERT(!xfs_ioend_is_append(ioend)); } done: - xfs_destroy_ioend(ioend); + /* the honoring of O_SYNC has to be done last */ + if (ioend->io_needs_fsync) { + atomic_inc(&ioend->io_remaining); + xfs_finish_ioend(ioend); + } else + xfs_destroy_ioend(ioend); } /* @@ -292,6 +339,7 @@ xfs_alloc_ioend( atomic_set(&ioend->io_remaining, 1); ioend->io_isasync = 0; ioend->io_isdirect = 0; + ioend->io_needs_fsync = 0; ioend->io_error = 0; ioend->io_list = NULL; ioend->io_type = type; @@ -1409,6 +1457,8 @@ xfs_end_io_direct_write( if (is_async) { ioend->io_isasync = 1; + if (xfs_ioend_needs_cache_flush(ioend)) + ioend->io_needs_fsync = 1; xfs_finish_ioend(ioend); } else { xfs_finish_ioend_sync(ioend); diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index c325abb..e48c7c2 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -47,6 +47,7 @@ typedef struct xfs_ioend { atomic_t io_remaining; /* hold count */ unsigned int io_isasync : 1; /* needs aio_complete */ unsigned int io_isdirect : 1;/* direct I/O */ + unsigned int io_needs_fsync : 1; /* aio+dio+o_sync */ struct inode *io_inode; /* file being written to */ struct buffer_head *io_buffer_head;/* buffer linked list head */ struct buffer_head *io_buffer_tail;/* buffer linked list tail */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index deee09e..ecd3d2e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -209,6 +209,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_data_workqueue; struct workqueue_struct *m_unwritten_workqueue; struct workqueue_struct *m_cil_workqueue; + struct workqueue_struct *m_aio_blkdev_flush_wq; } xfs_mount_t; /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 26a09bd..b05b557 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -863,8 +863,15 @@ xfs_init_mount_workqueues( WQ_MEM_RECLAIM, 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; + + mp->m_aio_blkdev_flush_wq = alloc_workqueue("xfs-aio-blkdev-flush/%s", + WQ_MEM_RECLAIM, 0, mp->m_fsname); + if (!mp->m_aio_blkdev_flush_wq) + goto out_destroy_cil_queue; return 0; +out_destroy_cil_queue: + destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_data_iodone_queue: @@ -877,6 +884,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_aio_blkdev_flush_wq); destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_data_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html