Re: [PATCH v7] fs: Fix page cache inconsistency when mixing buffered and AIO DIO

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Al, Jens,

could either of you please take this through your tree?

Thanks!
-Lukas

On Tue, Aug 15, 2017 at 03:28:54PM +0200, Lukas Czerner wrote:
> Currently when mixing buffered reads and asynchronous direct writes it
> is possible to end up with the situation where we have stale data in the
> page cache while the new data is already written to disk. This is
> permanent until the affected pages are flushed away. Despite the fact
> that mixing buffered and direct IO is ill-advised, it does pose a threat
> to data integrity, is unexpected and should be fixed.
> 
> Fix this by deferring completion of asynchronous direct writes to a
> process context in the case that there are mapped pages to be found in
> the inode. Later before the completion in dio_complete() invalidate
> the pages in question. This ensures that after the completion the pages
> in the written area are either unmapped, or populated with up-to-date
> data. Also do the same for the iomap case which uses
> iomap_dio_complete() instead.
> 
> This has a side effect of deferring the completion to a process context
> for every AIO DIO that happens on an inode that has pages mapped. However,
> since the consensus is that this is an ill-advised practice, the performance
> implication should not be a problem.
> 
> This was based on a proposal from Jeff Moyer, thanks!
> 
> Signed-off-by: Lukas Czerner <lczerner@xxxxxxxxxx>
> Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
> ---
> v2: Remove leftover ret variable from invalidate call in iomap_dio_complete
> v3: Do not invalidate in case of error. Add some comments
> v4: Remove unnecessary variable, remove unnecessary inner braces
> v5: Style changes
> v6: Remove redundant invalidatepage, add warning and comment
> v7: Run invalidation conditionally from generic_file_direct_write()
> 
>  fs/direct-io.c | 49 +++++++++++++++++++++++++++++++++++++++++++------
>  fs/iomap.c     | 29 ++++++++++++++++-------------
>  mm/filemap.c   | 10 ++++++++--
>  3 files changed, 67 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index 08cf278..ffb9e19 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  {
>  	loff_t offset = dio->iocb->ki_pos;
>  	ssize_t transferred = 0;
> +	int err;
>  
>  	/*
>  	 * AIO submission can race with bio completion to get here while
> @@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
>  	if (ret == 0)
>  		ret = transferred;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (ret > 0 && dio->op == REQ_OP_WRITE &&
> +	    dio->inode->i_mapping->nrpages) {
> +		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
> +					offset >> PAGE_SHIFT,
> +					(offset + ret - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(err);
> +	}
> +
>  	if (dio->end_io) {
> -		int err;
>  
>  		// XXX: ki_pos??
>  		err = dio->end_io(dio->iocb, offset, ret, dio->private);
> @@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
>  	struct dio *dio = bio->bi_private;
>  	unsigned long remaining;
>  	unsigned long flags;
> +	bool defer_completion = false;
>  
>  	/* cleanup the bio */
>  	dio_bio_complete(dio, bio);
> @@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
>  	spin_unlock_irqrestore(&dio->bio_lock, flags);
>  
>  	if (remaining == 0) {
> -		if (dio->result && dio->defer_completion) {
> +		/*
> +		 * Defer completion when defer_completion is set or
> +		 * when the inode has pages mapped and this is AIO write.
> +		 * We need to invalidate those pages because there is a
> +		 * chance they contain stale data in the case buffered IO
> +		 * went in between AIO submission and completion into the
> +		 * same region.
> +		 */
> +		if (dio->result)
> +			defer_completion = dio->defer_completion ||
> +					   (dio->op == REQ_OP_WRITE &&
> +					    dio->inode->i_mapping->nrpages);
> +		if (defer_completion) {
>  			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
>  			queue_work(dio->inode->i_sb->s_dio_done_wq,
>  				   &dio->complete_work);
> @@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
>  	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
>  	 * so that we can call ->fsync.
>  	 */
> -	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
> -	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
> -	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
> -		retval = dio_set_defer_completion(dio);
> +	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
> +		retval = 0;
> +		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
> +		    IS_SYNC(iocb->ki_filp->f_mapping->host))
> +			retval = dio_set_defer_completion(dio);
> +		else if (!dio->inode->i_sb->s_dio_done_wq) {
> +			/*
> +			 * In case of AIO write racing with buffered read we
> +			 * need to defer completion. We can't decide this now,
> +			 * however the workqueue needs to be initialized here.
> +			 */
> +			retval = sb_init_dio_done_wq(dio->inode->i_sb);
> +		}
>  		if (retval) {
>  			/*
>  			 * We grab i_mutex only for reads so we don't have
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 0392661..c3e299a 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -713,8 +713,24 @@ struct iomap_dio {
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
>  {
>  	struct kiocb *iocb = dio->iocb;
> +	struct inode *inode = file_inode(iocb->ki_filp);
>  	ssize_t ret;
>  
> +	/*
> +	 * Try again to invalidate clean pages which might have been cached by
> +	 * non-direct readahead, or faulted in by get_user_pages() if the source
> +	 * of the write was an mmap'ed region of the file we're writing.  Either
> +	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> +	 * this invalidation fails, tough, the write still worked...
> +	 */
> +	if (!dio->error &&
> +	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
> +		ret = invalidate_inode_pages2_range(inode->i_mapping,
> +				iocb->ki_pos >> PAGE_SHIFT,
> +				(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
> +		WARN_ON_ONCE(ret);
> +	}
> +
>  	if (dio->end_io) {
>  		ret = dio->end_io(iocb,
>  				dio->error ? dio->error : dio->size,
> @@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>  
>  	ret = iomap_dio_complete(dio);
>  
> -	/*
> -	 * Try again to invalidate clean pages which might have been cached by
> -	 * non-direct readahead, or faulted in by get_user_pages() if the source
> -	 * of the write was an mmap'ed region of the file we're writing.  Either
> -	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
> -	 * this invalidation fails, tough, the write still worked...
> -	 */
> -	if (iov_iter_rw(iter) == WRITE) {
> -		int err = invalidate_inode_pages2_range(mapping,
> -				start >> PAGE_SHIFT, end >> PAGE_SHIFT);
> -		WARN_ON_ONCE(err);
> -	}
> -
>  	return ret;
>  
>  out_free_dio:
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a497024..9440e02 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2885,9 +2885,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
>  	 * we're writing.  Either one is a pretty crazy thing to do,
>  	 * so we don't support it 100%.  If this invalidation
>  	 * fails, tough, the write still worked...
> +	 *
> +	 * Most of the time we do not need this since dio_complete() will do
> +	 * the invalidation for us. However there are some file systems that
> +	 * do not end up with dio_complete() being called, so let's not break
> +	 * them by removing it completely
>  	 */
> -	invalidate_inode_pages2_range(mapping,
> -				pos >> PAGE_SHIFT, end);
> +	if (mapping->nrpages)
> +		invalidate_inode_pages2_range(mapping,
> +					pos >> PAGE_SHIFT, end);
>  
>  	if (written > 0) {
>  		pos += written;
> -- 
> 2.7.5
> 



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux