Re: [PATCH] dio: track and serialise unaligned direct IO

Badari Pulavarty <pbadari@xxxxxxxxx> · Fri, 30 Jul 2010 10:43:09 -0700

On Fri, 2010-07-30 at 08:45 +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> If we get two unaligned direct IO's to the same filesystem block
> that is marked as a new allocation (i.e. buffer_new), then both IOs will
> zero the portion of the block they are not writing data to. As a
> result, when the IOs complete there will be a portion of the block
> that contains zeros from the last IO to complete rather than the
> data that should be there.
> 
> This is easily manifested by qemu using aio+dio with an unaligned
> guest filesystem - every IO is unaligned and fileystem corruption is
> encountered in the guest filesystem. xfstest 240 (from Eric Sandeen)
> is also a simple reproducer.
> 
> To avoid this problem, track unaligned IO that triggers sub-block zeroing and
> check new incoming unaligned IO that require sub-block zeroing against that
> list. If we get an overlap where the start and end of unaligned IOs hit the
> same filesystem block, then we need to block the incoming IOs until the IO that
> is zeroing the block completes. The blocked IO can then continue without
> needing to do any zeroing and hence won't overwrite valid data with zeros.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>

I can confirm that, it  fixes corruption  of my VM images when using AIO
+DIO. (cache=none,aio=native). I haven't reviewed the patch closely, but

1) can we do this only for AIO+DIO combination ? For regular DIO, we
should have all the IOs serialized by i_mutex anyway..

2) Having a single global list (for all devices) might cause scaling
issues.

3) Are you dropping i_mutex when you are waiting for the zero-out to
finish ?

Thanks,
Badari
> ---
>  fs/direct-io.c |  152 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 146 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/direct-io.c b/fs/direct-io.c
> index a10cb91..611524e 100644
> --- a/fs/direct-io.c
> +++ b/fs/direct-io.c
> @@ -71,6 +71,9 @@ struct dio {
>  	unsigned start_zero_done;	/* flag: sub-blocksize zeroing has
>  					   been performed at the start of a
>  					   write */
> +#define LAST_SECTOR ((sector_t)-1LL)
> +	sector_t zero_block_front;	/* fs block we are zeroing at front */
> +	sector_t zero_block_rear;	/* fs block we are zeroing at rear */
>  	int pages_in_io;		/* approximate total IO pages */
>  	size_t	size;			/* total request size (doesn't change)*/
>  	sector_t block_in_file;		/* Current offset into the underlying
> @@ -135,6 +138,101 @@ struct dio {
>  	struct page *pages[DIO_PAGES];	/* page buffer */
>  };
>  
> +
> +/*
> + * record fs blocks we are doing zeroing on in a zero block list.
> + * unaligned IO is not very performant and so is relatively uncommon,
> + * so a global list should be sufficent to track them.
> + */
> +struct dio_zero_block {
> +	struct list_head dio_list;	/* list of io in progress */
> +	sector_t	zero_block;	/* block being zeroed */
> +	struct dio	*dio;		/* owner dio */
> +	wait_queue_head_t wq;		/* New IO block here */
> +	atomic_t	ref;		/* reference count */
> +};
> +
> +DEFINE_SPINLOCK(dio_zero_block_lock);
> +LIST_HEAD(dio_zero_block_list);
> +
> +/*
> + * Add a filesystem block to the list of blocks we are tracking.
> + */
> +static void
> +dio_start_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	zb = kmalloc(sizeof(*zb), GFP_NOIO);
> +	if (!zb)
> +		return;
> +	INIT_LIST_HEAD(&zb->dio_list);
> +	init_waitqueue_head(&zb->wq);
> +	zb->zero_block = zero_block;
> +	zb->dio = dio;
> +	atomic_set(&zb->ref, 1);
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_add(&zb->dio_list, &dio_zero_block_list);
> +	spin_unlock(&dio_zero_block_lock);
> +}
> +
> +static void
> +dio_drop_zero_block(struct dio_zero_block *zb)
> +{
> +	if (atomic_dec_and_test(&zb->ref))
> +		kfree(zb);
> +}
> +
> +/*
> + * Check whether a filesystem block is currently being zeroed, and if it is
> + * wait for it to complete before returning. If we waited for a block being
> + * zeroed, return 1 to indicate that the block is already initialised,
> + * otherwise return 0 to indicate that it needs zeroing.
> + */
> +static int
> +dio_wait_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
> +		if (zb->dio->inode != dio->inode)
> +			continue;
> +		if (zb->zero_block != zero_block)
> +			continue;
> +		atomic_inc(&zb->ref);
> +		spin_unlock(&dio_zero_block_lock);
> +		wait_event(zb->wq, (list_empty(&zb->dio_list)));
> +		dio_drop_zero_block(zb);
> +		return 1;
> +	}
> +	spin_unlock(&dio_zero_block_lock);
> +	return 0;
> +}
> +
> +/*
> + * Complete a block zeroing and wake up anyone waiting for it.
> + */
> +static void dio_end_zero_block(struct dio *dio, sector_t zero_block)
> +{
> +	struct dio_zero_block *zb;
> +
> +	spin_lock(&dio_zero_block_lock);
> +	list_for_each_entry(zb, &dio_zero_block_list, dio_list) {
> +		if (zb->dio->inode != dio->inode)
> +			continue;
> +		if (zb->zero_block != zero_block)
> +			continue;
> +		list_del_init(&zb->dio_list);
> +		spin_unlock(&dio_zero_block_lock);
> +		wake_up(&zb->wq);
> +		dio_drop_zero_block(zb);
> +		return;
> +	}
> +	spin_unlock(&dio_zero_block_lock);
> +}
> +
>  /*
>   * How many pages are in the queue?
>   */
> @@ -253,6 +351,11 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async)
>  		aio_complete(dio->iocb, ret, 0);
>  	}
>  
> +	if (dio->zero_block_front != LAST_SECTOR)
> +		dio_end_zero_block(dio, dio->zero_block_front);
> +	if (dio->zero_block_rear != LAST_SECTOR)
> +		dio_end_zero_block(dio, dio->zero_block_rear);
> +
>  	if (dio->flags & DIO_LOCKING)
>  		/* lockdep: non-owner release */
>  		up_read_non_owner(&dio->inode->i_alloc_sem);
> @@ -777,6 +880,12 @@ static void clean_blockdev_aliases(struct dio *dio)
>   * block with zeros. This happens only if user-buffer, fileoffset or
>   * io length is not filesystem block-size multiple.
>   *
> + * We need to track the blocks we are zeroing. If we have concurrent IOs that hit
> + * the same start or end block, we do not want all the IOs to zero the portion
> + * they are not writing data to as that will overwrite data from the other IOs.
> + * Hence we need to block until the first unaligned IO completes before we can
> + * continue (without executing any zeroing).
> + *
>   * `end' is zero if we're doing the start of the IO, 1 at the end of the
>   * IO.
>   */
> @@ -784,8 +893,8 @@ static void dio_zero_block(struct dio *dio, int end)
>  {
>  	unsigned dio_blocks_per_fs_block;
>  	unsigned this_chunk_blocks;	/* In dio_blocks */
> -	unsigned this_chunk_bytes;
>  	struct page *page;
> +	sector_t fsblock;
>  
>  	dio->start_zero_done = 1;
>  	if (!dio->blkfactor || !buffer_new(&dio->map_bh))
> @@ -797,17 +906,41 @@ static void dio_zero_block(struct dio *dio, int end)
>  	if (!this_chunk_blocks)
>  		return;
>  
> +	if (end)
> +		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
> +
>  	/*
>  	 * We need to zero out part of an fs block.  It is either at the
> -	 * beginning or the end of the fs block.
> +	 * beginning or the end of the fs block, but first we need to check if
> +	 * there is already a zeroing being run on this block.
> +	 *
> +	 * If we are doing a sub-block IO (i.e. zeroing both front and rear of
> +	 * the same block) we don't need to wait or set a gaurd for the rear of
> +	 * the block as we already have one set.
>  	 */
> -	if (end) 
> -		this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks;
> +	fsblock = dio->block_in_file >> dio->blkfactor;
> +	if (!end || dio->zero_block_front != fsblock) {
>  
> -	this_chunk_bytes = this_chunk_blocks << dio->blkbits;
> +		/* wait for any zeroing already in progress */
> +		if (dio_wait_zero_block(dio, fsblock)) {
> +			/* skip the range we would have zeroed. */
> +			dio->next_block_for_io += this_chunk_blocks;
> +			return;
> +		}
> +
> +		/*
> +		 * we are going to zero stuff now, so set a guard to catch
> +		 * others that might want to zero the same block.
> +		 */
> +		dio_start_zero_block(dio, fsblock);
> +		if (end)
> +			dio->zero_block_rear = fsblock;
> +		else
> +			dio->zero_block_front = fsblock;
> +	}
>  
>  	page = ZERO_PAGE(0);
> -	if (submit_page_section(dio, page, 0, this_chunk_bytes, 
> +	if (submit_page_section(dio, page, 0, this_chunk_blocks << dio->blkbits,
>  				dio->next_block_for_io))
>  		return;
>  
> @@ -1191,6 +1324,13 @@ __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
>  	 */
>  	memset(dio, 0, offsetof(struct dio, pages));
>  
> +	/*
> +	 * zero_blocks need to initialised to largeѕt value to avoid
> +	 * matching the zero block accidentally.
> +	 */
> +	dio->zero_block_front = LAST_SECTOR;
> +	dio->zero_block_rear = LAST_SECTOR;
> +
>  	dio->flags = flags;
>  	if (dio->flags & DIO_LOCKING) {
>  		/* watch out for a 0 len io from a tricksy fs */
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs