On Fri, 2010-07-30 at 08:45 +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > If we get two unaligned direct IO's to the same filesystem block > that is marked as a new allocation (i.e. buffer_new), then both IOs will > zero the portion of the block they are not writing data to. As a > result, when the IOs complete there will be a portion of the block > that contains zeros from the last IO to complete rather than the > data that should be there. > > This is easily manifested by qemu using aio+dio with an unaligned > guest filesystem - every IO is unaligned and fileystem corruption is > encountered in the guest filesystem. xfstest 240 (from Eric Sandeen) > is also a simple reproducer. > > To avoid this problem, track unaligned IO that triggers sub-block zeroing and > check new incoming unaligned IO that require sub-block zeroing against that > list. If we get an overlap where the start and end of unaligned IOs hit the > same filesystem block, then we need to block the incoming IOs until the IO that > is zeroing the block completes. The blocked IO can then continue without > needing to do any zeroing and hence won't overwrite valid data with zeros. > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> I can confirm that, it fixes corruption of my VM images when using AIO +DIO. (cache=none,aio=native). I haven't reviewed the patch closely, but 1) can we do this only for AIO+DIO combination ? For regular DIO, we should have all the IOs serialized by i_mutex anyway.. 2) Having a single global list (for all devices) might cause scaling issues. 3) Are you dropping i_mutex when you are waiting for the zero-out to finish ? Thanks, Badari > --- > fs/direct-io.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- > 1 files changed, 146 insertions(+), 6 deletions(-) > > diff --git a/fs/direct-io.c b/fs/direct-io.c > index a10cb91..611524e 100644 > --- a/fs/direct-io.c > +++ b/fs/direct-io.c > @@ -71,6 +71,9 @@ struct dio { > unsigned start_zero_done; /* flag: sub-blocksize zeroing has > been performed at the start of a > write */ > +#define LAST_SECTOR ((sector_t)-1LL) > + sector_t zero_block_front; /* fs block we are zeroing at front */ > + sector_t zero_block_rear; /* fs block we are zeroing at rear */ > int pages_in_io; /* approximate total IO pages */ > size_t size; /* total request size (doesn't change)*/ > sector_t block_in_file; /* Current offset into the underlying > @@ -135,6 +138,101 @@ struct dio { > struct page *pages[DIO_PAGES]; /* page buffer */ > }; > > + > +/* > + * record fs blocks we are doing zeroing on in a zero block list. > + * unaligned IO is not very performant and so is relatively uncommon, > + * so a global list should be sufficent to track them. > + */ > +struct dio_zero_block { > + struct list_head dio_list; /* list of io in progress */ > + sector_t zero_block; /* block being zeroed */ > + struct dio *dio; /* owner dio */ > + wait_queue_head_t wq; /* New IO block here */ > + atomic_t ref; /* reference count */ > +}; > + > +DEFINE_SPINLOCK(dio_zero_block_lock); > +LIST_HEAD(dio_zero_block_list); > + > +/* > + * Add a filesystem block to the list of blocks we are tracking. > + */ > +static void > +dio_start_zero_block(struct dio *dio, sector_t zero_block) > +{ > + struct dio_zero_block *zb; > + > + zb = kmalloc(sizeof(*zb), GFP_NOIO); > + if (!zb) > + return; > + INIT_LIST_HEAD(&zb->dio_list); > + init_waitqueue_head(&zb->wq); > + zb->zero_block = zero_block; > + zb->dio = dio; > + atomic_set(&zb->ref, 1); > + > + spin_lock(&dio_zero_block_lock); > + list_add(&zb->dio_list, &dio_zero_block_list); > + spin_unlock(&dio_zero_block_lock); > +} > + > +static void > +dio_drop_zero_block(struct dio_zero_block *zb) > +{ > + if (atomic_dec_and_test(&zb->ref)) > + kfree(zb); > +} > + > +/* > + * Check whether a filesystem block is currently being zeroed, and if it is > + * wait for it to complete before returning. If we waited for a block being > + * zeroed, return 1 to indicate that the block is already initialised, > + * otherwise return 0 to indicate that it needs zeroing. > + */ > +static int > +dio_wait_zero_block(struct dio *dio, sector_t zero_block) > +{ > + struct dio_zero_block *zb; > + > + spin_lock(&dio_zero_block_lock); > + list_for_each_entry(zb, &dio_zero_block_list, dio_list) { > + if (zb->dio->inode != dio->inode) > + continue; > + if (zb->zero_block != zero_block) > + continue; > + atomic_inc(&zb->ref); > + spin_unlock(&dio_zero_block_lock); > + wait_event(zb->wq, (list_empty(&zb->dio_list))); > + dio_drop_zero_block(zb); > + return 1; > + } > + spin_unlock(&dio_zero_block_lock); > + return 0; > +} > + > +/* > + * Complete a block zeroing and wake up anyone waiting for it. > + */ > +static void dio_end_zero_block(struct dio *dio, sector_t zero_block) > +{ > + struct dio_zero_block *zb; > + > + spin_lock(&dio_zero_block_lock); > + list_for_each_entry(zb, &dio_zero_block_list, dio_list) { > + if (zb->dio->inode != dio->inode) > + continue; > + if (zb->zero_block != zero_block) > + continue; > + list_del_init(&zb->dio_list); > + spin_unlock(&dio_zero_block_lock); > + wake_up(&zb->wq); > + dio_drop_zero_block(zb); > + return; > + } > + spin_unlock(&dio_zero_block_lock); > +} > + > /* > * How many pages are in the queue? > */ > @@ -253,6 +351,11 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) > aio_complete(dio->iocb, ret, 0); > } > > + if (dio->zero_block_front != LAST_SECTOR) > + dio_end_zero_block(dio, dio->zero_block_front); > + if (dio->zero_block_rear != LAST_SECTOR) > + dio_end_zero_block(dio, dio->zero_block_rear); > + > if (dio->flags & DIO_LOCKING) > /* lockdep: non-owner release */ > up_read_non_owner(&dio->inode->i_alloc_sem); > @@ -777,6 +880,12 @@ static void clean_blockdev_aliases(struct dio *dio) > * block with zeros. This happens only if user-buffer, fileoffset or > * io length is not filesystem block-size multiple. > * > + * We need to track the blocks we are zeroing. If we have concurrent IOs that hit > + * the same start or end block, we do not want all the IOs to zero the portion > + * they are not writing data to as that will overwrite data from the other IOs. > + * Hence we need to block until the first unaligned IO completes before we can > + * continue (without executing any zeroing). > + * > * `end' is zero if we're doing the start of the IO, 1 at the end of the > * IO. > */ > @@ -784,8 +893,8 @@ static void dio_zero_block(struct dio *dio, int end) > { > unsigned dio_blocks_per_fs_block; > unsigned this_chunk_blocks; /* In dio_blocks */ > - unsigned this_chunk_bytes; > struct page *page; > + sector_t fsblock; > > dio->start_zero_done = 1; > if (!dio->blkfactor || !buffer_new(&dio->map_bh)) > @@ -797,17 +906,41 @@ static void dio_zero_block(struct dio *dio, int end) > if (!this_chunk_blocks) > return; > > + if (end) > + this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; > + > /* > * We need to zero out part of an fs block. It is either at the > - * beginning or the end of the fs block. > + * beginning or the end of the fs block, but first we need to check if > + * there is already a zeroing being run on this block. > + * > + * If we are doing a sub-block IO (i.e. zeroing both front and rear of > + * the same block) we don't need to wait or set a gaurd for the rear of > + * the block as we already have one set. > */ > - if (end) > - this_chunk_blocks = dio_blocks_per_fs_block - this_chunk_blocks; > + fsblock = dio->block_in_file >> dio->blkfactor; > + if (!end || dio->zero_block_front != fsblock) { > > - this_chunk_bytes = this_chunk_blocks << dio->blkbits; > + /* wait for any zeroing already in progress */ > + if (dio_wait_zero_block(dio, fsblock)) { > + /* skip the range we would have zeroed. */ > + dio->next_block_for_io += this_chunk_blocks; > + return; > + } > + > + /* > + * we are going to zero stuff now, so set a guard to catch > + * others that might want to zero the same block. > + */ > + dio_start_zero_block(dio, fsblock); > + if (end) > + dio->zero_block_rear = fsblock; > + else > + dio->zero_block_front = fsblock; > + } > > page = ZERO_PAGE(0); > - if (submit_page_section(dio, page, 0, this_chunk_bytes, > + if (submit_page_section(dio, page, 0, this_chunk_blocks << dio->blkbits, > dio->next_block_for_io)) > return; > > @@ -1191,6 +1324,13 @@ __blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode, > */ > memset(dio, 0, offsetof(struct dio, pages)); > > + /* > + * zero_blocks need to initialised to largeѕt value to avoid > + * matching the zero block accidentally. > + */ > + dio->zero_block_front = LAST_SECTOR; > + dio->zero_block_rear = LAST_SECTOR; > + > dio->flags = flags; > if (dio->flags & DIO_LOCKING) { > /* watch out for a 0 len io from a tricksy fs */ > -- > 1.7.1 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html