On Thu, Feb 19, 2015 at 09:48:45AM +1100, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > A new fsync vs power fail test in xfstests indicated that XFS can > have unreliable data consistency when doing extending truncates that > require block zeroing. The blocks beyond EOF get zeroed in memory, > but we never force those changes to disk before we run the > transaction that extends the file size and exposes those blocks to > userspace. This can result in the blocks not being correctly zeroed > after a crash. > > Because in-memory behaviour is correct, tools like fsx don't pick up > any coherency problems - it's not until the filesystem is shutdown > or the system crashes after writing the truncate transaction to the > journal but before the zeroed data in the page cache is flushed that > the issue is exposed. > > Fix this by also flushing the dirty data in memory region between > the old size and new size when we've found blocks that need zeroing > in the truncate process. > > Reported-by: Liu Bo <bo.li.liu@xxxxxxxxxx> > cc: <stable@xxxxxxxxxxxxxxx> > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- Reviewed-by: Brian Foster <bfoster@xxxxxxxxxx> > fs/xfs/xfs_file.c | 14 ++++++++++---- > fs/xfs/xfs_inode.h | 9 +++++---- > fs/xfs/xfs_iops.c | 36 ++++++++++++++---------------------- > 3 files changed, 29 insertions(+), 30 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index ce615d1..a2e1cb8 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -397,7 +397,8 @@ STATIC int /* error (positive) */ > xfs_zero_last_block( > struct xfs_inode *ip, > xfs_fsize_t offset, > - xfs_fsize_t isize) > + xfs_fsize_t isize, > + bool *did_zeroing) > { > struct xfs_mount *mp = ip->i_mount; > xfs_fileoff_t last_fsb = XFS_B_TO_FSBT(mp, isize); > @@ -425,6 +426,7 @@ xfs_zero_last_block( > zero_len = mp->m_sb.sb_blocksize - zero_offset; > if (isize + zero_len > offset) > zero_len = offset - isize; > + *did_zeroing = true; > return xfs_iozero(ip, isize, zero_len); > } > > @@ -443,7 +445,8 @@ int /* error (positive) */ > xfs_zero_eof( > struct xfs_inode *ip, > xfs_off_t offset, /* starting I/O offset */ > - xfs_fsize_t isize) /* current inode size */ > + xfs_fsize_t isize, /* current inode size */ > + bool *did_zeroing) > { > struct xfs_mount *mp = ip->i_mount; > xfs_fileoff_t start_zero_fsb; > @@ -465,7 +468,7 @@ xfs_zero_eof( > * We only zero a part of that block so it is handled specially. > */ > if (XFS_B_FSB_OFFSET(mp, isize) != 0) { > - error = xfs_zero_last_block(ip, offset, isize); > + error = xfs_zero_last_block(ip, offset, isize, did_zeroing); > if (error) > return error; > } > @@ -525,6 +528,7 @@ xfs_zero_eof( > if (error) > return error; > > + *did_zeroing = true; > start_zero_fsb = imap.br_startoff + imap.br_blockcount; > ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); > } > @@ -567,13 +571,15 @@ restart: > * having to redo all checks before. > */ > if (*pos > i_size_read(inode)) { > + bool zero = false; > + > if (*iolock == XFS_IOLOCK_SHARED) { > xfs_rw_iunlock(ip, *iolock); > *iolock = XFS_IOLOCK_EXCL; > xfs_rw_ilock(ip, *iolock); > goto restart; > } > - error = xfs_zero_eof(ip, *pos, i_size_read(inode)); > + error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero); > if (error) > return error; > } > diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h > index 8e82b41..c73b63d 100644 > --- a/fs/xfs/xfs_inode.h > +++ b/fs/xfs/xfs_inode.h > @@ -384,10 +384,11 @@ enum xfs_prealloc_flags { > XFS_PREALLOC_INVISIBLE = (1 << 4), > }; > > -int xfs_update_prealloc_flags(struct xfs_inode *, > - enum xfs_prealloc_flags); > -int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); > -int xfs_iozero(struct xfs_inode *, loff_t, size_t); > +int xfs_update_prealloc_flags(struct xfs_inode *ip, > + enum xfs_prealloc_flags flags); > +int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, > + xfs_fsize_t isize, bool *did_zeroing); > +int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); > > > /* from xfs_iops.c */ > diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c > index d7782ae..3ccc28e 100644 > --- a/fs/xfs/xfs_iops.c > +++ b/fs/xfs/xfs_iops.c > @@ -756,6 +756,7 @@ xfs_setattr_size( > int error; > uint lock_flags = 0; > uint commit_flags = 0; > + bool did_zeroing = false; > > trace_xfs_setattr(ip); > > @@ -799,20 +800,16 @@ xfs_setattr_size( > return error; > > /* > - * Now we can make the changes. Before we join the inode to the > - * transaction, take care of the part of the truncation that must be > - * done without the inode lock. This needs to be done before joining > - * the inode to the transaction, because the inode cannot be unlocked > - * once it is a part of the transaction. > + * File data changes must be complete before we start the transaction to > + * modify the inode. This needs to be done before joining the inode to > + * the transaction because the inode cannot be unlocked once it is a > + * part of the transaction. > + * > + * Start with zeroing any data block beyond EOF that we may expose on > + * file extension. > */ > if (newsize > oldsize) { > - /* > - * Do the first part of growing a file: zero any data in the > - * last block that is beyond the old EOF. We need to do this > - * before the inode is joined to the transaction to modify > - * i_size. > - */ > - error = xfs_zero_eof(ip, newsize, oldsize); > + error = xfs_zero_eof(ip, newsize, oldsize, &did_zeroing); > if (error) > return error; > } > @@ -822,23 +819,18 @@ xfs_setattr_size( > * any previous writes that are beyond the on disk EOF and the new > * EOF that have not been written out need to be written here. If we > * do not write the data out, we expose ourselves to the null files > - * problem. > - * > - * Only flush from the on disk size to the smaller of the in memory > - * file size or the new size as that's the range we really care about > - * here and prevents waiting for other data not within the range we > - * care about here. > + * problem. Note that this includes any block zeroing we did above; > + * otherwise those blocks may not be zeroed after a crash. > */ > - if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { > + if (newsize > ip->i_d.di_size && > + (oldsize != ip->i_d.di_size || did_zeroing)) { > error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, > ip->i_d.di_size, newsize); > if (error) > return error; > } > > - /* > - * Wait for all direct I/O to complete. > - */ > + /* Now wait for all direct I/O to complete. */ > inode_dio_wait(inode); > > /* > -- > 2.0.0 > > _______________________________________________ > xfs mailing list > xfs@xxxxxxxxxxx > http://oss.sgi.com/mailman/listinfo/xfs _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs