On Tue, Feb 28, 2017 at 07:11:35AM -0800, Christoph Hellwig wrote: > On Tue, Feb 28, 2017 at 09:59:40AM -0500, Brian Foster wrote: > > Heh. I've appended what I'm currently playing around with. It's > > certainly uglier, but not terrible IMO (outside of the fact that we have > > to look at the buffer_heads). This seems to address the problem, but > > still only lightly tested... > > > > An entirely different approach may be to somehow or another > > differentiate allocated delalloc blocks from "found" delalloc blocks in > > the iomap_begin() handler, and then perhaps encode that into the iomap > > such that the iomap_end() handler has an explicit reference of what to > > punch. Personally, I wouldn't mind doing something like the below short > > term to fix the regression and then incorporate an iomap enhancement to > > break the buffer_head dependency. > > We actually have a IOMAP_F_NEW for this already, but so far it's > only used by the DIO and DAX code. Ok. I think that has the potential to provide a more clean and simple solution. I don't think it's as straightforward of a change to enable that for the buffered write code, however. I don't think we can just set the NEW flag when we do xfs_bmapi_reserve_delalloc() because something like the following would still break: xfs_io -fc "pwrite 16k 4k" -c "pwrite -b 32k 0 32k" <file> If the second write happened to fail, AFAICT it would still punch out the block allocated by the first. So I suppose we'd have to tweak reserve_delalloc() to trim the returned extent, or perhaps add a flag that skips the xfs_bmbt_get_all() call to update got after we insert the extent, and thus only return what was allocated..? (The latter actually seems to work on a very quick test, see appended diff..). Brian ---8<--- diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index a9c66d4..18b927d 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4159,7 +4159,8 @@ xfs_bmapi_reserve_delalloc( xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, - int eof) + int eof, + int flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); @@ -4242,7 +4243,8 @@ xfs_bmapi_reserve_delalloc( * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay * might have merged it into one of the neighbouring ones. */ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + if (flags & XFS_BMAPI_ENTIRE) + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); /* * Tag the inode if blocks were preallocated. Note that COW fork diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index cdef87d..459ba6b 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -243,7 +243,8 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); + struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof, + int flags); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 41662fb..9b1b2a4 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -613,7 +613,8 @@ xfs_file_iomap_begin_delay( retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof); + end_fsb - offset_fsb, prealloc_blocks, &got, &idx, + eof, 0); switch (error) { case 0: break; @@ -629,7 +630,7 @@ xfs_file_iomap_begin_delay( default: goto out_unlock; } - + iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -1071,16 +1072,22 @@ xfs_file_iomap_end_delalloc( struct xfs_inode *ip, loff_t offset, loff_t length, - ssize_t written) + ssize_t written, + struct iomap *iomap) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t start_fsb; xfs_fileoff_t end_fsb; int error = 0; + trace_printk("%d: ino 0x%llx offset %llu len %llu writ %ld new %d\n", __LINE__, + ip->i_ino, offset, length, written, !!(iomap->flags & IOMAP_F_NEW)); + /* behave as if the write failed if drop writes is enabled */ - if (xfs_mp_drop_writes(mp)) + if (xfs_mp_drop_writes(mp)) { + iomap->flags |= IOMAP_F_NEW; written = 0; + } /* * start_fsb refers to the first unused block after a short write. If @@ -1101,7 +1108,7 @@ xfs_file_iomap_end_delalloc( * across the reserve/allocate/unreserve calls. If there are delalloc * blocks in the range, they are ours. */ - if (start_fsb < end_fsb) { + if (iomap->flags & IOMAP_F_NEW && start_fsb < end_fsb) { truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), XFS_FSB_TO_B(mp, end_fsb) - 1); @@ -1131,7 +1138,7 @@ xfs_file_iomap_end( { if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, - length, written); + length, written, iomap); return 0; } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index da6d08f..d76a2b6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -313,7 +313,8 @@ xfs_reflink_reserve_cow( return error; error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - imap->br_blockcount, 0, &got, &idx, eof); + imap->br_blockcount, 0, &got, &idx, eof, + XFS_BMAPI_ENTIRE); if (error == -ENOSPC || error == -EDQUOT) trace_xfs_reflink_cow_enospc(ip, imap); if (error)