On Thu, May 31, 2018 at 08:07:48PM +0200, Christoph Hellwig wrote: > We can handle the existing cow mapping case as a special case directly > in xfs_writepage_map, and share code for allocating delalloc blocks > with regular I/O in xfs_map_blocks. This means we need to always > call xfs_map_blocks for reflink inodes, but we can still skip most of > the work if it turns out that there is no COW mapping overlapping the > current block. > > As a subtle detail we need to start caching holes in the wpc to deal > with the case of COW reservations between EOF. But we'll need that > infrastructure later anyway, so this is no big deal. > > Based on a patch from Dave Chinner. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > --- > fs/xfs/xfs_aops.c | 184 ++++++++++++++++++++++------------------------ > fs/xfs/xfs_aops.h | 4 +- > 2 files changed, 90 insertions(+), 98 deletions(-) > > diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c > index 9b5a4b6e268c..d536509ca05b 100644 > --- a/fs/xfs/xfs_aops.c > +++ b/fs/xfs/xfs_aops.c > @@ -375,70 +375,107 @@ xfs_end_bio( > > STATIC int > xfs_map_blocks( > + struct xfs_writepage_ctx *wpc, > struct inode *inode, > - loff_t offset, > - struct xfs_bmbt_irec *imap, > - int type) > + loff_t offset) > { > struct xfs_inode *ip = XFS_I(inode); > struct xfs_mount *mp = ip->i_mount; > ssize_t count = i_blocksize(inode); > - xfs_fileoff_t offset_fsb, end_fsb; > + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb; > + struct xfs_bmbt_irec imap; > + int whichfork = XFS_DATA_FORK; > int error = 0; > int nimaps = 1; > > if (XFS_FORCED_SHUTDOWN(mp)) > return -EIO; > > - /* > - * Truncate can race with writeback since writeback doesn't take the > - * iolock and truncate decreases the file size before it starts > - * truncating the pages between new_size and old_size. Therefore, we > - * can end up in the situation where writeback gets a CoW fork mapping > - * but the truncate makes the mapping invalid and we end up in here > - * trying to get a new mapping. Bail out here so that we simply never > - * get a valid mapping and so we drop the write altogether. The page > - * truncation will kill the contents anyway. > - */ > - if (type == XFS_IO_COW && offset > i_size_read(inode)) > - return 0; > - > - ASSERT(type != XFS_IO_COW); > - > xfs_ilock(ip, XFS_ILOCK_SHARED); > ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || > (ip->i_df.if_flags & XFS_IFEXTENTS)); > ASSERT(offset <= mp->m_super->s_maxbytes); > > + if (xfs_is_reflink_inode(ip) && > + xfs_reflink_find_cow_mapping(ip, offset, &imap)) { > + xfs_iunlock(ip, XFS_ILOCK_SHARED); > + /* > + * Truncate can race with writeback since writeback doesn't > + * take the iolock and truncate decreases the file size before > + * it starts truncating the pages between new_size and old_size. > + * Therefore, we can end up in the situation where writeback > + * gets a CoW fork mapping but the truncate makes the mapping > + * invalid and we end up in here trying to get a new mapping. > + * bail out here so that we simply never get a valid mapping > + * and so we drop the write altogether. The page truncation > + * will kill the contents anyway. > + */ > + if (offset > i_size_read(inode)) { > + whichfork = XFS_IO_HOLE; Not sure what the point of this assignment is if we're going to return, but that's not a valid fork either way. Did you mean to assign wpc->io_type? > + return 0; > + } > + whichfork = XFS_COW_FORK; > + wpc->io_type = XFS_IO_COW; > + goto allocate_blocks; > + } > + > + /* > + * Map valid and no COW extent in the way? We're done. > + */ > + if (wpc->imap_valid) { > + xfs_iunlock(ip, XFS_ILOCK_SHARED); > + return 0; > + } > + > + /* > + * If we don't have a valid map, now it's time to get a new one for this > + * offset. This will convert delayed allocations (including COW ones) > + * into real extents. > + */ > if (offset > mp->m_super->s_maxbytes - count) > count = mp->m_super->s_maxbytes - offset; > end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); > offset_fsb = XFS_B_TO_FSBT(mp, offset); > error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, > - imap, &nimaps, XFS_BMAPI_ENTIRE); > + &imap, &nimaps, XFS_BMAPI_ENTIRE); > xfs_iunlock(ip, XFS_ILOCK_SHARED); > - > if (error) > return error; > > - if (type == XFS_IO_DELALLOC && > - (!nimaps || isnullstartblock(imap->br_startblock))) { > - error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset, > - imap); > - if (!error) > - trace_xfs_map_blocks_alloc(ip, offset, count, type, imap); > - return error; > + if (!nimaps) { > + /* > + * Lookup returns no match? Beyond eof? regardless, > + * return it as a hole so we don't write it > + */ > + imap.br_startoff = offset_fsb; > + imap.br_blockcount = end_fsb - offset_fsb; > + imap.br_startblock = HOLESTARTBLOCK; > + wpc->io_type = XFS_IO_HOLE; > + } else if (imap.br_startblock == HOLESTARTBLOCK) { > + /* landed in a hole */ > + wpc->io_type = XFS_IO_HOLE; > } > > + if (wpc->io_type == XFS_IO_DELALLOC && > + (!nimaps || isnullstartblock(imap.br_startblock))) > + goto allocate_blocks; > + > #ifdef DEBUG > - if (type == XFS_IO_UNWRITTEN) { > + if (wpc->io_type == XFS_IO_UNWRITTEN) { > ASSERT(nimaps); > - ASSERT(imap->br_startblock != HOLESTARTBLOCK); > - ASSERT(imap->br_startblock != DELAYSTARTBLOCK); > + ASSERT(imap.br_startblock != HOLESTARTBLOCK); > + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); > } > #endif > - if (nimaps) > - trace_xfs_map_blocks_found(ip, offset, count, type, imap); > + wpc->imap = imap; > + trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap); > + return 0; > +allocate_blocks: > + error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap); > + if (error) > + return error; > + wpc->imap = imap; > + trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap); > return 0; > } > > @@ -759,56 +796,6 @@ xfs_aops_discard_page( > xfs_vm_invalidatepage(page, 0, PAGE_SIZE); > } > > -static int > -xfs_map_cow( > - struct xfs_writepage_ctx *wpc, > - struct inode *inode, > - loff_t offset, > - unsigned int *new_type) > -{ > - struct xfs_inode *ip = XFS_I(inode); > - struct xfs_bmbt_irec imap; > - bool is_cow = false; > - int error; > - > - /* > - * If we already have a valid COW mapping keep using it. > - */ > - if (wpc->io_type == XFS_IO_COW) { > - wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset); > - if (wpc->imap_valid) { > - *new_type = XFS_IO_COW; > - return 0; > - } > - } > - > - /* > - * Else we need to check if there is a COW mapping at this offset. > - */ > - xfs_ilock(ip, XFS_ILOCK_SHARED); > - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); > - xfs_iunlock(ip, XFS_ILOCK_SHARED); > - > - if (!is_cow) > - return 0; > - > - /* > - * And if the COW mapping has a delayed extent here we need to > - * allocate real space for it now. > - */ > - if (isnullstartblock(imap.br_startblock)) { > - error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, > - &imap); > - if (error) > - return error; > - } > - > - wpc->io_type = *new_type = XFS_IO_COW; > - wpc->imap_valid = true; > - wpc->imap = imap; > - return 0; > -} > - > /* > * We implement an immediate ioend submission policy here to avoid needing to > * chain multiple ioends and hence nest mempool allocations which can violate > @@ -873,10 +860,13 @@ xfs_writepage_map( > continue; > } > > - if (xfs_is_reflink_inode(XFS_I(inode))) { > - error = xfs_map_cow(wpc, inode, offset, &new_type); > - if (error) > - goto out; > + /* > + * If we already have a valid COW mapping keep using it. > + */ > + if (wpc->io_type == XFS_IO_COW && > + xfs_imap_valid(inode, &wpc->imap, offset)) { > + wpc->imap_valid = true; > + new_type = XFS_IO_COW; > } So this lifts the cow I/O type check from xfs_map_cow(). That path would set imap_valid and new_type and return, which essentially looks like a "blocks found" case where we go ahead and add the buffer to the ioend. Now we set the same values, but we have to call xfs_map_blocks() to make sure we cover the cow blocks overlap case that xfs_map_cow() used to handle. Shouldn't passing this check effectively jump straight to to the buffer add? Otherwise it looks like we unconditionally attempt to allocate blocks in the cow fork. Alternatively... > > if (wpc->io_type != new_type) { > @@ -887,22 +877,22 @@ xfs_writepage_map( > if (wpc->imap_valid) > wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, > offset); > - if (!wpc->imap_valid) { > - error = xfs_map_blocks(inode, offset, &wpc->imap, > - wpc->io_type); > + if (!wpc->imap_valid || xfs_is_reflink_inode(XFS_I(inode))) { > + error = xfs_map_blocks(wpc, inode, offset); ... perhaps this could change to something like: if (!wpc->imap_valid || (xfs_is_reflink_inode(XFS_I(inode)) && wpc->io_type != XFS_IO_COW)) error = ... ... so the logic is a bit more clear. E.g., we always need to look up in the COW fork for reflink inodes unless we're already doing COW I/O. Hm? Brian > if (error) > goto out; > wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, > offset); > } > - if (wpc->imap_valid) { > - lock_buffer(bh); > - if (wpc->io_type != XFS_IO_OVERWRITE) > - xfs_map_at_offset(inode, bh, &wpc->imap, offset); > - xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list); > - count++; > - } > > + if (!wpc->imap_valid || wpc->io_type == XFS_IO_HOLE) > + continue; > + > + lock_buffer(bh); > + if (wpc->io_type != XFS_IO_OVERWRITE) > + xfs_map_at_offset(inode, bh, &wpc->imap, offset); > + xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list); > + count++; > } while (offset += len, ((bh = bh->b_this_page) != head)); > > ASSERT(wpc->ioend || list_empty(&submit_list)); > diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h > index 69346d460dfa..b2ef5b661761 100644 > --- a/fs/xfs/xfs_aops.h > +++ b/fs/xfs/xfs_aops.h > @@ -29,6 +29,7 @@ enum { > XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */ > XFS_IO_OVERWRITE, /* covers already allocated extent */ > XFS_IO_COW, /* covers copy-on-write extent */ > + XFS_IO_HOLE, /* covers region without any block allocation */ > }; > > #define XFS_IO_TYPES \ > @@ -36,7 +37,8 @@ enum { > { XFS_IO_DELALLOC, "delalloc" }, \ > { XFS_IO_UNWRITTEN, "unwritten" }, \ > { XFS_IO_OVERWRITE, "overwrite" }, \ > - { XFS_IO_COW, "CoW" } > + { XFS_IO_COW, "CoW" }, \ > + { XFS_IO_HOLE, "hole" } > > /* > * Structure for buffered I/O completions. > -- > 2.17.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html