On Tue, Oct 04, 2016 at 10:39:09AM -0700, Darrick J. Wong wrote: > On Tue, Oct 04, 2016 at 12:38:23PM -0400, Brian Foster wrote: > > On Thu, Sep 29, 2016 at 08:09:01PM -0700, Darrick J. Wong wrote: > > > Wire up iomap_begin to detect shared extents and create delayed allocation > > > extents in the CoW fork: > > > > > > 1) Check if we already have an extent in the COW fork for the area. > > > If so nothing to do, we can move along. > > > 2) Look up block number for the current extent, and if there is none > > > it's not shared move along. > > > 3) Unshare the current extent as far as we are going to write into it. > > > For this we avoid an additional COW fork lookup and use the > > > information we set aside in step 1) above. > > > 4) Goto 1) unless we've covered the whole range. > > > > > > Last but not least, this updates the xfs_reflink_reserve_cow_range calling > > > convention to pass a byte offset and length, as that is what both callers > > > expect anyway. This patch has been refactored considerably as part of the > > > iomap transition. > > > > > > Signed-off-by: Darrick J. 
Wong <darrick.wong@xxxxxxxxxx> > > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > > > --- > > > fs/xfs/xfs_iomap.c | 12 ++- > > > fs/xfs/xfs_reflink.c | 202 ++++++++++++++++++++++++++++++++++++++++++++++++++ > > > fs/xfs/xfs_reflink.h | 9 ++ > > > 3 files changed, 221 insertions(+), 2 deletions(-) > > > > > > > > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c > > > index 59c7beb..e8312b0 100644 > > > --- a/fs/xfs/xfs_iomap.c > > > +++ b/fs/xfs/xfs_iomap.c > > > @@ -39,6 +39,7 @@ > > > #include "xfs_quota.h" > > > #include "xfs_dquot_item.h" > > > #include "xfs_dquot.h" > > > +#include "xfs_reflink.h" > > > > > > > > > #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ > > > @@ -961,8 +962,15 @@ xfs_file_iomap_begin( > > > if (XFS_FORCED_SHUTDOWN(mp)) > > > return -EIO; > > > > > > - if ((flags & IOMAP_WRITE) && > > > - !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { > > > + if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { > > > + error = xfs_reflink_reserve_cow_range(ip, offset, length); > > > + if (error < 0) > > > + return error; > > > + } > > > + > > > + if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && > > > + !xfs_get_extsz_hint(ip)) { > > > + /* Reserve delalloc blocks for regular writeback. */ > > > return xfs_file_iomap_begin_delay(inode, offset, length, flags, > > > iomap); > > > } > > > > What about the short write case? E.g., do we have to clear out delalloc > > blocks from the cow fork in iomap_end() if we don't end up using them? > > Nope, unused blocks sit around in the CoW fork (with the cowextsize hint > set, this happens all the time) so that a subsequent write to an > adjacent file offset lands in the same place as the successful write. > The unused extents get cleaned out when the inode is evicted, we run out > of disk space, or the garbage collector triggers. > Interesting.. ok, I suppose I'll get to that bit eventually. :P Thanks. 
Brian > > > diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c > > > index 7adbb83..05a7fe6 100644 > > > --- a/fs/xfs/xfs_reflink.c > > > +++ b/fs/xfs/xfs_reflink.c > > > @@ -51,6 +51,7 @@ > > > #include "xfs_btree.h" > > > #include "xfs_bmap_btree.h" > > > #include "xfs_reflink.h" > > > +#include "xfs_iomap.h" > > > > > > /* > > > * Copy on Write of Shared Blocks > > > @@ -112,3 +113,204 @@ > > > * ioend structure. Better yet, the more ground we can cover with one > > > * ioend, the better. > > > */ > > > + > > > +/* > > > + * Given an AG extent, find the lowest-numbered run of shared blocks within > > > + * that range and return the range in fbno/flen. > > > + */ > > > +int > > > +xfs_reflink_find_shared( > > > + struct xfs_mount *mp, > > > + xfs_agnumber_t agno, > > > + xfs_agblock_t agbno, > > > + xfs_extlen_t aglen, > > > + xfs_agblock_t *fbno, > > > + xfs_extlen_t *flen, > > > + bool find_maximal) > > > +{ > > > + struct xfs_buf *agbp; > > > + struct xfs_btree_cur *cur; > > > + int error; > > > + > > > + error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); > > > + if (error) > > > + return error; > > > + > > > + cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL); > > > + > > > + error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen, > > > + find_maximal); > > > + > > > + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); > > > + > > > + xfs_buf_relse(agbp); > > > + return error; > > > +} > > > + > > > +/* > > > + * Trim the mapping to the next block where there's a change in the > > > + * shared/unshared status. More specifically, this means that we > > > + * find the lowest-numbered extent of shared blocks that coincides with > > > + * the given block mapping. If the shared extent overlaps the start of > > > + * the mapping, trim the mapping to the end of the shared extent. If > > > + * the shared region intersects the mapping, trim the mapping to the > > > + * start of the shared extent. 
If there are no shared regions that > > > + * overlap, just return the original extent. > > > + */ > > > +int > > > +xfs_reflink_trim_around_shared( > > > + struct xfs_inode *ip, > > > + struct xfs_bmbt_irec *irec, > > > + bool *shared, > > > + bool *trimmed) > > > +{ > > > + xfs_agnumber_t agno; > > > + xfs_agblock_t agbno; > > > + xfs_extlen_t aglen; > > > + xfs_agblock_t fbno; > > > + xfs_extlen_t flen; > > > + int error = 0; > > > + > > > + /* Holes, unwritten, and delalloc extents cannot be shared */ > > > + if (!xfs_is_reflink_inode(ip) || > > > + ISUNWRITTEN(irec) || > > > + irec->br_startblock == HOLESTARTBLOCK || > > > + irec->br_startblock == DELAYSTARTBLOCK) { > > > + *shared = false; > > > + return 0; > > > + } > > > + > > > + trace_xfs_reflink_trim_around_shared(ip, irec); > > > + > > > + agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock); > > > + agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock); > > > + aglen = irec->br_blockcount; > > > + > > > + error = xfs_reflink_find_shared(ip->i_mount, agno, agbno, > > > + aglen, &fbno, &flen, true); > > > + if (error) > > > + return error; > > > + > > > + *shared = *trimmed = false; > > > + if (flen == 0) { > > > > Preferable to use NULLAGBLOCK for this, imo. > > Yeah, I will look into changing this. > > > > + /* No shared blocks at all. */ > > > + return 0; > > > + } else if (fbno == agbno) { > > > + /* The start of this extent is shared. */ > > > + irec->br_blockcount = flen; > > > + *shared = true; > > > + *trimmed = true; > > > > Why do we set trimmed based solely on fbno == agbno? Is that valid if > > the bmapbt extent exactly matches the refcntbt extent and we thus don't > > actually modify the extent (e.g., br_blockcount == flen)? It's hard to > > tell because trimmed looks unused (to this point?), so I could just > > misunderstand the meaning. > > You're right, we don't have to set trimmed if flen == aglen. 
> > > > + return 0; > > > + } else { > > > + /* There's a shared extent midway through this extent. */ > > > + irec->br_blockcount = fbno - agbno; > > > > Don't we have to push the startblock forward in this case? > > > > Oh, I see. We trim the unshared length to push the fileoffset fsb to the > > start of the shared region for the next iteration. > > Yep. I'll clarify the comment. > > --D > > > > > Brian > > > > > + *trimmed = true; > > > + return 0; > > > + } > > > +} > > > + > > > +/* Create a CoW reservation for a range of blocks within a file. */ > > > +static int > > > +__xfs_reflink_reserve_cow( > > > + struct xfs_inode *ip, > > > + xfs_fileoff_t *offset_fsb, > > > + xfs_fileoff_t end_fsb) > > > +{ > > > + struct xfs_bmbt_irec got, prev, imap; > > > + xfs_fileoff_t orig_end_fsb; > > > + int nimaps, eof = 0, error = 0; > > > + bool shared = false, trimmed = false; > > > + xfs_extnum_t idx; > > > + > > > + /* Already reserved? Skip the refcount btree access. */ > > > + xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx, > > > + &got, &prev); > > > + if (!eof && got.br_startoff <= *offset_fsb) { > > > + end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount; > > > + trace_xfs_reflink_cow_found(ip, &got); > > > + goto done; > > > + } > > > + > > > + /* Read extent from the source file. */ > > > + nimaps = 1; > > > + error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb, > > > + &imap, &nimaps, 0); > > > + if (error) > > > + goto out_unlock; > > > + ASSERT(nimaps == 1); > > > + > > > + /* Trim the mapping to the nearest shared extent boundary. */ > > > + error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); > > > + if (error) > > > + goto out_unlock; > > > + > > > + end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount; > > > + > > > + /* Not shared? Just report the (potentially capped) extent. 
*/ > > > + if (!shared) > > > + goto done; > > > + > > > + /* > > > + * Fork all the shared blocks from our write offset until the end of > > > + * the extent. > > > + */ > > > + error = xfs_qm_dqattach_locked(ip, 0); > > > + if (error) > > > + goto out_unlock; > > > + > > > +retry: > > > + error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb, > > > + end_fsb - *offset_fsb, &got, > > > + &prev, &idx, eof); > > > + switch (error) { > > > + case 0: > > > + break; > > > + case -ENOSPC: > > > + case -EDQUOT: > > > + /* retry without any preallocation */ > > > + trace_xfs_reflink_cow_enospc(ip, &imap); > > > + if (end_fsb != orig_end_fsb) { > > > + end_fsb = orig_end_fsb; > > > + goto retry; > > > + } > > > + /*FALLTHRU*/ > > > + default: > > > + goto out_unlock; > > > + } > > > + > > > + trace_xfs_reflink_cow_alloc(ip, &got); > > > +done: > > > + *offset_fsb = end_fsb; > > > +out_unlock: > > > + return error; > > > +} > > > + > > > +/* Create a CoW reservation for part of a file. 
*/ > > > +int > > > +xfs_reflink_reserve_cow_range( > > > + struct xfs_inode *ip, > > > + xfs_off_t offset, > > > + xfs_off_t count) > > > +{ > > > + struct xfs_mount *mp = ip->i_mount; > > > + xfs_fileoff_t offset_fsb, end_fsb; > > > + int error; > > > + > > > + trace_xfs_reflink_reserve_cow_range(ip, offset, count); > > > + > > > + offset_fsb = XFS_B_TO_FSBT(mp, offset); > > > + end_fsb = XFS_B_TO_FSB(mp, offset + count); > > > + > > > + xfs_ilock(ip, XFS_ILOCK_EXCL); > > > + while (offset_fsb < end_fsb) { > > > + error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb); > > > + if (error) { > > > + trace_xfs_reflink_reserve_cow_range_error(ip, error, > > > + _RET_IP_); > > > + break; > > > + } > > > + } > > > + xfs_iunlock(ip, XFS_ILOCK_EXCL); > > > + > > > + return error; > > > +} > > > diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h > > > index 820b151..f824f87 100644 > > > --- a/fs/xfs/xfs_reflink.h > > > +++ b/fs/xfs/xfs_reflink.h > > > @@ -20,4 +20,13 @@ > > > #ifndef __XFS_REFLINK_H > > > #define __XFS_REFLINK_H 1 > > > > > > +extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno, > > > + xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno, > > > + xfs_extlen_t *flen, bool find_maximal); > > > +extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, > > > + struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed); > > > + > > > +extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip, > > > + xfs_off_t offset, xfs_off_t count); > > > + > > > #endif /* __XFS_REFLINK_H */ > > > > > > -- > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > the body of a message to majordomo@xxxxxxxxxxxxxxx > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To 
unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html