Re: [PATCH 21/63] xfs: map an inode's offset to an exact physical block

"Darrick J. Wong" <darrick.wong@xxxxxxxxxx> · Tue, 4 Oct 2016 10:28:01 -0700

On Tue, Oct 04, 2016 at 08:43:34AM -0400, Brian Foster wrote:
> On Mon, Oct 03, 2016 at 05:11:19PM -0700, Darrick J. Wong wrote:
> > On Mon, Oct 03, 2016 at 03:03:49PM -0400, Brian Foster wrote:
> > > On Thu, Sep 29, 2016 at 08:07:56PM -0700, Darrick J. Wong wrote:
> > > > Teach the bmap routine to know how to map a range of file blocks to a
> > > > specific range of physical blocks, instead of simply allocating fresh
> > > > blocks.  This enables reflink to map a file to blocks that are already
> > > > in use.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> > > > ---
> > > >  fs/xfs/libxfs/xfs_bmap.c |   63 ++++++++++++++++++++++++++++++++++++++++++++++
> > > >  fs/xfs/libxfs/xfs_bmap.h |   10 +++++++
> > > >  fs/xfs/xfs_trace.h       |   54 +++++++++++++++++++++++++++++++++++++++
> > > >  3 files changed, 126 insertions(+), 1 deletion(-)
> > > > 
> > > > 
> > > > diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> > > > index 907d7b8d..9f145ed 100644
> > > > --- a/fs/xfs/libxfs/xfs_bmap.c
> > > > +++ b/fs/xfs/libxfs/xfs_bmap.c
> > > > @@ -3877,6 +3877,55 @@ xfs_bmap_btalloc(
> > > >  }
> > > >  
> > > >  /*
> > > > + * For a remap operation, just "allocate" an extent at the address that the
> > > > + * caller passed in, and ensure that the AGFL is the right size.  The caller
> > > > + * will then map the "allocated" extent into the file somewhere.
> > > > + */
> > > > +STATIC int
> > > > +xfs_bmap_remap_alloc(
> > > > +	struct xfs_bmalloca	*ap)
> > > > +{
> > > > +	struct xfs_trans	*tp = ap->tp;
> > > > +	struct xfs_mount	*mp = tp->t_mountp;
> > > > +	xfs_agblock_t		bno;
> > > > +	struct xfs_alloc_arg	args;
> > > > +	int			error;
> > > > +
> > > > +	/*
> > > > +	 * validate that the block number is legal - the enables us to detect
> > > > +	 * and handle a silent filesystem corruption rather than crashing.
> > > > +	 */
> > > > +	memset(&args, 0, sizeof(struct xfs_alloc_arg));
> > > > +	args.tp = ap->tp;
> > > > +	args.mp = ap->tp->t_mountp;
> > > > +	bno = *ap->firstblock;
> > > > +	args.agno = XFS_FSB_TO_AGNO(mp, bno);
> > > > +	ASSERT(args.agno < mp->m_sb.sb_agcount);
> > > > +	args.agbno = XFS_FSB_TO_AGBNO(mp, bno);
> > > > +	ASSERT(args.agbno < mp->m_sb.sb_agblocks);
> > > > +
> > > > +	/* "Allocate" the extent from the range we passed in. */
> > > > +	trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length);
> > > > +	ap->blkno = bno;
> > > > +	ap->ip->i_d.di_nblocks += ap->length;
> > > > +	xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
> > > > +
> > > > +	/* Fix the freelist, like a real allocator does. */
> > > > +	args.datatype = ap->datatype;
> > > > +	args.pag = xfs_perag_get(args.mp, args.agno);
> > > > +	ASSERT(args.pag);
> > > > +
> > > > +	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
> > > 
> > > Why the FREEING flag? 
> > 
> > /*
> >  * The freelist fixing code will decline the allocation if
> >  * the size and shape of the free space doesn't allow for
> >  * allocating the extent and updating all the metadata that
> >  * happens during an allocation.  We're remapping, not
> >  * allocating, so skip that check by pretending to be freeing.
> >  */
> > 
> 
> Thanks. This also can bypass fixing up the AG freelist. I suppose that
> won't matter since we aren't going to update either allocbt, but are we
> safe from the rmapbt perspective as well?

In general we'll be fine since the AG reservation code reserves enough
space to feed the rmapbt.  If reflink (which invokes this remapping)
detects that we're low on AG reservation, it'll return ENOSPC and let
userspace do a regular copy (presumably into a less full AG) to avoid
adding pressure on that AG.  COW (the second remap user) should be fine
since the extent being (re)mapped in is a regular non-shared extent with
an rmapping (owned by XFS_REFC_OWN_COW) already in the relevant AG.
The same idea applies to the extent swapper (the third remap user),
though the owner is the donor file.

--D

> 
> Brian
> 
> > > > +	if (error)
> > > > +		goto error0;
> > > > +error0:
> > > > +	xfs_perag_put(args.pag);
> > > > +	if (error)
> > > > +		trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
> > > > +	return error;
> > > > +}
> > > > +
> > > > +/*
> > > >   * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
> > > >   * It figures out where to ask the underlying allocator to put the new extent.
> > > >   */
> > > > @@ -3884,6 +3933,8 @@ STATIC int
> > > >  xfs_bmap_alloc(
> > > >  	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
> > > >  {
> > > > +	if (ap->flags & XFS_BMAPI_REMAP)
> > > > +		return xfs_bmap_remap_alloc(ap);
> > > >  	if (XFS_IS_REALTIME_INODE(ap->ip) &&
> > > >  	    xfs_alloc_is_userdata(ap->datatype))
> > > >  		return xfs_bmap_rtalloc(ap);
> > > > @@ -4442,6 +4493,12 @@ xfs_bmapi_write(
> > > >  	ASSERT(len > 0);
> > > >  	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
> > > >  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> > > > +	if (whichfork == XFS_ATTR_FORK)
> > > > +		ASSERT(!(flags & XFS_BMAPI_REMAP));
> > > 
> > > I think it's better to avoid conditionals if the only affected code
> > > consists of ASSERT() statements (which can be compiled out). E.g., 
> > > 
> > > 	ASSERT(!((flags & XFS_BMAPI_REMAP) && whichfork == XFS_ATTR_FORK));
> > > 
> > > ... and so on, but not a big deal.
> > 
> > <nod> I might as well fix that up too...
> > 
> > --D
> > 
> > > Brian
> > > 
> > > > +	if (flags & XFS_BMAPI_REMAP) {
> > > > +		ASSERT(!(flags & XFS_BMAPI_PREALLOC));
> > > > +		ASSERT(!(flags & XFS_BMAPI_CONVERT));
> > > > +	}
> > > >  
> > > >  	/* zeroing is for currently only for data extents, not metadata */
> > > >  	ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
> > > > @@ -4503,6 +4560,12 @@ xfs_bmapi_write(
> > > >  		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
> > > >  
> > > >  		/*
> > > > +		 * Make sure we only reflink into a hole.
> > > > +		 */
> > > > +		if (flags & XFS_BMAPI_REMAP)
> > > > +			ASSERT(inhole);
> > > > +
> > > > +		/*
> > > >  		 * First, deal with the hole before the allocated space
> > > >  		 * that we found, if any.
> > > >  		 */
> > > > diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> > > > index fcdb094..877b6f9 100644
> > > > --- a/fs/xfs/libxfs/xfs_bmap.h
> > > > +++ b/fs/xfs/libxfs/xfs_bmap.h
> > > > @@ -97,6 +97,13 @@ struct xfs_extent_free_item
> > > >   */
> > > >  #define XFS_BMAPI_ZERO		0x080
> > > >  
> > > > +/*
> > > > + * Map the inode offset to the block given in ap->firstblock.  Primarily
> > > > + * used for reflink.  The range must be in a hole, and this flag cannot be
> > > > + * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork.
> > > > + */
> > > > +#define XFS_BMAPI_REMAP		0x100
> > > > +
> > > >  #define XFS_BMAPI_FLAGS \
> > > >  	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
> > > >  	{ XFS_BMAPI_METADATA,	"METADATA" }, \
> > > > @@ -105,7 +112,8 @@ struct xfs_extent_free_item
> > > >  	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
> > > >  	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
> > > >  	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
> > > > -	{ XFS_BMAPI_ZERO,	"ZERO" }
> > > > +	{ XFS_BMAPI_ZERO,	"ZERO" }, \
> > > > +	{ XFS_BMAPI_REMAP,	"REMAP" }
> > > >  
> > > >  
> > > >  static inline int xfs_bmapi_aflag(int w)
> > > > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> > > > index 195a168..8485984 100644
> > > > --- a/fs/xfs/xfs_trace.h
> > > > +++ b/fs/xfs/xfs_trace.h
> > > > @@ -2965,6 +2965,60 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover,
> > > >  		  __entry->adjusted)
> > > >  );
> > > >  
> > > > +/* simple inode-based error/%ip tracepoint class */
> > > > +DECLARE_EVENT_CLASS(xfs_inode_error_class,
> > > > +	TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip),
> > > > +	TP_ARGS(ip, error, caller_ip),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(xfs_ino_t, ino)
> > > > +		__field(int, error)
> > > > +		__field(unsigned long, caller_ip)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = VFS_I(ip)->i_sb->s_dev;
> > > > +		__entry->ino = ip->i_ino;
> > > > +		__entry->error = error;
> > > > +		__entry->caller_ip = caller_ip;
> > > > +	),
> > > > +	TP_printk("dev %d:%d ino %llx error %d caller %ps",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > +		  __entry->ino,
> > > > +		  __entry->error,
> > > > +		  (char *)__entry->caller_ip)
> > > > +);
> > > > +
> > > > +#define DEFINE_INODE_ERROR_EVENT(name) \
> > > > +DEFINE_EVENT(xfs_inode_error_class, name, \
> > > > +	TP_PROTO(struct xfs_inode *ip, int error, \
> > > > +		 unsigned long caller_ip), \
> > > > +	TP_ARGS(ip, error, caller_ip))
> > > > +
> > > > +/* reflink allocator */
> > > > +TRACE_EVENT(xfs_bmap_remap_alloc,
> > > > +	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno,
> > > > +		 xfs_extlen_t len),
> > > > +	TP_ARGS(ip, fsbno, len),
> > > > +	TP_STRUCT__entry(
> > > > +		__field(dev_t, dev)
> > > > +		__field(xfs_ino_t, ino)
> > > > +		__field(xfs_fsblock_t, fsbno)
> > > > +		__field(xfs_extlen_t, len)
> > > > +	),
> > > > +	TP_fast_assign(
> > > > +		__entry->dev = VFS_I(ip)->i_sb->s_dev;
> > > > +		__entry->ino = ip->i_ino;
> > > > +		__entry->fsbno = fsbno;
> > > > +		__entry->len = len;
> > > > +	),
> > > > +	TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x",
> > > > +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> > > > +		  __entry->ino,
> > > > +		  __entry->fsbno,
> > > > +		  __entry->len)
> > > > +);
> > > > +DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error);
> > > > +
> > > >  #endif /* _TRACE_XFS_H */
> > > >  
> > > >  #undef TRACE_INCLUDE_PATH
> > > > 
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html