Re: [PATCH 05/14] xfs: repair free space btrees

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> 
> Rebuild the free space btrees from the gaps in the rmap btree.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> ---
>  fs/xfs/Makefile             |    1 
>  fs/xfs/scrub/alloc.c        |    1 
>  fs/xfs/scrub/alloc_repair.c |  581 +++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/scrub/common.c       |    8 +
>  fs/xfs/scrub/repair.h       |    2 
>  fs/xfs/scrub/scrub.c        |    4 
>  fs/xfs/scrub/trace.h        |    2 
>  fs/xfs/xfs_extent_busy.c    |   14 +
>  fs/xfs/xfs_extent_busy.h    |    2 
>  9 files changed, 610 insertions(+), 5 deletions(-)
>  create mode 100644 fs/xfs/scrub/alloc_repair.c
> 
> 
...
> diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
> new file mode 100644
> index 000000000000..b228c2906de2
> --- /dev/null
> +++ b/fs/xfs/scrub/alloc_repair.c
> @@ -0,0 +1,581 @@
...
> +/* Record extents that aren't in use from gaps in the rmap records. */
> +STATIC int
> +xrep_abt_walk_rmap(
> +	struct xfs_btree_cur	*cur,
> +	struct xfs_rmap_irec	*rec,
> +	void			*priv)
> +{
> +	struct xrep_abt		*ra = priv;
> +	struct xrep_abt_extent	*rae;
> +	xfs_fsblock_t		fsb;
> +	int			error;
> +
> +	/* Record all the OWN_AG blocks... */
> +	if (rec->rm_owner == XFS_RMAP_OWN_AG) {
> +		fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
> +				rec->rm_startblock);
> +		error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount);
> +		if (error)
> +			return error;
> +	}
> +
> +	/* ...and all the rmapbt blocks... */
> +	error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur);
> +	if (error)
> +		return error;
> +
> +	/* ...and all the free space. */
> +	if (rec->rm_startblock > ra->next_bno) {
> +		trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno,
> +				ra->next_bno, rec->rm_startblock - ra->next_bno,
> +				XFS_RMAP_OWN_NULL, 0, 0);
> +
> +		rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL);
> +		if (!rae)
> +			return -ENOMEM;
> +		INIT_LIST_HEAD(&rae->list);
> +		rae->bno = ra->next_bno;
> +		rae->len = rec->rm_startblock - ra->next_bno;
> +		list_add_tail(&rae->list, ra->extlist);

Any reason we don't use a bitmap for this one?

> +		ra->nr_records++;
> +		ra->nr_blocks += rae->len;
> +	}
> +	ra->next_bno = max_t(xfs_agblock_t, ra->next_bno,
> +			rec->rm_startblock + rec->rm_blockcount);

The max_t() is to cover the record overlap case, right? If so, another
one-liner comment here would be good.

> +	return 0;
> +}
> +
...
> +/* Free an extent, which creates a record in the bnobt/cntbt. */
> +STATIC int
> +xrep_abt_free_extent(
> +	struct xfs_scrub	*sc,
> +	xfs_fsblock_t		fsbno,
> +	xfs_extlen_t		len,
> +	struct xfs_owner_info	*oinfo)
> +{
> +	int			error;
> +
> +	error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0);
> +	if (error)
> +		return error;
> +	error = xrep_roll_ag_trans(sc);
> +	if (error)
> +		return error;
> +	return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false);

What's this call for? Is it because the blocks we're freeing were
already free? (Similar question on the other xfs_mod_fdblocks() call
further down).

BTW, what prevents some other task from coming along and screwing with
this? For example, could a large falloc or buffered write come in and
allocate these global blocks before we take them away here (causing the
whole sequence to fail)?

> +}
> +
...
> +/*
> + * Allocate a block from the (cached) first extent in the AG.  In theory
> + * this should never fail, since we already checked that there was enough
> + * space to handle the new btrees.
> + */
> +STATIC xfs_fsblock_t
> +xrep_abt_alloc_block(
> +	struct xfs_scrub	*sc,
> +	struct list_head	*free_extents)
> +{
> +	struct xrep_abt_extent	*ext;
> +
> +	/* Pull the first free space extent off the list, and... */
> +	ext = list_first_entry(free_extents, struct xrep_abt_extent, list);
> +
> +	/* ...take its first block. */
> +	ext->bno++;
> +	ext->len--;
> +	if (ext->len == 0) {
> +		list_del(&ext->list);
> +		kmem_free(ext);
> +	}
> +
> +	return XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, ext->bno - 1);

Looks like a potential use-after-free of ext: if ext->len hits zero the
extent is freed just above, but the return statement still reads
ext->bno afterwards.

> +}
> +
...
> +/*
> + * Reset the global free block counter and the per-AG counters to make it look
> + * like this AG has no free space.
> + */
> +STATIC int
> +xrep_abt_reset_counters(
> +	struct xfs_scrub	*sc,
> +	int			*log_flags)
> +{
> +	struct xfs_perag	*pag = sc->sa.pag;
> +	struct xfs_agf		*agf;
> +	xfs_agblock_t		new_btblks;
> +	xfs_agblock_t		to_free;
> +	int			error;
> +
> +	/*
> +	 * Since we're abandoning the old bnobt/cntbt, we have to decrease
> +	 * fdblocks by the # of blocks in those trees.  btreeblks counts the
> +	 * non-root blocks of the free space and rmap btrees.  Do this before
> +	 * resetting the AGF counters.
> +	 */

Hmm, I'm not quite following the comment wrt to the xfs_mod_fdblocks()
below. to_free looks like it's the count of all current btree blocks
minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating"
those blocks here because we're going to free them later?

> +	agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> +
> +	/* rmap_blocks accounts root block, btreeblks doesn't */
> +	new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1;
> +
> +	/* btreeblks doesn't account bno/cnt root blocks */
> +	to_free = pag->pagf_btreeblks + 2;
> +
> +	/* and don't account for the blocks we aren't freeing */
> +	to_free -= new_btblks;
> +
> +	error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false);
> +	if (error)
> +		return error;
> +
> +	/*
> +	 * Reset the per-AG info, both incore and ondisk.  Mark the incore
> +	 * state stale in case we fail out of here.
> +	 */
> +	ASSERT(pag->pagf_init);
> +	pag->pagf_init = 0;
> +	pag->pagf_btreeblks = new_btblks;
> +	pag->pagf_freeblks = 0;
> +	pag->pagf_longest = 0;
> +
> +	agf->agf_btreeblks = cpu_to_be32(new_btblks);
> +	agf->agf_freeblks = 0;
> +	agf->agf_longest = 0;
> +	*log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS;
> +
> +	return 0;
> +}
> +
> +/* Initialize a new free space btree root and implant into AGF. */
> +STATIC int
> +xrep_abt_reset_btree(
> +	struct xfs_scrub	*sc,
> +	xfs_btnum_t		btnum,
> +	struct list_head	*free_extents)
> +{
> +	struct xfs_owner_info	oinfo;
> +	struct xfs_buf		*bp;
> +	struct xfs_perag	*pag = sc->sa.pag;
> +	struct xfs_mount	*mp = sc->mp;
> +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(sc->sa.agf_bp);
> +	xfs_fsblock_t		fsbno;
> +	int			error;
> +
> +	/* Allocate new root block. */
> +	fsbno = xrep_abt_alloc_block(sc, free_extents);

xrep_abt_alloc_block() converts an agbno to return an fsb. This function
passes the fsb to the init call just below and then converts it back to
an agbno in two places. It seems like there might be fewer conversions to
follow if the above just returned an agbno and we converted it to an fsb
once for xrep_init_btblock().

> +	if (fsbno == NULLFSBLOCK)
> +		return -ENOSPC;
> +
> +	/* Initialize new tree root. */
> +	error = xrep_init_btblock(sc, fsbno, &bp, btnum, &xfs_allocbt_buf_ops);
> +	if (error)
> +		return error;
> +
> +	/* Implant into AGF. */
> +	agf->agf_roots[btnum] = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, fsbno));
> +	agf->agf_levels[btnum] = cpu_to_be32(1);
> +
> +	/* Add rmap records for the btree roots */
> +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> +	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno,
> +			XFS_FSB_TO_AGBNO(mp, fsbno), 1, &oinfo);
> +	if (error)
> +		return error;
> +
> +	/* Reset the incore state. */
> +	pag->pagf_levels[btnum] = 1;
> +
> +	return 0;
> +}
> +
...
> +
> +/*
> + * Make our new freespace btree roots permanent so that we can start freeing
> + * unused space back into the AG.
> + */
> +STATIC int
> +xrep_abt_commit_new(
> +	struct xfs_scrub	*sc,
> +	struct xfs_bitmap	*old_allocbt_blocks,
> +	int			log_flags)
> +{
> +	int			error;
> +
> +	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags);
> +
> +	/* Invalidate the old freespace btree blocks and commit. */
> +	error = xrep_invalidate_blocks(sc, old_allocbt_blocks);
> +	if (error)
> +		return error;

It looks like the above invalidation all happens in the same
transaction. Those aren't logging buffer data or anything, but any idea
how many log formats we can get away with in this single transaction?

> +	error = xrep_roll_ag_trans(sc);
> +	if (error)
> +		return error;
> +
> +	/* Now that we've succeeded, mark the incore state valid again. */
> +	sc->sa.pag->pagf_init = 1;
> +	return 0;
> +}
> +
> +/* Build new free space btrees and dispose of the old one. */
> +STATIC int
> +xrep_abt_rebuild_trees(
> +	struct xfs_scrub	*sc,
> +	struct list_head	*free_extents,
> +	struct xfs_bitmap	*old_allocbt_blocks)
> +{
> +	struct xfs_owner_info	oinfo;
> +	struct xrep_abt_extent	*rae;
> +	struct xrep_abt_extent	*n;
> +	struct xrep_abt_extent	*longest;
> +	int			error;
> +
> +	xfs_rmap_skip_owner_update(&oinfo);
> +
> +	/*
> +	 * Insert the longest free extent in case it's necessary to
> +	 * refresh the AGFL with multiple blocks.  If there is no longest
> +	 * extent, we had exactly the free space we needed; we're done.
> +	 */

I'm confused by the last sentence. longest should only be NULL if the
free space list is empty and haven't we already bailed out with -ENOSPC
if that's the case?

> +	longest = xrep_abt_get_longest(free_extents);
> +	if (!longest)
> +		goto done;
> +	error = xrep_abt_free_extent(sc,
> +			XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno),
> +			longest->len, &oinfo);
> +	list_del(&longest->list);
> +	kmem_free(longest);
> +	if (error)
> +		return error;
> +
> +	/* Insert records into the new btrees. */
> +	list_for_each_entry_safe(rae, n, free_extents, list) {
> +		error = xrep_abt_free_extent(sc,
> +				XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno),
> +				rae->len, &oinfo);
> +		if (error)
> +			return error;
> +		list_del(&rae->list);
> +		kmem_free(rae);
> +	}

Ok, at this point we've reset the btree roots and we start freeing the
free ranges that were discovered via the rmapbt analysis. AFAICT, if we
fail or crash at this point, we leave the allocbts in a partially
constructed state. I take it that is Ok with respect to the broader
repair algorithm because we'd essentially start over by inspecting the
rmapbt again on a retry.

The blocks allocated for the btrees that we've begun to construct here
end up mapped in the rmapbt as we go, right? IIUC, that means we don't
necessarily have infinite retries to make sure this completes. IOW,
suppose that a first repair attempt finds just enough free space to
construct new trees, gets far enough along to consume most of that free
space and then crashes. Is it possible that a subsequent repair attempt
includes the btree blocks allocated during the previous failed repair
attempt in the sum of "old btree blocks" and determines we don't have
enough free space to repair?

> +
> +done:
> +	/* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */
> +	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG);
> +	return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo,
> +			XFS_AG_RESV_NONE);
> +}
> +
...
> diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> index 0ed68379e551..82f99633a597 100644
> --- a/fs/xfs/xfs_extent_busy.c
> +++ b/fs/xfs/xfs_extent_busy.c
> @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp(
>  		diff = b1->bno - b2->bno;
>  	return diff;
>  }
> +
> +/* Are there any busy extents in this AG? */
> +bool
> +xfs_extent_busy_list_empty(
> +	struct xfs_perag	*pag)
> +{
> +	spin_lock(&pag->pagb_lock);
> +	if (pag->pagb_tree.rb_node) {

RB_EMPTY_ROOT()?

Brian

> +		spin_unlock(&pag->pagb_lock);
> +		return false;
> +	}
> +	spin_unlock(&pag->pagb_lock);
> +	return true;
> +}
> diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
> index 990ab3891971..2f8c73c712c6 100644
> --- a/fs/xfs/xfs_extent_busy.h
> +++ b/fs/xfs/xfs_extent_busy.h
> @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
>  	list_sort(NULL, list, xfs_extent_busy_ag_cmp);
>  }
>  
> +bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
> +
>  #endif /* __XFS_EXTENT_BUSY_H__ */
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux