Re: [PATCH 04/16] xfs: repair the AGF

Brian Foster <bfoster@xxxxxxxxxx> · Fri, 27 Jul 2018 12:25:56 -0400

On Fri, Jul 27, 2018 at 09:02:38AM -0700, Darrick J. Wong wrote:
> On Fri, Jul 27, 2018 at 10:23:48AM -0400, Brian Foster wrote:
> > On Wed, Jul 25, 2018 at 05:19:55PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> > > 
> > > Regenerate the AGF from the rmap data.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> > > ---
> > 
> > Mostly seems sane to me. I still need to come up to speed on the broader
> > xfs_scrub context. A few comments in the meantime..
> 
> <nod> Thanks for taking a look at this series. :)
> 
> > >  fs/xfs/scrub/agheader_repair.c |  366 ++++++++++++++++++++++++++++++++++++++++
> > >  fs/xfs/scrub/repair.c          |   27 ++-
> > >  fs/xfs/scrub/repair.h          |    4 
> > >  fs/xfs/scrub/scrub.c           |    2 
> > >  4 files changed, 389 insertions(+), 10 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
> > > index 1e96621ece3a..938af216cb1c 100644
> > > --- a/fs/xfs/scrub/agheader_repair.c
> > > +++ b/fs/xfs/scrub/agheader_repair.c
> > ...
> > > @@ -54,3 +61,362 @@ xrep_superblock(
> > >  	xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
> > >  	return error;
> > >  }
> > ...
> > > +/* Update all AGF fields which derive from btree contents. */
> > > +STATIC int
> > > +xrep_agf_calc_from_btrees(
> > > +	struct xfs_scrub	*sc,
> > > +	struct xfs_buf		*agf_bp)
> > > +{
> > > +	struct xrep_agf_allocbt	raa = { .sc = sc };
> > > +	struct xfs_btree_cur	*cur = NULL;
> > > +	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agf_bp);
> > > +	struct xfs_mount	*mp = sc->mp;
> > > +	xfs_agblock_t		btreeblks;
> > > +	xfs_agblock_t		blocks;
> > > +	int			error;
> > > +
> > > +	/* Update the AGF counters from the bnobt. */
> > > +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> > > +			XFS_BTNUM_BNO);
> > > +	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
> > > +	if (error)
> > > +		goto err;
> > > +	error = xfs_btree_count_blocks(cur, &blocks);
> > > +	if (error)
> > > +		goto err;
> > > +	xfs_btree_del_cursor(cur, error);
> > > +	btreeblks = blocks - 1;
> > 
> > Why the -1? We don't count the root or something?
> 
> The AGF btreeblks field only counts the number of blocks added to the
> bno/cnt/rmapbt since they were initialized (each with a single root
> block).  I find it a little strange not to count the root, but oh well.
> 

Got it.

> > > +	agf->agf_freeblks = cpu_to_be32(raa.freeblks);
> > > +	agf->agf_longest = cpu_to_be32(raa.longest);
> > > +
> > > +	/* Update the AGF counters from the cntbt. */
> > > +	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno,
> > > +			XFS_BTNUM_CNT);
> > > +	error = xfs_btree_count_blocks(cur, &blocks);
> > > +	if (error)
> > > +		goto err;
> > > +	xfs_btree_del_cursor(cur, error);
> > > +	btreeblks += blocks - 1;
> > > +
> > > +	/* Update the AGF counters from the rmapbt. */
> > > +	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno);
> > > +	error = xfs_btree_count_blocks(cur, &blocks);
> > > +	if (error)
> > > +		goto err;
> > > +	xfs_btree_del_cursor(cur, error);
> > > +	agf->agf_rmap_blocks = cpu_to_be32(blocks);
> > > +	btreeblks += blocks - 1;
> > > +
> > > +	agf->agf_btreeblks = cpu_to_be32(btreeblks);
> > > +
> > > +	/* Update the AGF counters from the refcountbt. */
> > > +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> > > +		cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp,
> > > +				sc->sa.agno, NULL);
> > 
> > FYI this fails to compile on for-next (dfops param has been removed).
> 
> Yeah, I'm working on a rebase to for-next (once I settle on the locking
> question in hch's "reduce cow lookups" series).
> 
> > > +		error = xfs_btree_count_blocks(cur, &blocks);
> > > +		if (error)
> > > +			goto err;
> > > +		xfs_btree_del_cursor(cur, error);
> > > +		agf->agf_refcount_blocks = cpu_to_be32(blocks);
> > > +	}
> > > +
> > > +	return 0;
> > > +err:
> > > +	xfs_btree_del_cursor(cur, error);
> > > +	return error;
> > > +}
> > > +
...
> > > +/* Repair the AGF. v5 filesystems only. */
> > > +int
> > > +xrep_agf(
> > > +	struct xfs_scrub		*sc)
> > > +{
> > > +	struct xrep_find_ag_btree	fab[XREP_AGF_MAX] = {
> > > +		[XREP_AGF_BNOBT] = {
> > > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > > +			.buf_ops = &xfs_allocbt_buf_ops,
> > > +			.magic = XFS_ABTB_CRC_MAGIC,
> > > +		},
> > > +		[XREP_AGF_CNTBT] = {
> > > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > > +			.buf_ops = &xfs_allocbt_buf_ops,
> > > +			.magic = XFS_ABTC_CRC_MAGIC,
> > > +		},
> > > +		[XREP_AGF_RMAPBT] = {
> > > +			.rmap_owner = XFS_RMAP_OWN_AG,
> > > +			.buf_ops = &xfs_rmapbt_buf_ops,
> > > +			.magic = XFS_RMAP_CRC_MAGIC,
> > > +		},
> > > +		[XREP_AGF_REFCOUNTBT] = {
> > > +			.rmap_owner = XFS_RMAP_OWN_REFC,
> > > +			.buf_ops = &xfs_refcountbt_buf_ops,
> > > +			.magic = XFS_REFC_CRC_MAGIC,
> > > +		},
> > > +		[XREP_AGF_END] = {
> > > +			.buf_ops = NULL,
> > > +		},
> > > +	};
> > > +	struct xfs_agf			old_agf;
> > > +	struct xfs_mount		*mp = sc->mp;
> > > +	struct xfs_buf			*agf_bp;
> > > +	struct xfs_buf			*agfl_bp;
> > > +	struct xfs_agf			*agf;
> > > +	int				error;
> > > +
> > > +	/* We require the rmapbt to rebuild anything. */
> > > +	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
> > > +		return -EOPNOTSUPP;
> > > +
> > > +	xchk_perag_get(sc->mp, &sc->sa);
> > > +	error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp,
> > > +			XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)),
> > > +			XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL);
> > > +	if (error)
> > > +		return error;
> > > +	agf_bp->b_ops = &xfs_agf_buf_ops;
> > 
> > Any reason we don't call xfs_read_agf() here? It looks like we use the
> > similar helper for the agfl below.
> 
> We're grabbing the agf buffer without read verifiers so that we can
> reinitialize it.  Note that scrub tries xfs_read_agf, and if it fails
> with -EFSCORRUPTED/-EFSBADCRC it marks the agf as corrupt, so it's
> possible that sc->sa.sa_agf is still null.
> 

Ah, makes sense. I missed the NULL b_ops..

> This probably could have been trans_get_buf though...
> 
> > > +	agf = XFS_BUF_TO_AGF(agf_bp);
> > > +
> > > +	/*
> > > +	 * Load the AGFL so that we can screen out OWN_AG blocks that are on
> > > +	 * the AGFL now; these blocks might have once been part of the
> > > +	 * bno/cnt/rmap btrees but are not now.  This is a chicken and egg
> > > +	 * problem: the AGF is corrupt, so we have to trust the AGFL contents
> > > +	 * because we can't do any serious cross-referencing with any of the
> > > +	 * btrees rooted in the AGF.  If the AGFL contents are obviously bad
> > > +	 * then we'll bail out.
> > > +	 */
> > > +	error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/*
> > > +	 * Spot-check the AGFL blocks; if they're obviously corrupt then
> > > +	 * there's nothing we can do but bail out.
> > > +	 */
> > 
> > Why? Can't we reset the agfl, or is that handled elsewhere?
> 
> It's handled in xrep_agfl, but userspace will have to call us again to
> fix the agfl and then call us a third time about the agf repair.
> 

Ok, this was one of the things I feel like I don't have enough context
on wrt to online repair: in general, what dependent structures we expect
to be consistent in order to repair some other interrelated structure.
Userspace repair is straightforward in this regard since we slurp the
whole fs into memory, adjust the global state as we go, then essentially
regenerate new metadata based on the finalized state.

For online repair, it sounds like we're potentially limited because
repair of one structure may always depend on some other subset of
metadata being consistent, right? If so, is the goal of online repair to
essentially "do the best we can but otherwise the most severe
corruptions may have to always fall back to xfs_repair?"

So in this particular case, we expect a sane agfl and otherwise buzz off
because 1.) this is a targeted agf repair request and 2.) we have a
separate request to deal with the agfl. It sounds like the smarts to
understand how we might have to jump back and forth between them is in
userspace, so the end-user doesn't necessarily have to understand the
dependency.

Brian

> (xfs_scrub does this, naturally...)
> 
> --D
> 
> > Brian
> > 
> > > +	error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp,
> > > +			xrep_agf_check_agfl_block, sc);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/*
> > > +	 * Find the AGF btree roots.  This is also a chicken-and-egg situation;
> > > +	 * see the function for more details.
> > > +	 */
> > > +	error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp);
> > > +	if (error)
> > > +		return error;
> > > +
> > > +	/* Start rewriting the header and implant the btrees we found. */
> > > +	xrep_agf_init_header(sc, agf_bp, &old_agf);
> > > +	xrep_agf_set_roots(sc, agf, fab);
> > > +	error = xrep_agf_calc_from_btrees(sc, agf_bp);
> > > +	if (error)
> > > +		goto out_revert;
> > > +
> > > +	/* Commit the changes and reinitialize incore state. */
> > > +	return xrep_agf_commit_new(sc, agf_bp);
> > > +
> > > +out_revert:
> > > +	/* Mark the incore AGF state stale and revert the AGF. */
> > > +	sc->sa.pag->pagf_init = 0;
> > > +	memcpy(agf, &old_agf, sizeof(old_agf));
> > > +	return error;
> > > +}
> > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
> > > index 85b048b341a0..17cf48564390 100644
> > > --- a/fs/xfs/scrub/repair.c
> > > +++ b/fs/xfs/scrub/repair.c
> > > @@ -128,9 +128,12 @@ xrep_roll_ag_trans(
> > >  	int			error;
> > >  
> > >  	/* Keep the AG header buffers locked so we can keep going. */
> > > -	xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> > > -	xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> > > -	xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> > > +	if (sc->sa.agi_bp)
> > > +		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
> > > +	if (sc->sa.agf_bp)
> > > +		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
> > > +	if (sc->sa.agfl_bp)
> > > +		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);
> > >  
> > >  	/* Roll the transaction. */
> > >  	error = xfs_trans_roll(&sc->tp);
> > > @@ -138,9 +141,12 @@ xrep_roll_ag_trans(
> > >  		goto out_release;
> > >  
> > >  	/* Join AG headers to the new transaction. */
> > > -	xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> > > -	xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> > > -	xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> > > +	if (sc->sa.agi_bp)
> > > +		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
> > > +	if (sc->sa.agf_bp)
> > > +		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
> > > +	if (sc->sa.agfl_bp)
> > > +		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);
> > >  
> > >  	return 0;
> > >  
> > > @@ -150,9 +156,12 @@ xrep_roll_ag_trans(
> > >  	 * buffers will be released during teardown on our way out
> > >  	 * of the kernel.
> > >  	 */
> > > -	xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> > > -	xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> > > -	xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> > > +	if (sc->sa.agi_bp)
> > > +		xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp);
> > > +	if (sc->sa.agf_bp)
> > > +		xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp);
> > > +	if (sc->sa.agfl_bp)
> > > +		xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp);
> > >  
> > >  	return error;
> > >  }
> > > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
> > > index 5a4e92221916..1d283360b5ab 100644
> > > --- a/fs/xfs/scrub/repair.h
> > > +++ b/fs/xfs/scrub/repair.h
> > > @@ -58,6 +58,8 @@ int xrep_ino_dqattach(struct xfs_scrub *sc);
> > >  
> > >  int xrep_probe(struct xfs_scrub *sc);
> > >  int xrep_superblock(struct xfs_scrub *sc);
> > > +int xrep_agf(struct xfs_scrub *sc);
> > > +int xrep_agfl(struct xfs_scrub *sc);
> > >  
> > >  #else
> > >  
> > > @@ -81,6 +83,8 @@ xrep_calc_ag_resblks(
> > >  
> > >  #define xrep_probe			xrep_notsupported
> > >  #define xrep_superblock			xrep_notsupported
> > > +#define xrep_agf			xrep_notsupported
> > > +#define xrep_agfl			xrep_notsupported
> > >  
> > >  #endif /* CONFIG_XFS_ONLINE_REPAIR */
> > >  
> > > diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> > > index 6efb926f3cf8..1e8a17c8e2b9 100644
> > > --- a/fs/xfs/scrub/scrub.c
> > > +++ b/fs/xfs/scrub/scrub.c
> > > @@ -214,7 +214,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
> > >  		.type	= ST_PERAG,
> > >  		.setup	= xchk_setup_fs,
> > >  		.scrub	= xchk_agf,
> > > -		.repair	= xrep_notsupported,
> > > +		.repair	= xrep_agf,
> > >  	},
> > >  	[XFS_SCRUB_TYPE_AGFL]= {	/* agfl */
> > >  		.type	= ST_PERAG,
> > > 
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@xxxxxxxxxxxxxxx
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html