On Fri, Jul 27, 2018 at 12:25:56PM -0400, Brian Foster wrote: > On Fri, Jul 27, 2018 at 09:02:38AM -0700, Darrick J. Wong wrote: > > On Fri, Jul 27, 2018 at 10:23:48AM -0400, Brian Foster wrote: > > > On Wed, Jul 25, 2018 at 05:19:55PM -0700, Darrick J. Wong wrote: > > > > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > > > > > > > Regenerate the AGF from the rmap data. > > > > > > > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > > > --- > > > > > > Mostly seems sane to me. I still need to come up to speed on the broader > > > xfs_scrub context. A few comments in the meantime.. > > > > <nod> Thanks for taking a look at this series. :) > > > > > > fs/xfs/scrub/agheader_repair.c | 366 ++++++++++++++++++++++++++++++++++++++++ > > > > fs/xfs/scrub/repair.c | 27 ++- > > > > fs/xfs/scrub/repair.h | 4 > > > > fs/xfs/scrub/scrub.c | 2 > > > > 4 files changed, 389 insertions(+), 10 deletions(-) > > > > > > > > > > > > diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c > > > > index 1e96621ece3a..938af216cb1c 100644 > > > > --- a/fs/xfs/scrub/agheader_repair.c > > > > +++ b/fs/xfs/scrub/agheader_repair.c > > > ... > > > > @@ -54,3 +61,362 @@ xrep_superblock( > > > > xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); > > > > return error; > > > > } > > > ... > > > > +/* Update all AGF fields which derive from btree contents. */ > > > > +STATIC int > > > > +xrep_agf_calc_from_btrees( > > > > + struct xfs_scrub *sc, > > > > + struct xfs_buf *agf_bp) > > > > +{ > > > > + struct xrep_agf_allocbt raa = { .sc = sc }; > > > > + struct xfs_btree_cur *cur = NULL; > > > > + struct xfs_agf *agf = XFS_BUF_TO_AGF(agf_bp); > > > > + struct xfs_mount *mp = sc->mp; > > > > + xfs_agblock_t btreeblks; > > > > + xfs_agblock_t blocks; > > > > + int error; > > > > + > > > > + /* Update the AGF counters from the bnobt. */ > > > > + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, > > > > + XFS_BTNUM_BNO); > > > > + error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa); > > > > + if (error) > > > > + goto err; > > > > + error = xfs_btree_count_blocks(cur, &blocks); > > > > + if (error) > > > > + goto err; > > > > + xfs_btree_del_cursor(cur, error); > > > > + btreeblks = blocks - 1; > > > > > > Why the -1? We don't count the root or something? > > > > The AGF btreeblks field only counts the number of blocks added to the > > bno/cnt/rmapbt since they were initialized (each with a single root > > block). I find it a little strange not to count the root, but oh well. > > > > Got it. > > > > > + agf->agf_freeblks = cpu_to_be32(raa.freeblks); > > > > + agf->agf_longest = cpu_to_be32(raa.longest); > > > > + > > > > + /* Update the AGF counters from the cntbt. */ > > > > + cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno, > > > > + XFS_BTNUM_CNT); > > > > + error = xfs_btree_count_blocks(cur, &blocks); > > > > + if (error) > > > > + goto err; > > > > + xfs_btree_del_cursor(cur, error); > > > > + btreeblks += blocks - 1; > > > > + > > > > + /* Update the AGF counters from the rmapbt. */ > > > > + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); > > > > + error = xfs_btree_count_blocks(cur, &blocks); > > > > + if (error) > > > > + goto err; > > > > + xfs_btree_del_cursor(cur, error); > > > > + agf->agf_rmap_blocks = cpu_to_be32(blocks); > > > > + btreeblks += blocks - 1; > > > > + > > > > + agf->agf_btreeblks = cpu_to_be32(btreeblks); > > > > + > > > > + /* Update the AGF counters from the refcountbt. */ > > > > + if (xfs_sb_version_hasreflink(&mp->m_sb)) { > > > > + cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp, > > > > + sc->sa.agno, NULL); > > > > > > FYI this fails to compile on for-next (dfops param has been removed). > > > > Yeah, I'm working on a rebase to for-next (once I settle on the locking > > question in hch's "reduce cow lookups" series). > > > > > > + error = xfs_btree_count_blocks(cur, &blocks); > > > > + if (error) > > > > + goto err; > > > > + xfs_btree_del_cursor(cur, error); > > > > + agf->agf_refcount_blocks = cpu_to_be32(blocks); > > > > + } > > > > + > > > > + return 0; > > > > +err: > > > > + xfs_btree_del_cursor(cur, error); > > > > + return error; > > > > +} > > > > + > ... > > > > +/* Repair the AGF. v5 filesystems only. */ > > > > +int > > > > +xrep_agf( > > > > + struct xfs_scrub *sc) > > > > +{ > > > > + struct xrep_find_ag_btree fab[XREP_AGF_MAX] = { > > > > + [XREP_AGF_BNOBT] = { > > > > + .rmap_owner = XFS_RMAP_OWN_AG, > > > > + .buf_ops = &xfs_allocbt_buf_ops, > > > > + .magic = XFS_ABTB_CRC_MAGIC, > > > > + }, > > > > + [XREP_AGF_CNTBT] = { > > > > + .rmap_owner = XFS_RMAP_OWN_AG, > > > > + .buf_ops = &xfs_allocbt_buf_ops, > > > > + .magic = XFS_ABTC_CRC_MAGIC, > > > > + }, > > > > + [XREP_AGF_RMAPBT] = { > > > > + .rmap_owner = XFS_RMAP_OWN_AG, > > > > + .buf_ops = &xfs_rmapbt_buf_ops, > > > > + .magic = XFS_RMAP_CRC_MAGIC, > > > > + }, > > > > + [XREP_AGF_REFCOUNTBT] = { > > > > + .rmap_owner = XFS_RMAP_OWN_REFC, > > > > + .buf_ops = &xfs_refcountbt_buf_ops, > > > > + .magic = XFS_REFC_CRC_MAGIC, > > > > + }, > > > > + [XREP_AGF_END] = { > > > > + .buf_ops = NULL, > > > > + }, > > > > + }; > > > > + struct xfs_agf old_agf; > > > > + struct xfs_mount *mp = sc->mp; > > > > + struct xfs_buf *agf_bp; > > > > + struct xfs_buf *agfl_bp; > > > > + struct xfs_agf *agf; > > > > + int error; > > > > + > > > > + /* We require the rmapbt to rebuild anything. */ > > > > + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) > > > > + return -EOPNOTSUPP; > > > > + > > > > + xchk_perag_get(sc->mp, &sc->sa); > > > > + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, > > > > + XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)), > > > > + XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL); > > > > + if (error) > > > > + return error; > > > > + agf_bp->b_ops = &xfs_agf_buf_ops; > > > > > > Any reason we don't call xfs_read_agf() here? It looks like we use the > > > similar helper for the agfl below. > > > > We're grabbing the agf buffer without read verifiers so that we can > > reinitialize it. Note that scrub tries xfs_read_agf, and if it fails > > with -EFSCORRUPTED/-EFSBADCRC it marks the agf as corrupt, so it's > > possible that sc->sa.sa_agf is still null. > > > > Ah, makes sense. I missed the NULL b_ops.. > > > This probably could have been trans_get_buf though... > > > > > > + agf = XFS_BUF_TO_AGF(agf_bp); > > > > + > > > > + /* > > > > + * Load the AGFL so that we can screen out OWN_AG blocks that are on > > > > + * the AGFL now; these blocks might have once been part of the > > > > + * bno/cnt/rmap btrees but are not now. This is a chicken and egg > > > > + * problem: the AGF is corrupt, so we have to trust the AGFL contents > > > > + * because we can't do any serious cross-referencing with any of the > > > > + * btrees rooted in the AGF. If the AGFL contents are obviously bad > > > > + * then we'll bail out. > > > > + */ > > > > + error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp); > > > > + if (error) > > > > + return error; > > > > + > > > > + /* > > > > + * Spot-check the AGFL blocks; if they're obviously corrupt then > > > > + * there's nothing we can do but bail out. > > > > + */ > > > > > > Why? Can't we reset the agfl, or is that handled elsewhere? > > > > It's handled in xrep_agfl, but userspace will have to call us again to > > fix the agfl and then call us a third time about the agf repair. > > > > Ok, this was one of the things I feel like I don't have enough context > on wrt to online repair: in general, what dependent structures we expect > to be consistent in order to repair some other interrelated structure. > Userspace repair is straightforward in this regard since we slurp the > whole fs into memory, adjust the global state as we go, then essentially > regenerate new metadata based on the finalized state. <nod> > For online repair, it sounds like we're potentially limited because > repair of one structure may always depend on some other subset of > metadata being consistent, right? Correct. There's an implicit assumption here (which is coded into a warning message in xfs_scrub) that if the primary and secondary metadata (rmapbt usually) are corrupt then the fs may be unrecoverable online... > If so, is the goal of online repair to essentially "do the best we can > but otherwise the most severe corruptions may have to always fall back > to xfs_repair?" ...and so the only recourse is xfs_repair, with the usual caveat that a severely damaged filesystem might be a total loss. If the online repair fails then the fs will be shut down (either due to cancelling a dirty repair transaction or because xfs_scrub can call FS_IOC_SHUTDOWN after a failure). > So in this particular case, we expect a sane agfl and otherwise buzz off > because 1.) this is a targeted agf repair request and 2.) we have a > separate request to deal with the agfl. It sounds like the smarts to > understand how we might have to jump back and forth between them is in > userspace, so the end-user doesn't necessarily have to understand the > dependency. Correct. Userspace should be running xfs_scrub, which knows in which order metadata has to be checked, and what dependencies must be satisfied for repairs to succeed. While it's possible to invoke it manually via xfs_io, in practice nobody but xfstests should be using that method of invocation. --D > Brian > > > (xfs_scrub does this, naturally...) > > > > --D > > > > > Brian > > > > > > > + error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(agf_bp), agfl_bp, > > > > + xrep_agf_check_agfl_block, sc); > > > > + if (error) > > > > + return error; > > > > + > > > > + /* > > > > + * Find the AGF btree roots. This is also a chicken-and-egg situation; > > > > + * see the function for more details. > > > > + */ > > > > + error = xrep_agf_find_btrees(sc, agf_bp, fab, agfl_bp); > > > > + if (error) > > > > + return error; > > > > + > > > > + /* Start rewriting the header and implant the btrees we found. */ > > > > + xrep_agf_init_header(sc, agf_bp, &old_agf); > > > > + xrep_agf_set_roots(sc, agf, fab); > > > > + error = xrep_agf_calc_from_btrees(sc, agf_bp); > > > > + if (error) > > > > + goto out_revert; > > > > + > > > > + /* Commit the changes and reinitialize incore state. */ > > > > + return xrep_agf_commit_new(sc, agf_bp); > > > > + > > > > +out_revert: > > > > + /* Mark the incore AGF state stale and revert the AGF. */ > > > > + sc->sa.pag->pagf_init = 0; > > > > + memcpy(agf, &old_agf, sizeof(old_agf)); > > > > + return error; > > > > +} > > > > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c > > > > index 85b048b341a0..17cf48564390 100644 > > > > --- a/fs/xfs/scrub/repair.c > > > > +++ b/fs/xfs/scrub/repair.c > > > > @@ -128,9 +128,12 @@ xrep_roll_ag_trans( > > > > int error; > > > > > > > > /* Keep the AG header buffers locked so we can keep going. */ > > > > - xfs_trans_bhold(sc->tp, sc->sa.agi_bp); > > > > - xfs_trans_bhold(sc->tp, sc->sa.agf_bp); > > > > - xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); > > > > + if (sc->sa.agi_bp) > > > > + xfs_trans_bhold(sc->tp, sc->sa.agi_bp); > > > > + if (sc->sa.agf_bp) > > > > + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); > > > > + if (sc->sa.agfl_bp) > > > > + xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); > > > > > > > > /* Roll the transaction. */ > > > > error = xfs_trans_roll(&sc->tp); > > > > @@ -138,9 +141,12 @@ xrep_roll_ag_trans( > > > > goto out_release; > > > > > > > > /* Join AG headers to the new transaction. */ > > > > - xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); > > > > - xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); > > > > - xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); > > > > + if (sc->sa.agi_bp) > > > > + xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); > > > > + if (sc->sa.agf_bp) > > > > + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); > > > > + if (sc->sa.agfl_bp) > > > > + xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); > > > > > > > > return 0; > > > > > > > > @@ -150,9 +156,12 @@ xrep_roll_ag_trans( > > > > * buffers will be released during teardown on our way out > > > > * of the kernel. > > > > */ > > > > - xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); > > > > - xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); > > > > - xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp); > > > > + if (sc->sa.agi_bp) > > > > + xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); > > > > + if (sc->sa.agf_bp) > > > > + xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); > > > > + if (sc->sa.agfl_bp) > > > > + xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp); > > > > > > > > return error; > > > > } > > > > diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h > > > > index 5a4e92221916..1d283360b5ab 100644 > > > > --- a/fs/xfs/scrub/repair.h > > > > +++ b/fs/xfs/scrub/repair.h > > > > @@ -58,6 +58,8 @@ int xrep_ino_dqattach(struct xfs_scrub *sc); > > > > > > > > int xrep_probe(struct xfs_scrub *sc); > > > > int xrep_superblock(struct xfs_scrub *sc); > > > > +int xrep_agf(struct xfs_scrub *sc); > > > > +int xrep_agfl(struct xfs_scrub *sc); > > > > > > > > #else > > > > > > > > @@ -81,6 +83,8 @@ xrep_calc_ag_resblks( > > > > > > > > #define xrep_probe xrep_notsupported > > > > #define xrep_superblock xrep_notsupported > > > > +#define xrep_agf xrep_notsupported > > > > +#define xrep_agfl xrep_notsupported > > > > > > > > #endif /* CONFIG_XFS_ONLINE_REPAIR */ > > > > > > > > diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c > > > > index 6efb926f3cf8..1e8a17c8e2b9 100644 > > > > --- a/fs/xfs/scrub/scrub.c > > > > +++ b/fs/xfs/scrub/scrub.c > > > > @@ -214,7 +214,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { > > > > .type = ST_PERAG, > > > > .setup = xchk_setup_fs, > > > > .scrub = xchk_agf, > > > > - .repair = xrep_notsupported, > > > > + .repair = xrep_agf, > > > > }, > > > > [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ > > > > .type = ST_PERAG, > > > > > > > > -- > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > > the body of a message to majordomo@xxxxxxxxxxxxxxx > > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > > > -- > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > > > the body of a message to majordomo@xxxxxxxxxxxxxxx > > > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html