From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> Rebuild the free space btrees from the gaps in the rmap btree. Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --- fs/xfs/Makefile | 1 fs/xfs/libxfs/xfs_ag_resv.c | 2 fs/xfs/libxfs/xfs_types.h | 7 fs/xfs/scrub/alloc_repair.c | 721 +++++++++++++++++++++++++++++++++++++++++++ fs/xfs/scrub/common.c | 8 fs/xfs/scrub/repair.h | 11 + fs/xfs/scrub/scrub.c | 11 - fs/xfs/scrub/scrub.h | 8 fs/xfs/scrub/trace.h | 24 + fs/xfs/xfs_extent_busy.c | 13 + fs/xfs/xfs_extent_busy.h | 2 11 files changed, 803 insertions(+), 5 deletions(-) create mode 100644 fs/xfs/scrub/alloc_repair.c diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 2586f07e1fdb..9957fcfad75f 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -159,6 +159,7 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) xfs-y += $(addprefix scrub/, \ agheader_repair.o \ + alloc_repair.o \ array.o \ bitmap.o \ repair.o \ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index fdfe6dc0d307..3dcc87629478 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -383,6 +383,8 @@ xfs_ag_resv_free_extent( /* fall through */ case XFS_AG_RESV_NONE: xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + /* fall through */ + case XFS_AG_RESV_IGNORE: return; } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 397d94775440..50254487e40d 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -173,6 +173,13 @@ enum xfs_ag_resv_type { XFS_AG_RESV_AGFL, XFS_AG_RESV_METADATA, XFS_AG_RESV_RMAPBT, + + /* + * Don't increase fdblocks when freeing extent. This is a pony for + * the bnobt repair functions to re-free the free space without + * altering fdblocks. If you think you need this you're wrong. + */ + XFS_AG_RESV_IGNORE, }; /* diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c new file mode 100644 index 000000000000..c098715f704e --- /dev/null +++ b/fs/xfs/scrub/alloc_repair.c @@ -0,0 +1,721 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2019 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <darrick.wong@xxxxxxxxxx> + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_inode.h" +#include "xfs_refcount.h" +#include "xfs_extent_busy.h" +#include "xfs_health.h" +#include "xfs_bmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/btree.h" +#include "scrub/trace.h" +#include "scrub/repair.h" +#include "scrub/bitmap.h" +#include "scrub/array.h" + +/* + * Free Space Btree Repair + * ======================= + * + * The reverse mappings are supposed to record all space usage for the entire + * AG. Therefore, we can recalculate the free extents in an AG by looking for + * gaps in the physical extents recorded in the rmapbt. On a reflink + * filesystem this is a little more tricky in that we have to be aware that + * the rmap records are allowed to overlap. + * + * We derive which blocks belonged to the old bnobt/cntbt by recording all the + * OWN_AG extents and subtracting out the blocks owned by all other OWN_AG + * metadata: the rmapbt blocks visited while iterating the reverse mappings + * and the AGFL blocks. + * + * Once we have both of those pieces, we can reconstruct the bnobt and cntbt + * by blowing out the free block state and freeing all the extents that we + * found. This adds the requirement that we can't have any busy extents in + * the AG because the busy code cannot handle duplicate records. + * + * Note that we can only rebuild both free space btrees at the same time + * because the regular extent freeing infrastructure loads both btrees at the + * same time. + * + * We use the prefix 'xrep_abt' here because we regenerate both free space + * allocation btrees at the same time. + */ + +struct xrep_abt { + /* Blocks owned by the rmapbt or the agfl. */ + struct xbitmap not_allocbt_blocks; + + /* All OWN_AG blocks. */ + struct xbitmap old_allocbt_blocks; + + /* + * New bnobt information. All btree block reservations are added to + * the reservation list in new_bnobt_info. + */ + struct xrep_newbt new_bnobt_info; + struct xfs_btree_bload bno_bload; + + /* new cntbt information */ + struct xrep_newbt new_cntbt_info; + struct xfs_btree_bload cnt_bload; + + /* Free space extents. */ + struct xfbma *free_records; + + struct xfs_scrub *sc; + + /* Number of non-null records in @free_records. */ + uint64_t nr_real_records; + + /* get_data()'s position in the free space record array. */ + uint64_t iter; + + /* + * Next block we anticipate seeing in the rmap records. If the next + * rmap record is greater than next_bno, we have found unused space. + */ + xfs_agblock_t next_bno; + + /* Number of free blocks in this AG. */ + xfs_agblock_t nr_blocks; + + /* Longest free extent we found in the AG. */ + xfs_agblock_t longest; +}; + +/* + * Stash a free space record for all the space since the last bno we found + * all the way up to @end. + */ +static int +xrep_abt_stash( + struct xrep_abt *ra, + xfs_agblock_t end) +{ + struct xfs_alloc_rec_incore arec = { + .ar_startblock = ra->next_bno, + .ar_blockcount = end - ra->next_bno, + }; + int error = 0; + + trace_xrep_abt_found(ra->sc->mp, ra->sc->sa.agno, arec.ar_startblock, + arec.ar_blockcount); + + if (xchk_should_terminate(ra->sc, &error)) + return error; + + error = xfbma_append(ra->free_records, &arec); + if (error) + return error; + ra->nr_blocks += arec.ar_blockcount; + return 0; +} + +/* Record extents that aren't in use from gaps in the rmap records. */ +STATIC int +xrep_abt_walk_rmap( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xrep_abt *ra = priv; + xfs_fsblock_t fsb; + int error; + + /* Record all the OWN_AG blocks... */ + if (rec->rm_owner == XFS_RMAP_OWN_AG) { + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, + rec->rm_startblock); + error = xbitmap_set(&ra->old_allocbt_blocks, fsb, + rec->rm_blockcount); + if (error) + return error; + } + + /* ...and all the rmapbt blocks... */ + error = xbitmap_set_btcur_path(&ra->not_allocbt_blocks, cur); + if (error) + return error; + + /* ...and all the free space. */ + if (rec->rm_startblock > ra->next_bno) { + error = xrep_abt_stash(ra, rec->rm_startblock); + if (error) + return error; + } + + /* + * rmap records can overlap on reflink filesystems, so project next_bno + * as far out into the AG space as we currently know about. + */ + ra->next_bno = max_t(xfs_agblock_t, ra->next_bno, + rec->rm_startblock + rec->rm_blockcount); + return 0; +} + +/* Collect an AGFL block for the not-to-release list. */ +static int +xrep_abt_walk_agfl( + struct xfs_mount *mp, + xfs_agblock_t bno, + void *priv) +{ + struct xrep_abt *ra = priv; + xfs_fsblock_t fsb; + + fsb = XFS_AGB_TO_FSB(mp, ra->sc->sa.agno, bno); + return xbitmap_set(&ra->not_allocbt_blocks, fsb, 1); +} + +/* + * Compare two free space extents by block number. We want to sort by block + * number. + */ +static int +xrep_bnobt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_startblock > bp->ar_startblock) + return 1; + else if (ap->ar_startblock < bp->ar_startblock) + return -1; + return 0; +} + +/* + * Compare two free space extents by length and then block number. We want + * to sort first in order of decreasing length and then in increasing block + * number. + */ +static int +xrep_cntbt_extent_cmp( + const void *a, + const void *b) +{ + const struct xfs_alloc_rec_incore *ap = a; + const struct xfs_alloc_rec_incore *bp = b; + + if (ap->ar_blockcount > bp->ar_blockcount) + return 1; + else if (ap->ar_blockcount < bp->ar_blockcount) + return -1; + return xrep_bnobt_extent_cmp(a, b); +} + +/* + * Iterate all reverse mappings to find (1) the gaps between rmap records (all + * unowned space), (2) the OWN_AG extents (which encompass the free space + * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL + * blocks. The free space is (1) + (2) - (3) - (4). + */ +STATIC int +xrep_abt_find_freespace( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_btree_cur *cur; + struct xfs_mount *mp = sc->mp; + xfs_agblock_t agend; + int error; + + xbitmap_init(&ra->not_allocbt_blocks); + + /* + * Iterate all the reverse mappings to find gaps in the physical + * mappings, all the OWN_AG blocks, and all the rmapbt extents. + */ + cur = xfs_rmapbt_init_cursor(mp, sc->tp, sc->sa.agf_bp, sc->sa.agno); + error = xfs_rmap_query_all(cur, xrep_abt_walk_rmap, ra); + xfs_btree_del_cursor(cur, error); + if (error) + goto err; + + /* Insert a record for space between the last rmap and EOAG. */ + agend = be32_to_cpu(XFS_BUF_TO_AGF(sc->sa.agf_bp)->agf_length); + if (ra->next_bno < agend) { + error = xrep_abt_stash(ra, agend); + if (error) + goto err; + } + + /* Collect all the AGFL blocks. */ + error = xfs_agfl_walk(mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), + sc->sa.agfl_bp, xrep_abt_walk_agfl, ra); + if (error) + goto err; + + /* Compute the old bnobt/cntbt blocks. */ + xbitmap_disunion(&ra->old_allocbt_blocks, &ra->not_allocbt_blocks); + + ra->nr_real_records = xfbma_length(ra->free_records); +err: + xbitmap_destroy(&ra->not_allocbt_blocks); + return error; +} + +/* + * We're going to use the observed free space records to reserve blocks for the + * new free space btrees, so we play an iterative game where we try to converge + * on the number of blocks we need: + * + * 1. Estimate how many blocks we'll need to store the records. + * 2. If the first free record has more blocks than we need, we're done. + * We will have to re-sort the records prior to building the cntbt. + * 3. If that record has exactly the number of blocks we need, null out the + * record. We're done. + * 4. Otherwise, we still need more blocks. Null out the record, subtract its + * length from the number of blocks we need, and go back to step 1. + * + * Fortunately, we don't have to do any transaction work to play this game, so + * we don't have to tear down the staging cursors. + */ +STATIC int +xrep_abt_reserve_space( + struct xrep_abt *ra, + struct xfs_btree_cur *bno_cur, + struct xfs_btree_cur *cnt_cur, + bool *need_resort) +{ + struct xfs_scrub *sc = ra->sc; + uint64_t record_nr = xfbma_length(ra->free_records) - 1; + unsigned int allocated = 0; + int error = 0; + + *need_resort = false; + do { + struct xfs_alloc_rec_incore arec; + uint64_t required; + unsigned int desired; + unsigned int found; + + /* Compute how many blocks we'll need. */ + error = xfs_btree_bload_compute_geometry(cnt_cur, + &ra->cnt_bload, ra->nr_real_records); + if (error) + break; + + error = xfs_btree_bload_compute_geometry(bno_cur, + &ra->bno_bload, ra->nr_real_records); + if (error) + break; + + /* How many btree blocks do we need to store all records? */ + required = ra->cnt_bload.nr_blocks + ra->bno_bload.nr_blocks; + ASSERT(required < INT_MAX); + + /* If we've reserved enough blocks, we're done. */ + if (allocated >= required) + break; + + desired = required - allocated; + + /* We need space but there's none left; bye! */ + if (ra->nr_real_records == 0) { + error = -ENOSPC; + break; + } + + /* Grab the first record from the list. */ + error = xfbma_get(ra->free_records, record_nr, &arec); + if (error) + break; + + ASSERT(arec.ar_blockcount <= UINT_MAX); + found = min_t(unsigned int, arec.ar_blockcount, desired); + + error = xrep_newbt_add_blocks(&ra->new_bnobt_info, + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, + arec.ar_startblock), + found, NULL); + if (error) + break; + allocated += found; + ra->nr_blocks -= found; + + if (arec.ar_blockcount > desired) { + /* + * Record has more space than we need. The number of + * free records doesn't change, so shrink the free + * record and exit the loop. + */ + arec.ar_startblock += desired; + arec.ar_blockcount -= desired; + error = xfbma_set(ra->free_records, record_nr, &arec); + if (error) + break; + *need_resort = true; + break; + } + + /* + * We're going to use up the entire record, so nullify it and + * move on to the next one. This changes the number of free + * records, so we must go around the loop once more to re-run + * _bload_init. + */ + error = xfbma_nullify(ra->free_records, record_nr); + if (error) + break; + ra->nr_real_records--; + record_nr--; + } while (1); + + return error; +} + +/* + * Deal with all the space we reserved. Blocks that were allocated for the + * free space btrees need to have a (deferred) rmap added for the OWN_AG + * allocation, and blocks that didn't get used can be freed via the usual + * (deferred) means. + */ +STATIC void +xrep_abt_dispose_reservations( + struct xrep_abt *ra, + int error) +{ + struct xrep_newbt_resv *resv, *n; + struct xfs_scrub *sc = ra->sc; + + if (error) + goto junkit; + + for_each_xrep_newbt_reservation(&ra->new_bnobt_info, resv, n) { + /* Add a deferred rmap for each extent we used. */ + if (resv->used > 0) + xfs_rmap_alloc_extent(sc->tp, + XFS_FSB_TO_AGNO(sc->mp, resv->fsbno), + XFS_FSB_TO_AGBNO(sc->mp, resv->fsbno), + resv->used, XFS_RMAP_OWN_AG); + + /* + * Add a deferred free for each block we didn't use and now + * have to add to the free space since the new btrees are + * online. + */ + if (resv->used < resv->len) + __xfs_bmap_add_free(sc->tp, resv->fsbno + resv->used, + resv->len - resv->used, NULL, true); + } + +junkit: + for_each_xrep_newbt_reservation(&ra->new_bnobt_info, resv, n) { + list_del(&resv->list); + kmem_free(resv); + } + + xrep_newbt_destroy(&ra->new_bnobt_info, error); + xrep_newbt_destroy(&ra->new_cntbt_info, error); +} + +/* Retrieve free space data for bulk load. */ +STATIC int +xrep_abt_get_data( + struct xfs_btree_cur *cur, + void *priv) +{ + struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a; + struct xrep_abt *ra = priv; + int error; + + error = xfbma_get_data(ra->free_records, &ra->iter, arec); + if (error) + return error; + + ra->longest = max(ra->longest, arec->ar_blockcount); + return 0; +} + +/* Feed one of the new btree blocks to the bulk loader. */ +STATIC int +xrep_abt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *ptr, + void *priv) +{ + struct xrep_abt *ra = priv; + + return xrep_newbt_claim_block(cur, &ra->new_bnobt_info, ptr); +} + +/* + * Reset the AGF counters to reflect the free space btrees that we just + * rebuilt, then reinitialize the per-AG data. + */ +STATIC int +xrep_abt_reset_counters( + struct xrep_abt *ra, + unsigned int freesp_btreeblks) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_perag *pag = sc->sa.pag; + struct xfs_agf *agf; + struct xfs_buf *bp; + + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); + + /* + * Mark the pagf information stale and use the accessor function to + * forcibly reload it from the values we just logged. We still own the + * AGF buffer so we can safely ignore bp. + */ + ASSERT(pag->pagf_init); + pag->pagf_init = 0; + + agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks + + (be32_to_cpu(agf->agf_rmap_blocks) - 1)); + agf->agf_freeblks = cpu_to_be32(ra->nr_blocks); + agf->agf_longest = cpu_to_be32(ra->longest); + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS | + XFS_AGF_LONGEST | + XFS_AGF_FREEBLKS); + + return xfs_alloc_read_agf(sc->mp, sc->tp, sc->sa.agno, 0, &bp); +} + +static void +xrep_abt_init_bload( + struct xrep_abt *ra, + struct xfs_btree_bload *bload) +{ + bload->get_data = xrep_abt_get_data; + bload->alloc_block = xrep_abt_alloc_block; + + xrep_bload_estimate_slack(ra->sc, bload); +} + +/* + * Use the collected free space information to stage new free space btrees. + * If this is successful we'll return with the new btree root + * information logged to the repair transaction but not yet committed. + */ +STATIC int +xrep_abt_build_new_trees( + struct xrep_abt *ra) +{ + struct xfs_scrub *sc = ra->sc; + struct xfs_btree_cur *bno_cur; + struct xfs_btree_cur *cnt_cur; + bool need_resort; + int error; + + xrep_abt_init_bload(ra, &ra->bno_bload); + xrep_abt_init_bload(ra, &ra->cnt_bload); + + /* + * Sort the free extents by length so that we can set up the free space + * btrees in as few extents as possible. This reduces the amount of + * deferred rmap / free work we have to do at the end. + */ + error = xfbma_sort(ra->free_records, xrep_cntbt_extent_cmp); + if (error) + return error; + + /* + * Prepare to construct the new btree by reserving disk space for the + * new btree and setting up all the accounting information we'll need + * to root the new btree while it's under construction and before we + * attach it to the AG header. + */ + xrep_newbt_init_bare(&ra->new_bnobt_info, sc); + xrep_newbt_init_bare(&ra->new_cntbt_info, sc); + + /* Allocate cursors for the staged btrees. */ + bno_cur = xfs_allocbt_stage_cursor(sc->mp, sc->tp, + &ra->new_bnobt_info.afake, sc->sa.agno, XFS_BTNUM_BNO); + cnt_cur = xfs_allocbt_stage_cursor(sc->mp, sc->tp, + &ra->new_cntbt_info.afake, sc->sa.agno, XFS_BTNUM_CNT); + + /* Reserve the space we'll need for the new btrees. */ + error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &need_resort); + if (error) + goto out_cur; + + /* + * If we need to re-sort the free extents by length, do so so that we + * can put the records into the cntbt in the correct order. + */ + if (need_resort) { + error = xfbma_sort(ra->free_records, xrep_cntbt_extent_cmp); + if (error) + goto out_cur; + } + + /* Load the free space by length tree. */ + ra->iter = 0; + ra->longest = 0; + error = xfs_btree_bload(cnt_cur, &ra->cnt_bload, ra); + if (error) + goto out_cur; + + /* Re-sort the free extents by block number so so that we can put the + * records into the bnobt in the correct order. + */ + error = xfbma_sort(ra->free_records, xrep_bnobt_extent_cmp); + if (error) + goto out_cur; + + /* Load the free space by block number tree. */ + ra->iter = 0; + error = xfs_btree_bload(bno_cur, &ra->bno_bload, ra); + if (error) + goto out_cur; + + /* + * Install the new btrees in the AG header. After this point the old + * btree is no longer accessible and the new tree is live. + * + * Note: We re-read the AGF here to ensure the buffer type is set + * properly. Since we built a new tree without attaching to the AGF + * buffer, the buffer item may have fallen off the buffer. This ought + * to succeed since the AGF is held across transaction rolls. + */ + error = xfs_read_agf(sc->mp, sc->tp, sc->sa.agno, 0, &sc->sa.agf_bp); + if (error) + goto out_cur; + + /* Commit our new btrees. */ + xfs_allocbt_commit_staged_btree(bno_cur, sc->sa.agf_bp); + xfs_btree_del_cursor(bno_cur, 0); + xfs_allocbt_commit_staged_btree(cnt_cur, sc->sa.agf_bp); + xfs_btree_del_cursor(cnt_cur, 0); + + /* Reset the AGF counters now that we've changed the btree shape. */ + error = xrep_abt_reset_counters(ra, (ra->bno_bload.nr_blocks - 1) + + (ra->cnt_bload.nr_blocks - 1)); + if (error) + goto out_newbt; + + /* Dispose of any unused blocks and the accounting information. */ + xrep_abt_dispose_reservations(ra, error); + + return xrep_roll_ag_trans(sc); + +out_cur: + xfs_btree_del_cursor(cnt_cur, error); + xfs_btree_del_cursor(bno_cur, error); +out_newbt: + xrep_abt_dispose_reservations(ra, error); + return error; +} + +/* + * Now that we've logged the roots of the new btrees, invalidate all of the + * old blocks and free them. + */ +STATIC int +xrep_abt_remove_old_trees( + struct xrep_abt *ra) +{ + /* Free the old inode btree blocks if they're not in use. */ + return xrep_reap_extents(ra->sc, &ra->old_allocbt_blocks, + &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE); +} + +/* Repair the freespace btrees for some AG. */ +int +xrep_allocbt( + struct xfs_scrub *sc) +{ + struct xrep_abt *ra; + struct xfs_mount *mp = sc->mp; + int error; + + /* We require the rmapbt to rebuild anything. */ + if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + return -EOPNOTSUPP; + + ra = kmem_zalloc(sizeof(struct xrep_abt), KM_NOFS | KM_MAYFAIL); + if (!ra) + return -ENOMEM; + ra->sc = sc; + + /* We rebuild both data structures. */ + sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT; + + xchk_perag_get(sc->mp, &sc->sa); + + /* + * Make sure the busy extent list is clear because we can't put + * extents on there twice. + */ + if (!xfs_extent_busy_list_empty(sc->sa.pag)) + return -EDEADLOCK; + + /* Set up some storage */ + ra->free_records = xfbma_init(sizeof(struct xfs_alloc_rec_incore)); + if (IS_ERR(ra->free_records)) { + error = PTR_ERR(ra->free_records); + goto out_ra; + } + + /* Collect the free space data and find the old btree blocks. */ + xbitmap_init(&ra->old_allocbt_blocks); + error = xrep_abt_find_freespace(ra); + if (error) + goto out_bitmap; + + /* Rebuild the free space information. */ + error = xrep_abt_build_new_trees(ra); + if (error) + goto out_bitmap; + + /* Kill the old trees. */ + error = xrep_abt_remove_old_trees(ra); + +out_bitmap: + xbitmap_destroy(&ra->old_allocbt_blocks); + xfbma_destroy(ra->free_records); +out_ra: + kmem_free(ra); + return error; +} + +/* Make sure both btrees are ok after we've rebuilt them. */ +int +xrep_revalidate_allocbt( + struct xfs_scrub *sc) +{ + __u32 old_type = sc->sm->sm_type; + int error; + + /* + * We must update sm_type temporarily so that the tree-to-tree cross + * reference checks will work in the correct direction, and also so + * that tracing will report correctly if there are more errors. + */ + sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT; + error = xchk_bnobt(sc); + if (error) + goto out; + + sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT; + error = xchk_cntbt(sc); +out: + sc->sm->sm_type = old_type; + return error; +} diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 18876056e5e0..4a49a9099477 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -634,8 +634,14 @@ xchk_setup_ag_btree( * expensive operation should be performed infrequently and only * as a last resort. Any caller that sets force_log should * document why they need to do so. + * + * Force everything in memory out to disk if we're repairing. + * This ensures we won't get tripped up by btree blocks sitting + * in memory waiting to have LSNs stamped in. The AGF/AGI repair + * routines use any available rmap data to try to find a btree + * root that also passes the read verifiers. */ - if (force_log) { + if (force_log || (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) { error = xchk_checkpoint_log(mp); if (error) return error; diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 6ca5dc8dfb2d..725e6d69f003 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -55,6 +55,10 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp, void xrep_force_quotacheck(struct xfs_scrub *sc, uint dqtype); int xrep_ino_dqattach(struct xfs_scrub *sc); +/* Metadata revalidators */ + +int xrep_revalidate_allocbt(struct xfs_scrub *sc); + /* Metadata repairers */ int xrep_probe(struct xfs_scrub *sc); @@ -62,6 +66,7 @@ int xrep_superblock(struct xfs_scrub *sc); int xrep_agf(struct xfs_scrub *sc); int xrep_agfl(struct xfs_scrub *sc); int xrep_agi(struct xfs_scrub *sc); +int xrep_allocbt(struct xfs_scrub *sc); struct xrep_newbt_resv { /* Link to list of extents that we've reserved. */ @@ -101,6 +106,9 @@ struct xrep_newbt { enum xfs_ag_resv_type resv; }; +#define for_each_xrep_newbt_reservation(xnr, resv, n) \ + list_for_each_entry_safe((resv), (n), &(xnr)->resv_list, list) + void xrep_newbt_init_bare(struct xrep_newbt *xba, struct xfs_scrub *sc); void xrep_newbt_init_ag(struct xrep_newbt *xba, struct xfs_scrub *sc, const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint, @@ -135,11 +143,14 @@ xrep_calc_ag_resblks( return 0; } +#define xrep_revalidate_allocbt (NULL) + #define xrep_probe xrep_notsupported #define xrep_superblock xrep_notsupported #define xrep_agf xrep_notsupported #define xrep_agfl xrep_notsupported #define xrep_agi xrep_notsupported +#define xrep_allocbt xrep_notsupported #endif /* CONFIG_XFS_ONLINE_REPAIR */ diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index f1775bb19313..2acbc4ba9e97 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -218,13 +218,15 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, .scrub = xchk_bnobt, - .repair = xrep_notsupported, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xchk_setup_ag_allocbt, .scrub = xchk_cntbt, - .repair = xrep_notsupported, + .repair = xrep_allocbt, + .repair_eval = xrep_revalidate_allocbt, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, @@ -496,7 +498,10 @@ xfs_scrub_metadata( goto out_teardown; /* Scrub for errors. */ - error = sc.ops->scrub(&sc); + if ((sc.flags & XREP_ALREADY_FIXED) && sc.ops->repair_eval != NULL) + error = sc.ops->repair_eval(&sc); + else + error = sc.ops->scrub(&sc); if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { /* * Scrubbers return -EDEADLOCK to mean 'try harder'. diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index ad1ceb44a628..94a30637a127 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -27,6 +27,14 @@ struct xchk_meta_ops { /* Repair or optimize the metadata. */ int (*repair)(struct xfs_scrub *); + /* + * Re-scrub the metadata we repaired, in case there's extra work that + * we need to do to check our repair work. If this is NULL, we'll use + * the ->scrub function pointer, assuming that the regular scrub is + * sufficient. + */ + int (*repair_eval)(struct xfs_scrub *sc); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_sb *); diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 7ba12e68ea01..9734aca1a0fd 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -722,11 +722,33 @@ DEFINE_EVENT(xrep_rmap_class, name, \ xfs_agblock_t agbno, xfs_extlen_t len, \ uint64_t owner, uint64_t offset, unsigned int flags), \ TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) -DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn); DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn); +TRACE_EVENT(xrep_abt_found, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t startblock, xfs_extlen_t blockcount), + TP_ARGS(mp, agno, startblock, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = startblock; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount) +) + TRACE_EVENT(xrep_refcount_extent_fn, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, struct xfs_refcount_irec *irec), diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 3991e59cfd18..89cac4f2032d 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -657,3 +657,16 @@ xfs_extent_busy_ag_cmp( diff = b1->bno - b2->bno; return diff; } + +/* Are there any busy extents in this AG? */ +bool +xfs_extent_busy_list_empty( + struct xfs_perag *pag) +{ + bool res; + + spin_lock(&pag->pagb_lock); + res = RB_EMPTY_ROOT(&pag->pagb_tree); + spin_unlock(&pag->pagb_lock); + return res; +} diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 990ab3891971..2f8c73c712c6 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) list_sort(NULL, list, xfs_extent_busy_ag_cmp); } +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); + #endif /* __XFS_EXTENT_BUSY_H__ */