On Sun, Jul 29, 2018 at 10:48:21PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > > Rebuild the free space btrees from the gaps in the rmap btree. > > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> > --- > fs/xfs/Makefile | 1 > fs/xfs/scrub/alloc.c | 1 > fs/xfs/scrub/alloc_repair.c | 581 +++++++++++++++++++++++++++++++++++++++++++ > fs/xfs/scrub/common.c | 8 + > fs/xfs/scrub/repair.h | 2 > fs/xfs/scrub/scrub.c | 4 > fs/xfs/scrub/trace.h | 2 > fs/xfs/xfs_extent_busy.c | 14 + > fs/xfs/xfs_extent_busy.h | 2 > 9 files changed, 610 insertions(+), 5 deletions(-) > create mode 100644 fs/xfs/scrub/alloc_repair.c > > ... > diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c > new file mode 100644 > index 000000000000..b228c2906de2 > --- /dev/null > +++ b/fs/xfs/scrub/alloc_repair.c > @@ -0,0 +1,581 @@ ... > +/* Record extents that aren't in use from gaps in the rmap records. */ > +STATIC int > +xrep_abt_walk_rmap( > + struct xfs_btree_cur *cur, > + struct xfs_rmap_irec *rec, > + void *priv) > +{ > + struct xrep_abt *ra = priv; > + struct xrep_abt_extent *rae; > + xfs_fsblock_t fsb; > + int error; > + > + /* Record all the OWN_AG blocks... */ > + if (rec->rm_owner == XFS_RMAP_OWN_AG) { > + fsb = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno, > + rec->rm_startblock); > + error = xfs_bitmap_set(ra->btlist, fsb, rec->rm_blockcount); > + if (error) > + return error; > + } > + > + /* ...and all the rmapbt blocks... */ > + error = xfs_bitmap_set_btcur_path(&ra->nobtlist, cur); > + if (error) > + return error; > + > + /* ...and all the free space. 
*/ > + if (rec->rm_startblock > ra->next_bno) { > + trace_xrep_abt_walk_rmap(cur->bc_mp, cur->bc_private.a.agno, > + ra->next_bno, rec->rm_startblock - ra->next_bno, > + XFS_RMAP_OWN_NULL, 0, 0); > + > + rae = kmem_alloc(sizeof(struct xrep_abt_extent), KM_MAYFAIL); > + if (!rae) > + return -ENOMEM; > + INIT_LIST_HEAD(&rae->list); > + rae->bno = ra->next_bno; > + rae->len = rec->rm_startblock - ra->next_bno; > + list_add_tail(&rae->list, ra->extlist); Any reason we don't use a bitmap for this one? > + ra->nr_records++; > + ra->nr_blocks += rae->len; > + } > + ra->next_bno = max_t(xfs_agblock_t, ra->next_bno, > + rec->rm_startblock + rec->rm_blockcount); The max_t() is to cover the record overlap case, right? If so, a one-line comment saying so would be good. > + return 0; > +} > + ... > +/* Free an extent, which creates a record in the bnobt/cntbt. */ > +STATIC int > +xrep_abt_free_extent( > + struct xfs_scrub *sc, > + xfs_fsblock_t fsbno, > + xfs_extlen_t len, > + struct xfs_owner_info *oinfo) > +{ > + int error; > + > + error = xfs_free_extent(sc->tp, fsbno, len, oinfo, 0); > + if (error) > + return error; > + error = xrep_roll_ag_trans(sc); > + if (error) > + return error; > + return xfs_mod_fdblocks(sc->mp, -(int64_t)len, false); What's this call for? Is it because the blocks we're freeing were already free? (Similar question on the other xfs_mod_fdblocks() call further down.) BTW, what prevents some other task from coming along and screwing with this? For example, could a large falloc or buffered write come in and allocate these global blocks before we take them away here (causing the whole sequence to fail)? > +} > + ... > +/* > + * Allocate a block from the (cached) first extent in the AG. In theory > + * this should never fail, since we already checked that there was enough > + * space to handle the new btrees.
> + */ > +STATIC xfs_fsblock_t > +xrep_abt_alloc_block( > + struct xfs_scrub *sc, > + struct list_head *free_extents) > +{ > + struct xrep_abt_extent *ext; > + > + /* Pull the first free space extent off the list, and... */ > + ext = list_first_entry(free_extents, struct xrep_abt_extent, list); > + > + /* ...take its first block. */ > + ext->bno++; > + ext->len--; > + if (ext->len == 0) { > + list_del(&ext->list); > + kmem_free(ext); > + } > + > + return XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, ext->bno - 1); Looks like a potential use after free of ext. > +} > + ... > +/* > + * Reset the global free block counter and the per-AG counters to make it look > + * like this AG has no free space. > + */ > +STATIC int > +xrep_abt_reset_counters( > + struct xfs_scrub *sc, > + int *log_flags) > +{ > + struct xfs_perag *pag = sc->sa.pag; > + struct xfs_agf *agf; > + xfs_agblock_t new_btblks; > + xfs_agblock_t to_free; > + int error; > + > + /* > + * Since we're abandoning the old bnobt/cntbt, we have to decrease > + * fdblocks by the # of blocks in those trees. btreeblks counts the > + * non-root blocks of the free space and rmap btrees. Do this before > + * resetting the AGF counters. > + */ Hmm, I'm not quite following the comment wrt to the xfs_mod_fdblocks() below. to_free looks like it's the count of all current btree blocks minus rmap blocks (i.e., old bno/cnt btree blocks). Are we "allocating" those blocks here because we're going to free them later? > + agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); > + > + /* rmap_blocks accounts root block, btreeblks doesn't */ > + new_btblks = be32_to_cpu(agf->agf_rmap_blocks) - 1; > + > + /* btreeblks doesn't account bno/cnt root blocks */ > + to_free = pag->pagf_btreeblks + 2; > + > + /* and don't account for the blocks we aren't freeing */ > + to_free -= new_btblks; > + > + error = xfs_mod_fdblocks(sc->mp, -(int64_t)to_free, false); > + if (error) > + return error; > + > + /* > + * Reset the per-AG info, both incore and ondisk. 
Mark the incore > + * state stale in case we fail out of here. > + */ > + ASSERT(pag->pagf_init); > + pag->pagf_init = 0; > + pag->pagf_btreeblks = new_btblks; > + pag->pagf_freeblks = 0; > + pag->pagf_longest = 0; > + > + agf->agf_btreeblks = cpu_to_be32(new_btblks); > + agf->agf_freeblks = 0; > + agf->agf_longest = 0; > + *log_flags |= XFS_AGF_BTREEBLKS | XFS_AGF_LONGEST | XFS_AGF_FREEBLKS; > + > + return 0; > +} > + > +/* Initialize a new free space btree root and implant into AGF. */ > +STATIC int > +xrep_abt_reset_btree( > + struct xfs_scrub *sc, > + xfs_btnum_t btnum, > + struct list_head *free_extents) > +{ > + struct xfs_owner_info oinfo; > + struct xfs_buf *bp; > + struct xfs_perag *pag = sc->sa.pag; > + struct xfs_mount *mp = sc->mp; > + struct xfs_agf *agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); > + xfs_fsblock_t fsbno; > + int error; > + > + /* Allocate new root block. */ > + fsbno = xrep_abt_alloc_block(sc, free_extents); xrep_abt_alloc_block() converts an agbno to an fsb before returning it. This function passes the fsb to the init call just below and then converts it back to an agbno in two places. It seems like there would be fewer conversions to follow if the above just returned an agbno and we converted it to an fsb once for xrep_init_btblock(). > + if (fsbno == NULLFSBLOCK) > + return -ENOSPC; > + > + /* Initialize new tree root. */ > + error = xrep_init_btblock(sc, fsbno, &bp, btnum, &xfs_allocbt_buf_ops); > + if (error) > + return error; > + > + /* Implant into AGF. */ > + agf->agf_roots[btnum] = cpu_to_be32(XFS_FSB_TO_AGBNO(mp, fsbno)); > + agf->agf_levels[btnum] = cpu_to_be32(1); > + > + /* Add rmap records for the btree roots */ > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, > + XFS_FSB_TO_AGBNO(mp, fsbno), 1, &oinfo); > + if (error) > + return error; > + > + /* Reset the incore state. */ > + pag->pagf_levels[btnum] = 1; > + > + return 0; > +} > + ...
> + > +/* > + * Make our new freespace btree roots permanent so that we can start freeing > + * unused space back into the AG. > + */ > +STATIC int > +xrep_abt_commit_new( > + struct xfs_scrub *sc, > + struct xfs_bitmap *old_allocbt_blocks, > + int log_flags) > +{ > + int error; > + > + xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, log_flags); > + > + /* Invalidate the old freespace btree blocks and commit. */ > + error = xrep_invalidate_blocks(sc, old_allocbt_blocks); > + if (error) > + return error; It looks like the above invalidation all happens in the same transaction. Those aren't logging buffer data or anything, but any idea how many log formats we can get away with in this single transaction? > + error = xrep_roll_ag_trans(sc); > + if (error) > + return error; > + > + /* Now that we've succeeded, mark the incore state valid again. */ > + sc->sa.pag->pagf_init = 1; > + return 0; > +} > + > +/* Build new free space btrees and dispose of the old one. */ > +STATIC int > +xrep_abt_rebuild_trees( > + struct xfs_scrub *sc, > + struct list_head *free_extents, > + struct xfs_bitmap *old_allocbt_blocks) > +{ > + struct xfs_owner_info oinfo; > + struct xrep_abt_extent *rae; > + struct xrep_abt_extent *n; > + struct xrep_abt_extent *longest; > + int error; > + > + xfs_rmap_skip_owner_update(&oinfo); > + > + /* > + * Insert the longest free extent in case it's necessary to > + * refresh the AGFL with multiple blocks. If there is no longest > + * extent, we had exactly the free space we needed; we're done. > + */ I'm confused by the last sentence. longest should only be NULL if the free space list is empty and haven't we already bailed out with -ENOSPC if that's the case? 
> + longest = xrep_abt_get_longest(free_extents); > + if (!longest) > + goto done; > + error = xrep_abt_free_extent(sc, > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, longest->bno), > + longest->len, &oinfo); > + list_del(&longest->list); > + kmem_free(longest); > + if (error) > + return error; > + > + /* Insert records into the new btrees. */ > + list_for_each_entry_safe(rae, n, free_extents, list) { > + error = xrep_abt_free_extent(sc, > + XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, rae->bno), > + rae->len, &oinfo); > + if (error) > + return error; > + list_del(&rae->list); > + kmem_free(rae); > + } Ok, at this point we've reset the btree roots and we start freeing the free ranges that were discovered via the rmapbt analysis. AFAICT, if we fail or crash at this point, we leave the allocbts in a partially constructed state. I take it that is Ok with respect to the broader repair algorithm because we'd essentially start over by inspecting the rmapbt again on a retry. The blocks allocated for the btrees that we've begun to construct here end up mapped in the rmapbt as we go, right? IIUC, that means we don't necessarily have infinite retries to make sure this completes. IOW, suppose that a first repair attempt finds just enough free space to construct new trees, gets far enough along to consume most of that free space and then crashes. Is it possible that a subsequent repair attempt includes the btree blocks allocated during the previous failed repair attempt in the sum of "old btree blocks" and determines we don't have enough free space to repair? > + > +done: > + /* Free all the OWN_AG blocks that are not in the rmapbt/agfl. */ > + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); > + return xrep_reap_extents(sc, old_allocbt_blocks, &oinfo, > + XFS_AG_RESV_NONE); > +} > + ... 
> diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > index 0ed68379e551..82f99633a597 100644 > --- a/fs/xfs/xfs_extent_busy.c > +++ b/fs/xfs/xfs_extent_busy.c > @@ -657,3 +657,17 @@ xfs_extent_busy_ag_cmp( > diff = b1->bno - b2->bno; > return diff; > } > + > +/* Are there any busy extents in this AG? */ > +bool > +xfs_extent_busy_list_empty( > + struct xfs_perag *pag) > +{ > + spin_lock(&pag->pagb_lock); > + if (pag->pagb_tree.rb_node) { RB_EMPTY_ROOT()? Brian > + spin_unlock(&pag->pagb_lock); > + return false; > + } > + spin_unlock(&pag->pagb_lock); > + return true; > +} > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > index 990ab3891971..2f8c73c712c6 100644 > --- a/fs/xfs/xfs_extent_busy.h > +++ b/fs/xfs/xfs_extent_busy.h > @@ -65,4 +65,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list) > list_sort(NULL, list, xfs_extent_busy_ag_cmp); > } > > +bool xfs_extent_busy_list_empty(struct xfs_perag *pag); > + > #endif /* __XFS_EXTENT_BUSY_H__ */ > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html