Take all the reverse-mapping data we've acquired and use it to generate reference count data. This data is used in phase 5 to rebuild the refcount btree. v2: Update to reflect separation of rmap_irec flags. Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --- repair/phase4.c | 27 ++++++ repair/rmap.c | 232 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ repair/rmap.h | 2 3 files changed, 259 insertions(+), 2 deletions(-) diff --git a/repair/phase4.c b/repair/phase4.c index 9da1bb1..86992c9 100644 --- a/repair/phase4.c +++ b/repair/phase4.c @@ -193,6 +193,21 @@ _("%s while checking reverse-mappings"), } static void +compute_ag_refcounts( + work_queue_t *wq, + xfs_agnumber_t agno, + void *arg) +{ + int error; + + error = compute_refcounts(wq->mp, agno); + if (error) + do_error( +_("%s while computing reference count records.\n"), + strerror(-error)); +} + +static void process_rmap_data( struct xfs_mount *mp) { @@ -206,6 +221,14 @@ process_rmap_data( for (i = 0; i < mp->m_sb.sb_agcount; i++) queue_work(&wq, check_rmap_btrees, i, NULL); destroy_work_queue(&wq); + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return; + + create_work_queue(&wq, mp, libxfs_nproc()); + for (i = 0; i < mp->m_sb.sb_agcount; i++) + queue_work(&wq, compute_ag_refcounts, i, NULL); + destroy_work_queue(&wq); } void @@ -359,7 +382,9 @@ phase4(xfs_mount_t *mp) /* * Process all the reverse-mapping data that we collected. This - * involves checking the rmap data against the btree. + * involves checking the rmap data against the btree, computing + * reference counts based on the rmap data, and checking the counts + * against the refcount btree. */ process_rmap_data(mp); diff --git a/repair/rmap.c b/repair/rmap.c index 0baf4eb..0753448 100644 --- a/repair/rmap.c +++ b/repair/rmap.c @@ -42,6 +42,7 @@ struct xfs_ag_rmap { int ar_flcount; /* agfl entries from leftover */ /* agbt allocations */ struct xfs_rmap_irec ar_last_rmap; /* last rmap seen */ + struct xfs_slab *ar_refcount_items; /* refcount items, p4-5 */ }; static struct xfs_ag_rmap *ag_rmaps; @@ -88,7 +89,8 @@ bool rmap_needs_work( struct xfs_mount *mp) { - return xfs_sb_version_hasrmapbt(&mp->m_sb); + return xfs_sb_version_hasreflink(&mp->m_sb) || + xfs_sb_version_hasrmapbt(&mp->m_sb); } /* @@ -120,6 +122,11 @@ _("Insufficient memory while allocating reverse mapping slabs.")); do_error( _("Insufficient memory while allocating raw metadata reverse mapping slabs.")); ag_rmaps[i].ar_last_rmap.rm_owner = XFS_RMAP_OWN_UNKNOWN; + error = init_slab(&ag_rmaps[i].ar_refcount_items, + sizeof(struct xfs_refcount_irec)); + if (error) + do_error( +_("Insufficient memory while allocating refcount item slabs.")); } } @@ -138,6 +145,7 @@ rmaps_free( for (i = 0; i < mp->m_sb.sb_agcount; i++) { free_slab(&ag_rmaps[i].ar_rmaps); free_slab(&ag_rmaps[i].ar_raw_rmaps); + free_slab(&ag_rmaps[i].ar_refcount_items); } free(ag_rmaps); ag_rmaps = NULL; @@ -591,6 +599,228 @@ rmap_dump( #endif /* + * Rebuilding the Reference Count & Reverse Mapping Btrees + * + * The reference count (refcnt) and reverse mapping (rmap) btrees are rebuilt + * during phase 5, like all other AG btrees. Therefore, reverse mappings must + * be processed into reference counts at the end of phase 4, and the rmaps must + * be recorded during phase 4. There is a need to access the rmaps in physical + * block order, but no particular need for random access, so the slab.c code + * provides a big logical array (consisting of smaller slabs) and some inorder + * iterator functions. + * + * Once we've recorded all the reverse mappings, we're ready to translate the + * rmaps into refcount entries. Imagine the rmap entries as rectangles + * representing extents of physical blocks, and that the rectangles can be laid + * down to allow them to overlap each other; then we know that we must emit + * a refcnt btree entry wherever the amount of overlap changes, i.e. the + * emission stimulus is level-triggered: + * + * - --- + * -- ----- ---- --- ------ + * -- ---- ----------- ---- --------- + * -------------------------------- ----------- + * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^ + * 2 1 23 21 3 43 234 2123 1 01 2 3 0 + * + * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner). + * + * Note that in the actual refcnt btree we don't store the refcount < 2 cases + * because the bnobt tells us which blocks are free; single-use blocks aren't + * recorded in the bnobt or the refcntbt. If the rmapbt supports storing + * multiple entries covering a given block we could theoretically dispense with + * the refcntbt and simply count rmaps, but that's inefficient in the (hot) + * write path, so we'll take the cost of the extra tree to save time. Also + * there's no guarantee that rmap will be enabled. + * + * Given an array of rmaps sorted by physical block number, a starting physical + * block (sp), a bag to hold rmaps that cover sp, and the next physical + * block where the level changes (np), we can reconstruct the refcount + * btree as follows: + * + * While there are still unprocessed rmaps in the array, + * - Set sp to the physical block (pblk) of the next unprocessed rmap. + * - Add to the bag all rmaps in the array where startblock == sp. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * - Record the bag size as old_bag_size. + * + * - While the bag isn't empty, + * - Remove from the bag all rmaps where startblock + len == np. + * - Add to the bag all rmaps in the array where startblock == np. + * - If the bag size isn't old_bag_size, store the refcount entry + * (sp, np - sp, bag_size) in the refcnt btree. + * - If the bag is empty, break out of the inner loop. + * - Set old_bag_size to the bag size + * - Set sp = np. + * - Set np to the physical block where the bag size will change. + * This is the minimum of (the pblk of the next unprocessed rmap) and + * (startblock + len of each rmap in the bag). + * + * An implementation detail is that because this processing happens during + * phase 4, the refcount entries are stored in an array so that phase 5 can + * load them into the refcount btree. The rmaps can be loaded directly into + * the rmap btree during phase 5 as well. + */ + +/* + * Emit a refcount object for refcntbt reconstruction during phase 5. + */ +#define REFCOUNT_CLAMP(nr) ((nr) > MAXREFCOUNT ? MAXREFCOUNT : (nr)) +static void +refcount_emit( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + xfs_extlen_t len, + size_t nr_rmaps) +{ + struct xfs_refcount_irec rlrec; + int error; + struct xfs_slab *rlslab; + + rlslab = ag_rmaps[agno].ar_refcount_items; + ASSERT(nr_rmaps > 0); + + dbg_printf("REFL: agno=%u pblk=%u, len=%u -> refcount=%zu\n", + agno, agbno, len, nr_rmaps); + rlrec.rc_startblock = agbno; + rlrec.rc_blockcount = len; + rlrec.rc_refcount = REFCOUNT_CLAMP(nr_rmaps); + error = slab_add(rlslab, &rlrec); + if (error) + do_error( +_("Insufficient memory while recreating refcount tree.")); +} +#undef REFCOUNT_CLAMP + +/* + * Transform a pile of physical block mapping observations into refcount data + * for eventual rebuilding of the btrees. + */ +#define RMAP_END(r) ((r)->rm_startblock + (r)->rm_blockcount) +int +compute_refcounts( + struct xfs_mount *mp, + xfs_agnumber_t agno) +{ + struct xfs_bag *stack_top = NULL; + struct xfs_slab *rmaps; + struct xfs_slab_cursor *rmaps_cur; + struct xfs_rmap_irec *array_cur; + struct xfs_rmap_irec *rmap; + xfs_agblock_t sbno; /* first bno of this rmap set */ + xfs_agblock_t cbno; /* first bno of this refcount set */ + xfs_agblock_t nbno; /* next bno where rmap set changes */ + size_t n, idx; + size_t old_stack_nr; + int error; + + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + + rmaps = ag_rmaps[agno].ar_rmaps; + + error = init_slab_cursor(rmaps, rmap_compare, &rmaps_cur); + if (error) + return error; + + error = init_bag(&stack_top); + if (error) + goto err; + + /* While there are rmaps to be processed... */ + n = 0; + while (n < slab_count(rmaps)) { + array_cur = peek_slab_cursor(rmaps_cur); + sbno = cbno = array_cur->rm_startblock; + /* Push all rmaps with pblk == sbno onto the stack */ + for (; + array_cur && array_cur->rm_startblock == sbno; + array_cur = peek_slab_cursor(rmaps_cur)) { + advance_slab_cursor(rmaps_cur); n++; + rmap_dump("push0", agno, array_cur); + error = bag_add(stack_top, array_cur); + if (error) + goto err; + } + + /* Set nbno to the bno of the next refcount change */ + if (n < slab_count(rmaps)) + nbno = array_cur->rm_startblock; + else + nbno = NULLAGBLOCK; + foreach_bag_ptr(stack_top, idx, rmap) { + nbno = min(nbno, RMAP_END(rmap)); + } + + /* Emit reverse mappings, if needed */ + ASSERT(nbno > sbno); + old_stack_nr = bag_count(stack_top); + + /* While stack isn't empty... */ + while (bag_count(stack_top)) { + /* Pop all rmaps that end at nbno */ + foreach_bag_ptr_reverse(stack_top, idx, rmap) { + if (RMAP_END(rmap) != nbno) + continue; + rmap_dump("pop", agno, rmap); + error = bag_remove(stack_top, idx); + if (error) + goto err; + } + + /* Push array items that start at nbno */ + for (; + array_cur && array_cur->rm_startblock == nbno; + array_cur = peek_slab_cursor(rmaps_cur)) { + advance_slab_cursor(rmaps_cur); n++; + rmap_dump("push1", agno, array_cur); + error = bag_add(stack_top, array_cur); + if (error) + goto err; + } + + /* Emit refcount if necessary */ + ASSERT(nbno > cbno); + if (bag_count(stack_top) != old_stack_nr) { + if (old_stack_nr > 1) { + refcount_emit(mp, agno, cbno, + nbno - cbno, + old_stack_nr); + } + cbno = nbno; + } + + /* Stack empty, go find the next rmap */ + if (bag_count(stack_top) == 0) + break; + old_stack_nr = bag_count(stack_top); + sbno = nbno; + + /* Set nbno to the bno of the next refcount change */ + if (n < slab_count(rmaps)) + nbno = array_cur->rm_startblock; + else + nbno = NULLAGBLOCK; + foreach_bag_ptr(stack_top, idx, rmap) { + nbno = min(nbno, RMAP_END(rmap)); + } + + /* Emit reverse mappings, if needed */ + ASSERT(nbno > sbno); + } + } +err: + free_bag(&stack_top); + free_slab_cursor(&rmaps_cur); + + return error; +} +#undef RMAP_END + +/* * Return the number of rmap objects for an AG. */ size_t diff --git a/repair/rmap.h b/repair/rmap.h index 7106dfc..01dec9f 100644 --- a/repair/rmap.h +++ b/repair/rmap.h @@ -49,6 +49,8 @@ extern __int64_t rmap_diffkeys(struct xfs_rmap_irec *kp1, extern void rmap_high_key_from_rec(struct xfs_rmap_irec *rec, struct xfs_rmap_irec *key); +extern int compute_refcounts(struct xfs_mount *, xfs_agnumber_t); + extern void fix_freelist(struct xfs_mount *, xfs_agnumber_t, bool); extern void rmap_store_agflcount(struct xfs_mount *, xfs_agnumber_t, int); -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html