[PATCH 3/3] xfs_repair: rebuild block mappings from rmapbt data

"Darrick J. Wong" <djwong@xxxxxxxxxx> · Mon, 25 Mar 2024 20:25:32 -0700

From: Darrick J. Wong <djwong@xxxxxxxxxx>

Use rmap records to rebuild corrupt inode forks instead of zapping
the whole inode if we think the rmap data is reasonably sane.

Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
Reviewed-by: Christoph Hellwig <hch@xxxxxx>
Reviewed-by: Bill O'Donnell <bodonnel@xxxxxxxxxx>
---
 include/xfs_trans.h      |    2 
 libfrog/util.h           |    5 
 libxfs/libxfs_api_defs.h |   15 +
 libxfs/trans.c           |   48 +++
 repair/Makefile          |    2 
 repair/agbtree.c         |    2 
 repair/bmap_repair.c     |  748 ++++++++++++++++++++++++++++++++++++++++++++++
 repair/bmap_repair.h     |   13 +
 repair/bulkload.c        |  205 ++++++++++++-
 repair/bulkload.h        |   24 +
 repair/dinode.c          |   54 +++
 repair/rmap.c            |    2 
 repair/rmap.h            |    1 
 13 files changed, 1110 insertions(+), 11 deletions(-)
 create mode 100644 repair/bmap_repair.c
 create mode 100644 repair/bmap_repair.h

diff --git a/include/xfs_trans.h b/include/xfs_trans.h
index ab298ccfe556..ac82c3bc480a 100644
--- a/include/xfs_trans.h
+++ b/include/xfs_trans.h
@@ -98,6 +98,8 @@ int	libxfs_trans_alloc_rollable(struct xfs_mount *mp, uint blocks,
 int	libxfs_trans_alloc_empty(struct xfs_mount *mp, struct xfs_trans **tpp);
 int	libxfs_trans_commit(struct xfs_trans *);
 void	libxfs_trans_cancel(struct xfs_trans *);
+int	libxfs_trans_reserve_more(struct xfs_trans *tp, uint blocks,
+			uint rtextents);
 
 /* cancel dfops associated with a transaction */
 void xfs_defer_cancel(struct xfs_trans *);
diff --git a/libfrog/util.h b/libfrog/util.h
index 1b97881bf168..5df95e69cd11 100644
--- a/libfrog/util.h
+++ b/libfrog/util.h
@@ -8,4 +8,9 @@
 
 unsigned int	log2_roundup(unsigned int i);
 
+#define min_t(type,x,y) /* typed minimum; each argument evaluated once */ \
+	({ type lhs_ = (x); type rhs_ = (y); (rhs_ > lhs_) ? lhs_ : rhs_; })
+#define max_t(type,x,y) /* typed maximum; each argument evaluated once */ \
+	({ type lhs_ = (x); type rhs_ = (y); (rhs_ < lhs_) ? lhs_ : rhs_; })
+
 #endif /* __LIBFROG_UTIL_H__ */
diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index 28960317ab6b..769733ec2ee3 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -32,7 +32,7 @@
 #define xfs_alloc_fix_freelist		libxfs_alloc_fix_freelist
 #define xfs_alloc_min_freelist		libxfs_alloc_min_freelist
 #define xfs_alloc_read_agf		libxfs_alloc_read_agf
-#define xfs_alloc_vextent		libxfs_alloc_vextent
+#define xfs_alloc_vextent_start_ag	libxfs_alloc_vextent_start_ag
 
 #define xfs_ascii_ci_hashname		libxfs_ascii_ci_hashname
 
@@ -44,11 +44,18 @@
 #define xfs_attr_shortform_verify	libxfs_attr_shortform_verify
 
 #define __xfs_bmap_add_free		__libxfs_bmap_add_free
+#define xfs_bmap_validate_extent	libxfs_bmap_validate_extent
 #define xfs_bmapi_read			libxfs_bmapi_read
+#define xfs_bmapi_remap			libxfs_bmapi_remap
 #define xfs_bmapi_write			libxfs_bmapi_write
 #define xfs_bmap_last_offset		libxfs_bmap_last_offset
+#define xfs_bmbt_calc_size		libxfs_bmbt_calc_size
+#define xfs_bmbt_commit_staged_btree	libxfs_bmbt_commit_staged_btree
+#define xfs_bmbt_disk_get_startoff	libxfs_bmbt_disk_get_startoff
+#define xfs_bmbt_disk_set_all		libxfs_bmbt_disk_set_all
 #define xfs_bmbt_maxlevels_ondisk	libxfs_bmbt_maxlevels_ondisk
 #define xfs_bmbt_maxrecs		libxfs_bmbt_maxrecs
+#define xfs_bmbt_stage_cursor		libxfs_bmbt_stage_cursor
 #define xfs_bmdr_maxrecs		libxfs_bmdr_maxrecs
 
 #define xfs_btree_bload			libxfs_btree_bload
@@ -117,6 +124,7 @@
 
 #define xfs_finobt_calc_reserves	libxfs_finobt_calc_reserves
 #define xfs_free_extent			libxfs_free_extent
+#define xfs_free_extent_later		libxfs_free_extent_later
 #define xfs_free_perag			libxfs_free_perag
 #define xfs_fs_geometry			libxfs_fs_geometry
 #define xfs_highbit32			libxfs_highbit32
@@ -127,7 +135,10 @@
 #define xfs_ialloc_read_agi		libxfs_ialloc_read_agi
 #define xfs_idata_realloc		libxfs_idata_realloc
 #define xfs_idestroy_fork		libxfs_idestroy_fork
+#define xfs_iext_first			libxfs_iext_first
+#define xfs_iext_insert_raw		libxfs_iext_insert_raw
 #define xfs_iext_lookup_extent		libxfs_iext_lookup_extent
+#define xfs_iext_next			libxfs_iext_next
 #define xfs_ifork_zap_attr		libxfs_ifork_zap_attr
 #define xfs_imap_to_bp			libxfs_imap_to_bp
 #define xfs_initialize_perag		libxfs_initialize_perag
@@ -174,10 +185,12 @@
 #define xfs_rmapbt_stage_cursor		libxfs_rmapbt_stage_cursor
 #define xfs_rmap_compare		libxfs_rmap_compare
 #define xfs_rmap_get_rec		libxfs_rmap_get_rec
+#define xfs_rmap_ino_bmbt_owner		libxfs_rmap_ino_bmbt_owner
 #define xfs_rmap_irec_offset_pack	libxfs_rmap_irec_offset_pack
 #define xfs_rmap_irec_offset_unpack	libxfs_rmap_irec_offset_unpack
 #define xfs_rmap_lookup_le		libxfs_rmap_lookup_le
 #define xfs_rmap_lookup_le_range	libxfs_rmap_lookup_le_range
+#define xfs_rmap_query_all		libxfs_rmap_query_all
 #define xfs_rmap_query_range		libxfs_rmap_query_range
 
 #define xfs_rtbitmap_getword		libxfs_rtbitmap_getword
diff --git a/libxfs/trans.c b/libxfs/trans.c
index bd1186b24e62..8143a6a99f62 100644
--- a/libxfs/trans.c
+++ b/libxfs/trans.c
@@ -1143,3 +1143,51 @@ libxfs_trans_alloc_inode(
 	*tpp = tp;
 	return 0;
 }
+
+/*
+ * Try to reserve more blocks for a transaction.  The single use case we
+ * support is for offline repair -- use a transaction to gather data without
+ * fear of btree cycle deadlocks; calculate how many blocks we really need
+ * from that data; and only then start modifying data.  Returns 0 or -ENOSPC;
+ * on failure the caller has to be able to cancel the transaction.
+ */
+int
+libxfs_trans_reserve_more(
+	struct xfs_trans	*tp,
+	uint			blocks,
+	uint			rtextents)
+{
+	int			error = 0;
+
+	ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+	/*
+	 * Grow the transaction's block reservation, but only if the
+	 * superblock free block count covers the request.  sb_fdblocks
+	 * itself is not modified here.
+	 */
+	if (blocks > 0) {
+		if (tp->t_mountp->m_sb.sb_fdblocks < blocks)
+			return -ENOSPC;
+		tp->t_blk_res += blocks;
+	}
+
+	/*
+	 * Realtime extents are only checked against sb_rextents; nothing
+	 * is recorded in the transaction for them.  If the check fails,
+	 * back out the block reservation added above.
+	 */
+	if (rtextents > 0) {
+		if (tp->t_mountp->m_sb.sb_rextents < rtextents) {
+			error = -ENOSPC;
+			goto out_blocks;
+		}
+	}
+
+	return 0;
+out_blocks:
+	if (blocks > 0)
+		tp->t_blk_res -= blocks;
+
+	return error;
+}
diff --git a/repair/Makefile b/repair/Makefile
index 2c40e59a30fc..e5014deb0ce8 100644
--- a/repair/Makefile
+++ b/repair/Makefile
@@ -16,6 +16,7 @@ HFILES = \
 	avl.h \
 	bulkload.h \
 	bmap.h \
+	bmap_repair.h \
 	btree.h \
 	da_util.h \
 	dinode.h \
@@ -41,6 +42,7 @@ CFILES = \
 	avl.c \
 	bulkload.c \
 	bmap.c \
+	bmap_repair.c \
 	btree.c \
 	da_util.c \
 	dino_chunks.c \
diff --git a/repair/agbtree.c b/repair/agbtree.c
index c6f0512fe7de..38f3f7b8feac 100644
--- a/repair/agbtree.c
+++ b/repair/agbtree.c
@@ -22,7 +22,7 @@ init_rebuild(
 {
 	memset(btr, 0, sizeof(struct bt_rebuild));
 
-	bulkload_init_ag(&btr->newbt, sc, oinfo);
+	bulkload_init_ag(&btr->newbt, sc, oinfo, NULLFSBLOCK);
 	btr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
 	bulkload_estimate_ag_slack(sc, &btr->bload, est_agfreeblocks);
 }
diff --git a/repair/bmap_repair.c b/repair/bmap_repair.c
new file mode 100644
index 000000000000..1dbcafb22736
--- /dev/null
+++ b/repair/bmap_repair.c
@@ -0,0 +1,748 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2019-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@xxxxxxxxxx>
+ */
+#include <libxfs.h>
+#include "btree.h"
+#include "err_protos.h"
+#include "libxlog.h"
+#include "incore.h"
+#include "globals.h"
+#include "dinode.h"
+#include "slab.h"
+#include "rmap.h"
+#include "bulkload.h"
+#include "bmap_repair.h"
+#include "libfrog/util.h"
+
+/*
+ * Inode Fork Block Mapping (BMBT) Repair
+ * ======================================
+ *
+ * Gather all the rmap records for the inode and fork we're fixing, reset the
+ * incore fork, then recreate the btree.
+ */
+struct xrep_bmap {
+	/* Slab of new bmbt records, and the cursor used to read them back. */
+	struct xfs_slab		*bmap_records;
+	struct xfs_slab_cursor	*bmap_cursor;
+
+	/* Staging state for the new fork, and the btree bulk-load parameters. */
+	struct bulkload		new_fork_info;
+	struct xfs_btree_bload	bmap_bload;
+
+	struct repair_ctx	*sc;	/* mount, inode and transaction in use */
+
+	/* How many blocks did we find allocated to this file (either fork)? */
+	xfs_rfsblock_t		nblocks;
+
+	/* How many bmbt blocks did we find for this fork? */
+	xfs_rfsblock_t		old_bmbt_block_count;
+
+	/* Which fork are we fixing? */
+	int			whichfork;
+};
+
+/* Record one rmap as bmbt records of at most XFS_MAX_BMBT_EXTLEN blocks. */
+STATIC int
+xrep_bmap_from_rmap(
+	struct xrep_bmap	*rb,
+	xfs_fileoff_t		startoff,
+	xfs_fsblock_t		startblock,
+	xfs_filblks_t		blockcount,
+	bool			unwritten)
+{
+	struct xfs_bmbt_rec	rbe;
+	struct xfs_bmbt_irec	irec;
+	int			error = 0;
+
+	irec.br_startoff = startoff;
+	irec.br_startblock = startblock;
+	irec.br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+
+	do {
+		xfs_failaddr_t	fa;
+
+		irec.br_blockcount = min_t(xfs_filblks_t, blockcount,
+				XFS_MAX_BMBT_EXTLEN);
+
+		fa = libxfs_bmap_validate_extent(rb->sc->ip, rb->whichfork,
+				&irec);
+		if (fa)
+			return -EFSCORRUPTED;	/* mapping failed validation */
+
+		libxfs_bmbt_disk_set_all(&rbe, &irec);
+
+		error = slab_add(rb->bmap_records, &rbe);
+		if (error)
+			return error;
+
+		irec.br_startblock += irec.br_blockcount;
+		irec.br_startoff += irec.br_blockcount;
+		blockcount -= irec.br_blockcount;
+	} while (blockcount > 0);	/* until the whole rmap is recorded */
+
+	return 0;
+}
+
+/* Sanity-check one rmap.  Returns 0 or -EFSCORRUPTED (libxfs convention). */
+STATIC int
+xrep_bmap_check_fork_rmap(
+	struct xrep_bmap		*rb,
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec)
+{
+	struct repair_ctx		*sc = rb->sc;
+
+	/*
+	 * Data extents for rt files are never stored on the data device, but
+	 * everything else (xattrs, bmbt blocks) can be.
+	 */
+	if (XFS_IS_REALTIME_INODE(sc->ip) &&
+	    !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))
+		return -EFSCORRUPTED;	/* negated again by the AG scan */
+
+	/* Check that this is within the AG. */
+	if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock,
+				rec->rm_blockcount))
+		return -EFSCORRUPTED;
+
+	/* No contradictory flags. */
+	if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) &&
+	    (rec->rm_flags & XFS_RMAP_UNWRITTEN))
+		return -EFSCORRUPTED;
+
+	/* Check the file offset range. */
+	if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+	    !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount))
+		return -EFSCORRUPTED;
+
+	return 0;
+}
+
+/* rmap query callback: record extents that belong to this inode's fork. */
+STATIC int
+xrep_bmap_walk_rmap(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_bmap		*rb = priv;
+	struct xfs_mount		*mp = cur->bc_mp;
+	xfs_fsblock_t			fsbno;
+	int				error;
+
+	/* Skip extents which are not owned by this inode. */
+	if (rec->rm_owner != rb->sc->ip->i_ino)
+		return 0;
+
+	error = xrep_bmap_check_fork_rmap(rb, cur, rec);
+	if (error)
+		return error;
+
+	/*
+	 * Record all blocks allocated to this file even if the extent isn't
+	 * for the fork we're rebuilding so that we can reset di_nblocks later.
+	 */
+	rb->nblocks += rec->rm_blockcount;
+
+	/* If this rmap isn't for the fork we want, we're done. */
+	if (rb->whichfork == XFS_DATA_FORK &&
+	    (rec->rm_flags & XFS_RMAP_ATTR_FORK))
+		return 0;
+	if (rb->whichfork == XFS_ATTR_FORK &&
+	    !(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+		return 0;
+
+	fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno,
+			rec->rm_startblock);
+
+	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+		rb->old_bmbt_block_count += rec->rm_blockcount;
+		return 0;	/* old bmbt block: counted, not remapped */
+	}
+
+	return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno,
+			rec->rm_blockcount,
+			rec->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
+/* qsort-style comparator ordering bmbt records by starting file offset. */
+static int
+xrep_bmap_extent_cmp(
+	const void			*a,
+	const void			*b)
+{
+	xfs_fileoff_t			off_a;
+	xfs_fileoff_t			off_b;
+
+	off_a = libxfs_bmbt_disk_get_startoff((struct xfs_bmbt_rec *)a);
+	off_b = libxfs_bmbt_disk_get_startoff((struct xfs_bmbt_rec *)b);
+
+	if (off_a < off_b)
+		return -1;
+	if (off_a > off_b)
+		return 1;
+	return 0;
+}
+
+/* Scan one AG's rmap btree for mappings that we can turn into extent maps. */
+STATIC int
+xrep_bmap_scan_ag(
+	struct xrep_bmap	*rb,
+	struct xfs_perag	*pag)
+{
+	struct repair_ctx	*sc = rb->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_buf		*agf_bp = NULL;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	error = -libxfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp);
+	if (error)
+		return error;
+	if (!agf_bp)
+		return ENOMEM;
+	cur = libxfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, pag);
+	error = -libxfs_rmap_query_all(cur, xrep_bmap_walk_rmap, rb);
+	libxfs_btree_del_cursor(cur, error);
+	libxfs_trans_brelse(sc->tp, agf_bp);	/* done with the AGF buffer */
+	return error;
+}
+
+/*
+ * Collect block mappings for this fork of this inode by scanning the rmap
+ * btree of every AG, stopping at the first error.  Caller is responsible for
+ * cleaning up the list if anything goes wrong.
+ */
+STATIC int
+xrep_bmap_find_mappings(
+	struct xrep_bmap	*rb)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	int			error;
+
+	/* Iterate the rmaps for extents. */
+	for_each_perag(rb->sc->mp, agno, pag) {
+		error = xrep_bmap_scan_ag(rb, pag);
+		if (error) {
+			libxfs_perag_put(pag);
+			return error;
+		}
+	}
+
+	return 0;
+}
+
+/* Bulk-load callback: copy the next nr_wanted records into block at idx. */
+STATIC int
+xrep_bmap_get_records(
+	struct xfs_btree_cur	*cur,
+	unsigned int		idx,
+	struct xfs_btree_block	*block,
+	unsigned int		nr_wanted,
+	void			*priv)
+{
+	struct xfs_bmbt_rec	*rec;
+	struct xfs_bmbt_irec	*irec = &cur->bc_rec.b;
+	struct xrep_bmap	*rb = priv;
+	union xfs_btree_rec	*block_rec;
+	unsigned int		loaded;
+
+	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+		rec = pop_slab_cursor(rb->bmap_cursor);
+		libxfs_bmbt_disk_get_all(rec, irec);
+
+		block_rec = libxfs_btree_rec_addr(cur, idx, block);
+		cur->bc_ops->init_rec_from_cur(cur, block_rec);
+	}
+
+	return loaded;	/* number of records stored in the block */
+}
+
+/* Bulk-load callback: feed one of the reserved blocks to the new btree. */
+STATIC int
+xrep_bmap_claim_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	void			*priv)
+{
+	struct xrep_bmap        *rb = priv;
+
+	return bulkload_claim_block(cur, &rb->new_fork_info, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_bmap_iroot_size(
+	struct xfs_btree_cur	*cur,
+	unsigned int		level,
+	unsigned int		nr_this_level,
+	void			*priv)
+{
+	ASSERT(level > 0);	/* callers must not pass the leaf level */
+
+	return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level);
+}
+
+/* Recompute i_nblocks from the rmap scan and log the inode core. */
+STATIC int
+xrep_bmap_reset_counters(
+	struct xrep_bmap	*rb)
+{
+	struct repair_ctx	*sc = rb->sc;
+	struct xbtree_ifakeroot	*ifake = &rb->new_fork_info.ifake;
+	int64_t			delta;
+
+	/*
+	 * nblocks includes this fork's old bmbt blocks, so swap those for
+	 * the blocks now used by the new fork (ifake->if_blocks).
+	 */
+	delta = ifake->if_blocks - rb->old_bmbt_block_count;
+	sc->ip->i_nblocks = rb->nblocks + delta;
+	libxfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+
+	/* Quotas don't exist so we're done. */
+	return 0;
+}
+
+/*
+ * Ensure that the inode being repaired is ready to handle a certain number of
+ * extents, setting XFS_DIFLAG2_NREXT64 if that is needed and allowed, or
+ * return EFSCORRUPTED.  The inode is logged to the repair transaction.
+ */
+static int
+xrep_ino_ensure_extent_count(
+	struct repair_ctx	*sc,
+	int			whichfork,
+	xfs_extnum_t		nextents)
+{
+	xfs_extnum_t		max_extents;
+	bool			large_extcount;
+
+	large_extcount = xfs_inode_has_large_extent_counts(sc->ip);
+	max_extents = xfs_iext_max_nextents(large_extcount, whichfork);
+	if (nextents <= max_extents)
+		return 0;	/* fits within the inode's current limit */
+	if (large_extcount)
+		return EFSCORRUPTED;	/* already at the larger limit */
+	if (!xfs_has_large_extent_counts(sc->mp))
+		return EFSCORRUPTED;	/* fs lacks large extent counts */
+
+	max_extents = xfs_iext_max_nextents(true, whichfork);
+	if (nextents > max_extents)
+		return EFSCORRUPTED;
+
+	sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+	libxfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+	return 0;
+}
+
+/*
+ * Create a new iext tree and load it with block mappings.  If the inode is
+ * in extents format, that's all we need to do to commit the new mappings.
+ * If it is in btree format, this takes care of preloading the incore tree.
+ */
+STATIC int
+xrep_bmap_extents_load(
+	struct xrep_bmap	*rb,
+	struct xfs_btree_cur	*bmap_cur,
+	uint64_t		nextents)
+{
+	struct xfs_iext_cursor	icur;
+	struct xbtree_ifakeroot	*ifake = &rb->new_fork_info.ifake;
+	struct xfs_ifork	*ifp = ifake->if_fork;
+	uint64_t		i;	/* same width as nextents: no wrap */
+	int			error;
+
+	ASSERT(ifp->if_bytes == 0);
+
+	error = init_slab_cursor(rb->bmap_records, xrep_bmap_extent_cmp,
+			&rb->bmap_cursor);
+	if (error)
+		return error;
+
+	/* Add all the mappings to the incore extent tree. */
+	libxfs_iext_first(ifp, &icur);
+	for (i = 0; i < nextents; i++) {
+		struct xfs_bmbt_rec	*rec;
+
+		rec = pop_slab_cursor(rb->bmap_cursor);
+		libxfs_bmbt_disk_get_all(rec, &bmap_cur->bc_rec.b);
+		libxfs_iext_insert_raw(ifp, &icur, &bmap_cur->bc_rec.b);
+		ifp->if_nextents++;
+		libxfs_iext_next(ifp, &icur);
+	}
+	free_slab_cursor(&rb->bmap_cursor);
+
+	return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork,
+			ifp->if_nextents);
+}
+
+/*
+ * Reserve new btree blocks, bulk load the bmap records into the ondisk btree,
+ * and load the incore extent tree.  Returns 0 or the first error hit.
+ */
+STATIC int
+xrep_bmap_btree_load(
+	struct xrep_bmap	*rb,
+	struct xfs_btree_cur	*bmap_cur,
+	uint64_t		nextents)
+{
+	struct repair_ctx	*sc = rb->sc;
+	int			error;
+
+	rb->bmap_bload.get_records = xrep_bmap_get_records;
+	rb->bmap_bload.claim_block = xrep_bmap_claim_block;
+	rb->bmap_bload.iroot_size = xrep_bmap_iroot_size;
+	rb->bmap_bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
+
+	/*
+	 * Always make the btree as small as possible, since we might need the
+	 * space to rebuild the space metadata btrees in later phases.
+	 */
+	rb->bmap_bload.leaf_slack = 0;
+	rb->bmap_bload.node_slack = 0;
+
+	/* Compute how many blocks we'll need. */
+	error = -libxfs_btree_bload_compute_geometry(bmap_cur, &rb->bmap_bload,
+			nextents);
+	if (error)
+		return error;
+
+	/*
+	 * Use the computed geometry (nr_blocks) as the number of blocks we
+	 * need to rebuild the entire bmap, and pump up our transaction to
+	 * have sufficient block reservation.
+	 */
+	error = -libxfs_trans_reserve_more(sc->tp, rb->bmap_bload.nr_blocks, 0);
+	if (error)
+		return error;
+
+	/* Allocate the space we'll need for the new btree. */
+	error = bulkload_alloc_file_blocks(&rb->new_fork_info,
+			rb->bmap_bload.nr_blocks);
+	if (error)
+		return error;
+
+	/* Add all observed bmap records. */
+	error = init_slab_cursor(rb->bmap_records, xrep_bmap_extent_cmp,
+			&rb->bmap_cursor);
+	if (error)
+		return error;
+	error = -libxfs_btree_bload(bmap_cur, &rb->bmap_bload, rb);
+	free_slab_cursor(&rb->bmap_cursor);
+	if (error)
+	       return error;
+
+	/*
+	 * Load the new bmap records into the new incore extent tree to
+	 * preserve delalloc reservations for regular files.  The directory
+	 * code loads the extent tree during xfs_dir_open and assumes
+	 * thereafter that it remains loaded, so we must not violate that
+	 * assumption.
+	 */
+	return xrep_bmap_extents_load(rb, bmap_cur, nextents);
+}
+
+/*
+ * Use the collected bmap information to stage a new bmap fork.  If this is
+ * successful we'll return with the new fork installed in the inode and the
+ * repair transaction rolled.
+ */
+STATIC int
+xrep_bmap_build_new_fork(
+	struct xrep_bmap	*rb)
+{
+	struct xfs_owner_info	oinfo;
+	struct repair_ctx	*sc = rb->sc;
+	struct xfs_btree_cur	*bmap_cur;
+	struct xbtree_ifakeroot	*ifake = &rb->new_fork_info.ifake;
+	uint64_t		nextents;
+	int			error;
+
+	/*
+	 * Sort the bmap extents by file offset to avoid btree splits when we
+	 * rebuild the bmbt btree.
+	 */
+	qsort_slab(rb->bmap_records, xrep_bmap_extent_cmp);
+
+	/*
+	 * Prepare to construct the new fork by initializing the new btree
+	 * structure and creating a fake ifork in the ifakeroot structure.
+	 */
+	libxfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork);
+	bulkload_init_inode(&rb->new_fork_info, sc, rb->whichfork, &oinfo);
+	bmap_cur = libxfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake);
+
+	/*
+	 * Figure out the size and format of the new fork, then fill it with
+	 * all the bmap records we've found.  Join the inode to the transaction
+	 * so that we can roll the transaction while holding the inode locked.
+	 */
+	libxfs_trans_ijoin(sc->tp, sc->ip, 0);
+	nextents = slab_count(rb->bmap_records);
+	if (nextents <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) {
+		ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS;
+		error = xrep_bmap_extents_load(rb, bmap_cur, nextents);
+	} else {
+		ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE;
+		error = xrep_bmap_btree_load(rb, bmap_cur, nextents);
+	}
+	if (error)
+		goto err_cur;
+
+	/*
+	 * Install the new fork in the inode.  After this point the old mapping
+	 * data are no longer accessible and the new tree is live.  We delete
+	 * the cursor immediately after committing the staged root because the
+	 * staged fork might be in extents format.
+	 */
+	libxfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork);
+	libxfs_btree_del_cursor(bmap_cur, 0);
+
+	/* Reset the inode counters now that we've changed the fork. */
+	error = xrep_bmap_reset_counters(rb);
+	if (error)
+		goto err_newbt;
+
+	/* Dispose of any unused blocks and the accounting information. */
+	error = bulkload_commit(&rb->new_fork_info);
+	if (error)
+		return error;
+
+	return -libxfs_trans_roll_inode(&sc->tp, sc->ip);
+err_cur:
+	if (bmap_cur)
+		libxfs_btree_del_cursor(bmap_cur, error);
+err_newbt:
+	bulkload_cancel(&rb->new_fork_info);
+	return error;
+}
+
+/* Check for garbage inputs.  Returns ECANCELED if there's nothing to do. */
+STATIC int
+xrep_bmap_check_inputs(
+	struct repair_ctx	*sc,
+	int			whichfork)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, whichfork);
+
+	ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
+
+	if (!xfs_has_rmapbt(sc->mp))
+		return EOPNOTSUPP;	/* no rmap data to rebuild from */
+
+	/* No fork means nothing to rebuild. */
+	if (!ifp)
+		return ECANCELED;
+
+	/*
+	 * We only know how to repair extent mappings, which is to say that we
+	 * only support extents and btree fork format.  Repairs to a local
+	 * format fork require a higher level repair function, so we do not
+	 * have any work to do here.
+	 */
+	switch (ifp->if_format) {
+	case XFS_DINODE_FMT_DEV:
+	case XFS_DINODE_FMT_LOCAL:
+	case XFS_DINODE_FMT_UUID:
+		return ECANCELED;
+	case XFS_DINODE_FMT_EXTENTS:
+	case XFS_DINODE_FMT_BTREE:
+		break;
+	default:
+		return EFSCORRUPTED;	/* unrecognized fork format */
+	}
+
+	if (whichfork == XFS_ATTR_FORK)
+		return 0;	/* the checks below apply to data forks only */
+
+	/* Only files, symlinks, and directories get to have data forks. */
+	switch (VFS_I(sc->ip)->i_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		/* ok */
+		break;
+	default:
+		return EINVAL;
+	}
+
+	/* Don't know how to rebuild realtime data forks. */
+	if (XFS_IS_REALTIME_INODE(sc->ip))
+		return EOPNOTSUPP;
+
+	return 0;
+}
+
+/* Rebuild one fork of sc->ip from rmap data.  Returns 0 or an error code. */
+STATIC int
+xrep_bmap(
+	struct repair_ctx	*sc,
+	int			whichfork)
+{
+	struct xrep_bmap	*rb;
+	int			error = 0;
+
+	error = xrep_bmap_check_inputs(sc, whichfork);
+	if (error == ECANCELED)
+		return 0;	/* nothing to rebuild is not a failure */
+	if (error)
+		return error;
+
+	rb = kmem_zalloc(sizeof(struct xrep_bmap), KM_NOFS | KM_MAYFAIL);
+	if (!rb)
+		return ENOMEM;
+	rb->sc = sc;
+	rb->whichfork = whichfork;
+
+	/* Set up the slab that will hold the new bmbt records. */
+	error = init_slab(&rb->bmap_records, sizeof(struct xfs_bmbt_rec));
+	if (error)
+		goto out_rb;
+
+	/* Collect all reverse mappings for this fork's extents. */
+	error = xrep_bmap_find_mappings(rb);
+	if (error)
+		goto out_bitmap;
+
+	/* Rebuild the bmap information. */
+	error = xrep_bmap_build_new_fork(rb);
+
+	/*
+	 * We don't need to free the old bmbt blocks because we're rebuilding
+	 * all the space metadata later.
+	 */
+
+out_bitmap:
+	free_slab(&rb->bmap_records);
+out_rb:
+	kmem_free(rb);
+	return error;
+}
+
+/* Rebuild one fork of inode @ino.  May replace *ino_bpp and *dinop. */
+int
+rebuild_bmap(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino,
+	int			whichfork,
+	unsigned long		nr_extents,
+	struct xfs_buf		**ino_bpp,
+	struct xfs_dinode	**dinop,
+	int			*dirty)
+{
+	struct repair_ctx	sc = {
+		.mp		= mp,
+	};
+	const struct xfs_buf_ops *bp_ops;
+	unsigned long		boffset;
+	unsigned long long	resblks;
+	xfs_daddr_t		bp_bn;
+	int			bp_length;
+	int			error, err2;
+
+	bp_bn = xfs_buf_daddr(*ino_bpp);	/* saved for the out_cancel re-read */
+	bp_length = (*ino_bpp)->b_length;
+	bp_ops = (*ino_bpp)->b_ops;
+	boffset = (char *)(*dinop) - (char *)(*ino_bpp)->b_addr;
+
+	/*
+	 * Bail out if the inode didn't think it had extents.  Otherwise, zap
+	 * it back to a zero-extents fork so that we can rebuild it.
+	 */
+	switch (whichfork) {
+	case XFS_DATA_FORK:
+		if ((*dinop)->di_nextents == 0)
+			return 0;
+		(*dinop)->di_format = XFS_DINODE_FMT_EXTENTS;
+		(*dinop)->di_nextents = 0;
+		libxfs_dinode_calc_crc(mp, *dinop);
+		*dirty = 1;
+		break;
+	case XFS_ATTR_FORK:
+		if ((*dinop)->di_anextents == 0)
+			return 0;
+		(*dinop)->di_aformat = XFS_DINODE_FMT_EXTENTS;
+		(*dinop)->di_anextents = 0;
+		libxfs_dinode_calc_crc(mp, *dinop);
+		*dirty = 1;
+		break;
+	default:
+		return EINVAL;
+	}
+
+	resblks = libxfs_bmbt_calc_size(mp, nr_extents);
+	error = -libxfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, 0,
+			0, &sc.tp);
+	if (error)
+		return error;
+
+	/*
+	 * Repair magic: the caller passed us the inode cluster buffer for the
+	 * inode.  The _iget call grabs the buffer to load the incore inode, so
+	 * the buffer must be attached to the transaction to avoid recursing
+	 * the buffer lock.
+	 *
+	 * Unfortunately, the _iget call drops the buffer once the inode is
+	 * loaded, so if we've made any changes we have to log the buffer, hold
+	 * it, and roll the transaction.  This persists the caller's changes
+	 * and maintains our ownership of the cluster buffer.
+	 */
+	libxfs_trans_bjoin(sc.tp, *ino_bpp);
+	if (*dirty) {
+		unsigned int	end = BBTOB((*ino_bpp)->b_length) - 1;
+
+		libxfs_trans_log_buf(sc.tp, *ino_bpp, 0, end);
+		*dirty = 0;	/* the whole buffer has been logged above */
+
+		libxfs_trans_bhold(sc.tp, *ino_bpp);
+		error = -libxfs_trans_roll(&sc.tp);
+		libxfs_trans_bjoin(sc.tp, *ino_bpp);
+		if (error)
+			goto out_cancel;
+	}
+
+	/* Grab the inode and fix the bmbt. */
+	error = -libxfs_iget(mp, sc.tp, ino, 0, &sc.ip);
+	if (error)
+		goto out_cancel;
+	error = xrep_bmap(&sc, whichfork);
+	if (error)
+		libxfs_trans_cancel(sc.tp);
+	else
+		error = -libxfs_trans_commit(sc.tp);
+
+	/*
+	 * Rebuilding the inode fork rolled the transaction, so we need to
+	 * re-grab the inode cluster buffer and dinode pointer for the caller.
+	 * This is done whether or not the rebuild succeeded.
+	 */
+	err2 = -libxfs_imap_to_bp(mp, NULL, &sc.ip->i_imap, ino_bpp);
+	if (err2)
+		do_error(
+ _("Unable to re-grab inode cluster buffer after failed repair of inode %llu, error %d.\n"),
+				(unsigned long long)ino, err2);
+	*dinop = xfs_buf_offset(*ino_bpp, sc.ip->i_imap.im_boffset);
+	libxfs_irele(sc.ip);
+
+	return error;
+
+out_cancel:
+	libxfs_trans_cancel(sc.tp);
+
+	/*
+	 * Try to regrab the old buffer so we have something to return to the
+	 * caller.
+	 */
+	err2 = -libxfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, bp_bn,
+			bp_length, 0, ino_bpp, bp_ops);
+	if (err2)
+		do_error(
+ _("Unable to re-grab inode cluster buffer after failed repair of inode %llu, error %d.\n"),
+				(unsigned long long)ino, err2);
+	*dinop = xfs_buf_offset(*ino_bpp, boffset);
+	return error;
+}
diff --git a/repair/bmap_repair.h b/repair/bmap_repair.h
new file mode 100644
index 000000000000..6d55359490a0
--- /dev/null
+++ b/repair/bmap_repair.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2019-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@xxxxxxxxxx>
+ */
+#ifndef REBUILD_H_
+#define REBUILD_H_
+
+int rebuild_bmap(struct xfs_mount *mp, xfs_ino_t ino, int whichfork,
+		 unsigned long nr_extents, struct xfs_buf **ino_bpp,
+		 struct xfs_dinode **dinop, int *dirty);
+
+#endif /* REBUILD_H_ */
diff --git a/repair/bulkload.c b/repair/bulkload.c
index 18158c397f56..a97839f549dd 100644
--- a/repair/bulkload.c
+++ b/repair/bulkload.c
@@ -14,14 +14,29 @@ void
 bulkload_init_ag(
 	struct bulkload			*bkl,
 	struct repair_ctx		*sc,
-	const struct xfs_owner_info	*oinfo)
+	const struct xfs_owner_info	*oinfo,
+	xfs_fsblock_t			alloc_hint)
 {
 	memset(bkl, 0, sizeof(struct bulkload));
 	bkl->sc = sc;
 	bkl->oinfo = *oinfo; /* structure copy */
+	bkl->alloc_hint = alloc_hint;
 	INIT_LIST_HEAD(&bkl->resv_list);
 }
 
+/* Set up staging for a new inode fork btree, hinting at the inode's block. */
+void
+bulkload_init_inode(
+	struct bulkload			*bkl,
+	struct repair_ctx		*sc,
+	int				whichfork,
+	const struct xfs_owner_info	*oinfo)
+{
+	bulkload_init_ag(bkl, sc, oinfo, XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino));
+	bkl->ifake.if_fork = kmem_cache_zalloc(xfs_ifork_cache, 0);
+	bkl->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
+}
+
 /* Designate specific blocks to be used to build our new btree. */
 static int
 bulkload_add_blocks(
@@ -71,17 +86,199 @@ bulkload_add_extent(
 	return bulkload_add_blocks(bkl, pag, &args);
 }
 
+/* If the allocation hint isn't a valid fsbno, reset it to early in AG 0. */
+static inline void
+bulkload_validate_file_alloc_hint(
+	struct bulkload		*bkl)
+{
+	struct repair_ctx	*sc = bkl->sc;
+
+	if (libxfs_verify_fsbno(sc->mp, bkl->alloc_hint))
+		return;
+
+	bkl->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
+}
+
+/* Allocate nr_blocks for the new file-based btree; returns 0 or an errno. */
+int
+bulkload_alloc_file_blocks(
+	struct bulkload		*bkl,
+	uint64_t		nr_blocks)
+{
+	struct repair_ctx	*sc = bkl->sc;
+	struct xfs_mount	*mp = sc->mp;
+	int			error = 0;
+
+	while (nr_blocks > 0) {
+		struct xfs_alloc_arg	args = {
+			.tp		= sc->tp,
+			.mp		= mp,
+			.oinfo		= bkl->oinfo,
+			.minlen		= 1,
+			.maxlen		= nr_blocks,
+			.prod		= 1,
+			.resv		= XFS_AG_RESV_NONE,
+		};
+		struct xfs_perag	*pag;
+		xfs_agnumber_t		agno;
+
+		bulkload_validate_file_alloc_hint(bkl);
+
+		error = -libxfs_alloc_vextent_start_ag(&args, bkl->alloc_hint);
+		if (error)
+			return error;
+		if (args.fsbno == NULLFSBLOCK)
+			return ENOSPC;
+
+		agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+
+		pag = libxfs_perag_get(mp, agno);
+		if (!pag) {
+			ASSERT(0);
+			return EFSCORRUPTED;	/* positive, like ENOSPC above */
+		}
+
+		error = bulkload_add_blocks(bkl, pag, &args);
+		libxfs_perag_put(pag);
+		if (error)
+			return error;
+
+		nr_blocks -= args.len;
+		bkl->alloc_hint = args.fsbno + args.len;
+
+		error = -libxfs_defer_finish(&sc->tp);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/*
+ * Free the unused part of a space extent that was reserved for a new ondisk
+ * structure.  Returns the number of EFIs logged or a negative errno.
+ */
+static inline int
+bulkload_free_extent(
+	struct bulkload		*bkl,
+	struct bulkload_resv	*resv,
+	bool			btree_committed)
+{
+	struct repair_ctx	*sc = bkl->sc;
+	xfs_agblock_t		free_agbno = resv->agbno;
+	xfs_extlen_t		free_aglen = resv->len;
+	xfs_fsblock_t		fsbno;
+	int			error;
+
+	if (!btree_committed || resv->used == 0) {
+		/*
+		 * If we're not committing a new btree or we didn't use the
+		 * space reservation, free the entire space extent.
+		 */
+		goto free;
+	}
+
+	/*
+	 * We used space and committed the btree.  Remove the written blocks
+	 * from the reservation and possibly log a new EFI to free any unused
+	 * reservation space.
+	 */
+	free_agbno += resv->used;
+	free_aglen -= resv->used;
+
+	if (free_aglen == 0)
+		return 0;
+
+free:
+	/*
+	 * Use EFIs to free the reservations.  We don't need to use EFIs here
+	 * like the kernel, but we'll do it to keep the code matched.
+	 */
+	fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
+	error = libxfs_free_extent_later(sc->tp, fsbno, free_aglen,
+			&bkl->oinfo, XFS_AG_RESV_NONE, true);
+	if (error)
+		return error;	/* negative errno, distinct from an EFI count */
+
+	return 1;
+}
+
 /* Free all the accounting info and disk space we reserved for a new btree. */
-void
-bulkload_commit(
-	struct bulkload		*bkl)
+static int
+bulkload_free(
+	struct bulkload		*bkl,
+	bool			btree_committed)
 {
+	struct repair_ctx	*sc = bkl->sc;
 	struct bulkload_resv	*resv, *n;
+	unsigned int		freed = 0;
+	int			error = 0;
 
 	list_for_each_entry_safe(resv, n, &bkl->resv_list, list) {
+		int		ret;
+
+		ret = bulkload_free_extent(bkl, resv, btree_committed);
 		list_del(&resv->list);
+		libxfs_perag_put(resv->pag);
 		kfree(resv);
+
+		if (ret < 0) {
+			error = -ret;	/* positive errno, as for defer_finish */
+			goto junkit;
+		}
+
+		freed += ret;
+		if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
+			error = -libxfs_defer_finish(&sc->tp);
+			if (error)
+				goto junkit;
+			freed = 0;
+		}
 	}
+
+	if (freed)
+		error = -libxfs_defer_finish(&sc->tp);
+junkit:
+	/*
+	 * If we still have reservations attached to @bkl, cleanup must have
+	 * failed and the filesystem is about to go down.  Clean up the incore
+	 * reservations.
+	 */
+	list_for_each_entry_safe(resv, n, &bkl->resv_list, list) {
+		list_del(&resv->list);
+		libxfs_perag_put(resv->pag);
+		kfree(resv);
+	}
+
+	if (sc->ip) {
+		kmem_cache_free(xfs_ifork_cache, bkl->ifake.if_fork);
+		bkl->ifake.if_fork = NULL;
+	}
+
+	return error;
+}
+
+/*
+ * Free all the accounting info and unused disk space allocations after
+ * committing a new btree.  Returns the result of bulkload_free().
+ */
+int
+bulkload_commit(
+	struct bulkload		*bkl)
+{
+	return bulkload_free(bkl, true);
+}
+
+/*
+ * Free all the accounting info and all of the disk space we reserved for a
+ * new btree that we're not going to commit, e.g. after ENOSPC midway through
+ * allocation.  Any error from the cleanup itself is discarded.
+ */
+void
+bulkload_cancel(
+	struct bulkload		*bkl)
+{
+	bulkload_free(bkl, false);
 }
 
 /* Feed one of the reserved btree blocks to the bulk loader. */
diff --git a/repair/bulkload.h b/repair/bulkload.h
index f4790e3b3de6..a88aafaa678a 100644
--- a/repair/bulkload.h
+++ b/repair/bulkload.h
@@ -8,9 +8,17 @@
 
 extern int bload_leaf_slack;
 extern int bload_node_slack;
+/*
+ * This is the maximum number of deferred extent freeing item extents (EFIs)
+ * that we'll attach to a transaction without rolling the transaction to avoid
+ * overrunning a tr_itruncate reservation.
+ */
+#define XREP_MAX_ITRUNCATE_EFIS	(128)
 
 struct repair_ctx {
 	struct xfs_mount	*mp;
+	struct xfs_inode	*ip;
+	struct xfs_trans	*tp;
 };
 
 struct bulkload_resv {
@@ -36,7 +44,10 @@ struct bulkload {
 	struct list_head	resv_list;
 
 	/* Fake root for new btree. */
-	struct xbtree_afakeroot	afake;
+	union {
+		struct xbtree_afakeroot	afake;
+		struct xbtree_ifakeroot	ifake;
+	};
 
 	/* rmap owner of these blocks */
 	struct xfs_owner_info	oinfo;
@@ -44,6 +55,9 @@ struct bulkload {
 	/* The last reservation we allocated from. */
 	struct bulkload_resv	*last_resv;
 
+	/* Hint as to where we should allocate blocks. */
+	xfs_fsblock_t		alloc_hint;
+
 	/* Number of blocks reserved via resv_list. */
 	unsigned int		nr_reserved;
 };
@@ -52,12 +66,16 @@ struct bulkload {
 	list_for_each_entry_safe((resv), (n), &(bkl)->resv_list, list)
 
 void bulkload_init_ag(struct bulkload *bkl, struct repair_ctx *sc,
-		const struct xfs_owner_info *oinfo);
+		const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint);
+void bulkload_init_inode(struct bulkload *bkl, struct repair_ctx *sc,
+		int whichfork, const struct xfs_owner_info *oinfo);
 int bulkload_claim_block(struct xfs_btree_cur *cur, struct bulkload *bkl,
 		union xfs_btree_ptr *ptr);
 int bulkload_add_extent(struct bulkload *bkl, struct xfs_perag *pag,
 		xfs_agblock_t agbno, xfs_extlen_t len);
-void bulkload_commit(struct bulkload *bkl);
+int bulkload_alloc_file_blocks(struct bulkload *bkl, uint64_t nr_blocks);
+void bulkload_cancel(struct bulkload *bkl);
+int bulkload_commit(struct bulkload *bkl);
 void bulkload_estimate_ag_slack(struct repair_ctx *sc,
 		struct xfs_btree_bload *bload, unsigned int free);
 
diff --git a/repair/dinode.c b/repair/dinode.c
index a18af3ff7772..b8f5bf4e550e 100644
--- a/repair/dinode.c
+++ b/repair/dinode.c
@@ -20,6 +20,7 @@
 #include "threads.h"
 #include "slab.h"
 #include "rmap.h"
+#include "bmap_repair.h"
 
 /*
  * gettext lookups for translations of strings use mutexes internally to
@@ -1909,7 +1910,9 @@ process_inode_data_fork(
 	xfs_ino_t		lino = XFS_AGINO_TO_INO(mp, agno, ino);
 	int			err = 0;
 	xfs_extnum_t		nex, max_nex;
+	int			try_rebuild = -1; /* don't know yet */
 
+retry:
 	/*
 	 * extent count on disk is only valid for positive values. The kernel
 	 * uses negative values in memory. hence if we see negative numbers
@@ -1938,11 +1941,15 @@ process_inode_data_fork(
 		*totblocks = 0;
 		break;
 	case XFS_DINODE_FMT_EXTENTS:
+		if (!rmapbt_suspect && try_rebuild == -1)
+			try_rebuild = 1;
 		err = process_exinode(mp, agno, ino, dino, type, dirty,
 			totblocks, nextents, dblkmap, XFS_DATA_FORK,
 			check_dups);
 		break;
 	case XFS_DINODE_FMT_BTREE:
+		if (!rmapbt_suspect && try_rebuild == -1)
+			try_rebuild = 1;
 		err = process_btinode(mp, agno, ino, dino, type, dirty,
 			totblocks, nextents, dblkmap, XFS_DATA_FORK,
 			check_dups);
@@ -1958,8 +1965,28 @@ process_inode_data_fork(
 	if (err)  {
 		do_warn(_("bad data fork in inode %" PRIu64 "\n"), lino);
 		if (!no_modify)  {
+			if (try_rebuild == 1) {
+				do_warn(
+_("rebuilding inode %"PRIu64" data fork\n"),
+					lino);
+				try_rebuild = 0;
+				err = rebuild_bmap(mp, lino, XFS_DATA_FORK,
+						be32_to_cpu(dino->di_nextents),
+						ino_bpp, dinop, dirty);
+				dino = *dinop;
+				if (!err)
+					goto retry;
+				do_warn(
+_("inode %"PRIu64" data fork rebuild failed, error %d, clearing\n"),
+					lino, err);
+			}
 			clear_dinode(mp, dino, lino);
 			*dirty += 1;
+			ASSERT(*dirty > 0);
+		} else if (try_rebuild == 1) {
+			do_warn(
+_("would have tried to rebuild inode %"PRIu64" data fork\n"),
+					lino);
 		}
 		return 1;
 	}
@@ -2025,7 +2052,9 @@ process_inode_attr_fork(
 	struct blkmap		*ablkmap = NULL;
 	int			repair = 0;
 	int			err;
+	int			try_rebuild = -1; /* don't know yet */
 
+retry:
 	if (!dino->di_forkoff) {
 		*anextents = 0;
 		if (dino->di_aformat != XFS_DINODE_FMT_EXTENTS) {
@@ -2052,6 +2081,8 @@ process_inode_attr_fork(
 		err = process_lclinode(mp, agno, ino, dino, XFS_ATTR_FORK);
 		break;
 	case XFS_DINODE_FMT_EXTENTS:
+		if (!rmapbt_suspect && try_rebuild == -1)
+			try_rebuild = 1;
 		ablkmap = blkmap_alloc(*anextents, XFS_ATTR_FORK);
 		*anextents = 0;
 		err = process_exinode(mp, agno, ino, dino, type, dirty,
@@ -2059,6 +2090,8 @@ process_inode_attr_fork(
 				XFS_ATTR_FORK, check_dups);
 		break;
 	case XFS_DINODE_FMT_BTREE:
+		if (!rmapbt_suspect && try_rebuild == -1)
+			try_rebuild = 1;
 		ablkmap = blkmap_alloc(*anextents, XFS_ATTR_FORK);
 		*anextents = 0;
 		err = process_btinode(mp, agno, ino, dino, type, dirty,
@@ -2084,10 +2117,33 @@ process_inode_attr_fork(
 		do_warn(_("bad attribute fork in inode %" PRIu64 "\n"), lino);
 
 		if (!no_modify)  {
+			if (try_rebuild == 1) {
+				do_warn(
+_("rebuilding inode %"PRIu64" attr fork\n"),
+					lino);
+				try_rebuild = 0;
+				err = rebuild_bmap(mp, lino, XFS_ATTR_FORK,
+						be16_to_cpu(dino->di_anextents),
+						ino_bpp, dinop, dirty);
+				dino = *dinop;
+				if (!err)
+					goto retry;
+				do_warn(
+_("inode %"PRIu64" attr fork rebuild failed, error %d"),
+					lino, err);
+			}
 			do_warn(_(", clearing attr fork\n"));
 			*dirty += clear_dinode_attr(mp, dino, lino);
 			ASSERT(*dirty > 0);
-		} else  {
+		} else if (try_rebuild == 1) {
+			/*
+			 * try_rebuild stays -1 when the rmapbt is suspect or
+			 * the fork format is not extents or btree.
+			 */
+			do_warn(
+_("would have tried to rebuild inode %"PRIu64" attr fork or cleared it\n"),
+					lino);
+		} else {
 			do_warn(_(", would clear attr fork\n"));
 		}
 
diff --git a/repair/rmap.c b/repair/rmap.c
index 6bb77e082492..a2291c7b3b01 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -33,7 +33,7 @@ struct xfs_ag_rmap {
 };
 
 static struct xfs_ag_rmap *ag_rmaps;
-static bool rmapbt_suspect;
+bool rmapbt_suspect;
 static bool refcbt_suspect;
 
 static inline int rmap_compare(const void *a, const void *b)
diff --git a/repair/rmap.h b/repair/rmap.h
index 6004e9f68b63..1dad2f5890a4 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -7,6 +7,7 @@
 #define RMAP_H_
 
 extern bool collect_rmaps;
+extern bool rmapbt_suspect;
 
 extern bool rmap_needs_work(struct xfs_mount *);