[PATCH] xfs_repair: estimate per-AG btree slack better

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Darrick J. Wong <djwong@xxxxxxxxxx>

The slack calculation for per-AG btrees is a bit inaccurate because it
only disables slack space in the new btrees when the amount of free
space in the AG (not counting the btrees) is less than 3/32nds of the
AG.  In other words, it assumes that the btrees will fit in less than 9
percent of the space.

However, there's one scenario where this goes wrong -- if the rmapbt
consumes a significant portion of the AG space.  Say a filesystem is
hosting a VM image farm that starts with perfectly shared images.  As
time goes by, random writes to those images will slowly cause the rmapbt
to increase in size as blocks within those images get COWed.

Suppose that the rmapbt now consumes 20% of the space in the AG, that
the AG is nearly full, and that the blocks in the old rmapbt are mostly
full.  At the start of phase5_func, mk_incore_fstree will report that
num_freeblocks is ~20% of the AG size, because the blocks held by the old
btrees are counted as free.  Hence the slack calculation will
conclude that there's plenty of space in the AG and new btrees will be
built with 25% slack in the blocks.  If the size of these new expanded
btrees is larger than the free space in the AG, repair will be unable to
allocate btree blocks and will fail, causing severe filesystem damage.

To combat this, estimate the worst case size of the AG btrees given the
number of records we intend to put in them, subtract that worst case
figure from num_freeblocks, and feed that to bulkload_estimate_ag_slack.
This results in tighter packing of new btree blocks when space is dear,
and hopefully fewer problems.  This /can/ be reproduced with generic/333
if you hack it to keep COWing blocks until the filesystem is totally
out of space, even if reflink has long since refused to share more
blocks.

Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
---
 libxfs/libxfs_api_defs.h |    4 ++
 repair/agbtree.c         |   90 ++++++++++++++++++++++++++++++++++++++++------
 repair/agbtree.h         |    3 ++
 repair/phase5.c          |   18 +++++++--
 repair/rmap.c            |   44 ++++++++++++++++++++++
 repair/rmap.h            |    3 ++
 6 files changed, 146 insertions(+), 16 deletions(-)

diff --git a/libxfs/libxfs_api_defs.h b/libxfs/libxfs_api_defs.h
index f8efcce7..d973e300 100644
--- a/libxfs/libxfs_api_defs.h
+++ b/libxfs/libxfs_api_defs.h
@@ -25,6 +25,7 @@
 #define xfs_ag_resv_free		libxfs_ag_resv_free
 
 #define xfs_alloc_ag_max_usable		libxfs_alloc_ag_max_usable
+#define xfs_allocbt_calc_size		libxfs_allocbt_calc_size
 #define xfs_allocbt_maxlevels_ondisk	libxfs_allocbt_maxlevels_ondisk
 #define xfs_allocbt_maxrecs		libxfs_allocbt_maxrecs
 #define xfs_allocbt_stage_cursor	libxfs_allocbt_stage_cursor
@@ -115,6 +116,7 @@
 #define xfs_highbit32			libxfs_highbit32
 #define xfs_highbit64			libxfs_highbit64
 #define xfs_ialloc_calc_rootino		libxfs_ialloc_calc_rootino
+#define xfs_iallocbt_calc_size		libxfs_iallocbt_calc_size
 #define xfs_iallocbt_maxlevels_ondisk	libxfs_iallocbt_maxlevels_ondisk
 #define xfs_ialloc_read_agi		libxfs_ialloc_read_agi
 #define xfs_idata_realloc		libxfs_idata_realloc
@@ -146,6 +148,7 @@
 #define xfs_read_agf			libxfs_read_agf
 #define xfs_refc_block			libxfs_refc_block
 #define xfs_refcountbt_calc_reserves	libxfs_refcountbt_calc_reserves
+#define xfs_refcountbt_calc_size	libxfs_refcountbt_calc_size
 #define xfs_refcountbt_init_cursor	libxfs_refcountbt_init_cursor
 #define xfs_refcountbt_maxlevels_ondisk	libxfs_refcountbt_maxlevels_ondisk
 #define xfs_refcountbt_maxrecs		libxfs_refcountbt_maxrecs
@@ -155,6 +158,7 @@
 
 #define xfs_rmap_alloc			libxfs_rmap_alloc
 #define xfs_rmapbt_calc_reserves	libxfs_rmapbt_calc_reserves
+#define xfs_rmapbt_calc_size		libxfs_rmapbt_calc_size
 #define xfs_rmapbt_init_cursor		libxfs_rmapbt_init_cursor
 #define xfs_rmapbt_maxlevels_ondisk	libxfs_rmapbt_maxlevels_ondisk
 #define xfs_rmapbt_maxrecs		libxfs_rmapbt_maxrecs
diff --git a/repair/agbtree.c b/repair/agbtree.c
index ef001803..d5e441a3 100644
--- a/repair/agbtree.c
+++ b/repair/agbtree.c
@@ -17,13 +17,13 @@ static void
 init_rebuild(
 	struct repair_ctx		*sc,
 	const struct xfs_owner_info	*oinfo,
-	xfs_agblock_t			free_space,
+	xfs_agblock_t			est_agfreeblocks,
 	struct bt_rebuild		*btr)
 {
 	memset(btr, 0, sizeof(struct bt_rebuild));
 
 	bulkload_init_ag(&btr->newbt, sc, oinfo);
-	bulkload_estimate_ag_slack(sc, &btr->bload, free_space);
+	bulkload_estimate_ag_slack(sc, &btr->bload, est_agfreeblocks);
 }
 
 /*
@@ -227,7 +227,7 @@ void
 init_freespace_cursors(
 	struct repair_ctx	*sc,
 	struct xfs_perag	*pag,
-	unsigned int		free_space,
+	unsigned int		est_agfreeblocks,
 	unsigned int		*nr_extents,
 	int			*extra_blocks,
 	struct bt_rebuild	*btr_bno,
@@ -239,8 +239,8 @@ init_freespace_cursors(
 
 	agfl_goal = libxfs_alloc_min_freelist(sc->mp, NULL);
 
-	init_rebuild(sc, &XFS_RMAP_OINFO_AG, free_space, btr_bno);
-	init_rebuild(sc, &XFS_RMAP_OINFO_AG, free_space, btr_cnt);
+	init_rebuild(sc, &XFS_RMAP_OINFO_AG, est_agfreeblocks, btr_bno);
+	init_rebuild(sc, &XFS_RMAP_OINFO_AG, est_agfreeblocks, btr_cnt);
 
 	btr_bno->cur = libxfs_allocbt_stage_cursor(sc->mp,
 			&btr_bno->newbt.afake, pag, XFS_BTNUM_BNO);
@@ -439,7 +439,7 @@ void
 init_ino_cursors(
 	struct repair_ctx	*sc,
 	struct xfs_perag	*pag,
-	unsigned int		free_space,
+	unsigned int		est_agfreeblocks,
 	uint64_t		*num_inos,
 	uint64_t		*num_free_inos,
 	struct bt_rebuild	*btr_ino,
@@ -453,7 +453,7 @@ init_ino_cursors(
 	int			error;
 
 	finobt = xfs_has_finobt(sc->mp);
-	init_rebuild(sc, &XFS_RMAP_OINFO_INOBT, free_space, btr_ino);
+	init_rebuild(sc, &XFS_RMAP_OINFO_INOBT, est_agfreeblocks, btr_ino);
 
 	/* Compute inode statistics. */
 	*num_free_inos = 0;
@@ -506,7 +506,7 @@ _("Unable to compute inode btree geometry, error %d.\n"), error);
 	if (!finobt)
 		return;
 
-	init_rebuild(sc, &XFS_RMAP_OINFO_INOBT, free_space, btr_fino);
+	init_rebuild(sc, &XFS_RMAP_OINFO_INOBT, est_agfreeblocks, btr_fino);
 	btr_fino->cur = libxfs_inobt_stage_cursor(pag, &btr_fino->newbt.afake,
 			XFS_BTNUM_FINO);
 
@@ -577,7 +577,7 @@ void
 init_rmapbt_cursor(
 	struct repair_ctx	*sc,
 	struct xfs_perag	*pag,
-	unsigned int		free_space,
+	unsigned int		est_agfreeblocks,
 	struct bt_rebuild	*btr)
 {
 	xfs_agnumber_t		agno = pag->pag_agno;
@@ -586,7 +586,7 @@ init_rmapbt_cursor(
 	if (!xfs_has_rmapbt(sc->mp))
 		return;
 
-	init_rebuild(sc, &XFS_RMAP_OINFO_AG, free_space, btr);
+	init_rebuild(sc, &XFS_RMAP_OINFO_AG, est_agfreeblocks, btr);
 	btr->cur = libxfs_rmapbt_stage_cursor(sc->mp, &btr->newbt.afake, pag);
 
 	btr->bload.get_record = get_rmapbt_record;
@@ -648,7 +648,7 @@ void
 init_refc_cursor(
 	struct repair_ctx	*sc,
 	struct xfs_perag	*pag,
-	unsigned int		free_space,
+	unsigned int		est_agfreeblocks,
 	struct bt_rebuild	*btr)
 {
 	xfs_agnumber_t		agno = pag->pag_agno;
@@ -657,7 +657,7 @@ init_refc_cursor(
 	if (!xfs_has_reflink(sc->mp))
 		return;
 
-	init_rebuild(sc, &XFS_RMAP_OINFO_REFC, free_space, btr);
+	init_rebuild(sc, &XFS_RMAP_OINFO_REFC, est_agfreeblocks, btr);
 	btr->cur = libxfs_refcountbt_stage_cursor(sc->mp, &btr->newbt.afake,
 			pag);
 
@@ -698,3 +698,69 @@ _("Error %d while creating refcount btree for AG %u.\n"), error, agno);
 	libxfs_btree_del_cursor(btr->cur, 0);
 	free_slab_cursor(&btr->slab_cursor);
 }
+
+static xfs_extlen_t
+estimate_allocbt_blocks(
+	struct xfs_perag	*pag,
+	unsigned int		nr_extents)
+{
+	return libxfs_allocbt_calc_size(pag->pag_mount, nr_extents) * 2;
+}
+
+static xfs_extlen_t
+estimate_inobt_blocks(
+	struct xfs_perag	*pag)
+{
+	struct ino_tree_node	*ino_rec;
+	xfs_agnumber_t		agno = pag->pag_agno;
+	unsigned int		ino_recs = 0;
+	unsigned int		fino_recs = 0;
+	xfs_extlen_t		ret;
+
+	for (ino_rec = findfirst_inode_rec(agno);
+	     ino_rec != NULL;
+	     ino_rec = next_ino_rec(ino_rec))  {
+		unsigned int	rec_nfinos = 0;
+		int		i;
+
+		for (i = 0; i < XFS_INODES_PER_CHUNK; i++)  {
+			ASSERT(is_inode_confirmed(ino_rec, i));
+			/*
+			 * sparse inodes are not factored into superblock (free)
+			 * inode counts
+			 */
+			if (is_inode_sparse(ino_rec, i))
+				continue;
+			if (is_inode_free(ino_rec, i))
+				rec_nfinos++;
+		}
+
+		ino_recs++;
+
+		/* finobt only considers records with free inodes */
+		if (rec_nfinos)
+			fino_recs++;
+	}
+
+	ret = libxfs_iallocbt_calc_size(pag->pag_mount, ino_recs);
+	if (xfs_has_finobt(pag->pag_mount))
+		ret += libxfs_iallocbt_calc_size(pag->pag_mount, fino_recs);
+	return ret;
+
+}
+
+/* Estimate the size of the per-AG btrees. */
+xfs_extlen_t
+estimate_agbtree_blocks(
+	struct xfs_perag	*pag,
+	unsigned int		free_extents)
+{
+	unsigned int		ret = 0;
+
+	ret += estimate_allocbt_blocks(pag, free_extents);
+	ret += estimate_inobt_blocks(pag);
+	ret += estimate_rmapbt_blocks(pag);
+	ret += estimate_refcountbt_blocks(pag);
+
+	return ret;
+}
diff --git a/repair/agbtree.h b/repair/agbtree.h
index 84f7083d..714d8e68 100644
--- a/repair/agbtree.h
+++ b/repair/agbtree.h
@@ -59,4 +59,7 @@ void init_refc_cursor(struct repair_ctx *sc, struct xfs_perag *pag,
 void build_refcount_tree(struct repair_ctx *sc, xfs_agnumber_t agno,
 		struct bt_rebuild *btr);
 
+xfs_extlen_t estimate_agbtree_blocks(struct xfs_perag *pag,
+		unsigned int free_extents);
+
 #endif /* __XFS_REPAIR_AG_BTREE_H__ */
diff --git a/repair/phase5.c b/repair/phase5.c
index b04912d8..0d14c354 100644
--- a/repair/phase5.c
+++ b/repair/phase5.c
@@ -447,6 +447,8 @@ phase5_func(
 	int			extra_blocks = 0;
 	uint			num_freeblocks;
 	xfs_agblock_t		num_extents;
+	unsigned int		est_agfreeblocks = 0;
+	unsigned int		total_btblocks;
 
 	if (verbose)
 		do_log(_("        - agno = %d\n"), agno);
@@ -474,12 +476,20 @@ _("unable to rebuild AG %u.  Not enough free space in on-disk AG.\n"),
 			agno);
 	}
 
-	init_ino_cursors(&sc, pag, num_freeblocks, &sb_icount_ag[agno],
+	/*
+	 * Estimate the number of free blocks in this AG after rebuilding
+	 * all btrees.
+	 */
+	total_btblocks = estimate_agbtree_blocks(pag, num_extents);
+	if (num_freeblocks > total_btblocks)
+		est_agfreeblocks = num_freeblocks - total_btblocks;
+
+	init_ino_cursors(&sc, pag, est_agfreeblocks, &sb_icount_ag[agno],
 			&sb_ifree_ag[agno], &btr_ino, &btr_fino);
 
-	init_rmapbt_cursor(&sc, pag, num_freeblocks, &btr_rmap);
+	init_rmapbt_cursor(&sc, pag, est_agfreeblocks, &btr_rmap);
 
-	init_refc_cursor(&sc, pag, num_freeblocks, &btr_refc);
+	init_refc_cursor(&sc, pag, est_agfreeblocks, &btr_refc);
 
 	num_extents = count_bno_extents_blocks(agno, &num_freeblocks);
 	/*
@@ -507,7 +517,7 @@ _("unable to rebuild AG %u.  Not enough free space in on-disk AG.\n"),
 	/*
 	 * track blocks that we might really lose
 	 */
-	init_freespace_cursors(&sc, pag, num_freeblocks, &num_extents,
+	init_freespace_cursors(&sc, pag, est_agfreeblocks, &num_extents,
 			&extra_blocks, &btr_bno, &btr_cnt);
 
 	/*
diff --git a/repair/rmap.c b/repair/rmap.c
index 9013daa2..6bb77e08 100644
--- a/repair/rmap.c
+++ b/repair/rmap.c
@@ -1531,3 +1531,47 @@ rmap_store_agflcount(
 
 	ag_rmaps[agno].ar_flcount = count;
 }
+
+/* Estimate the size of the ondisk rmapbt from the incore data. */
+xfs_extlen_t
+estimate_rmapbt_blocks(
+	struct xfs_perag	*pag)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_ag_rmap	*x;
+	unsigned long long	nr_recs = 0;
+
+	if (!rmap_needs_work(mp) || !xfs_has_rmapbt(mp))
+		return 0;
+
+	/*
+	 * Overestimate the amount of space needed by pretending that every
+	 * record in the incore slab will become rmapbt records.
+	 */
+	x = &ag_rmaps[pag->pag_agno];
+	if (x->ar_rmaps)
+		nr_recs += slab_count(x->ar_rmaps);
+	if (x->ar_raw_rmaps)
+		nr_recs += slab_count(x->ar_raw_rmaps);
+
+	return libxfs_rmapbt_calc_size(mp, nr_recs);
+}
+
+/* Estimate the size of the ondisk refcountbt from the incore data. */
+xfs_extlen_t
+estimate_refcountbt_blocks(
+	struct xfs_perag	*pag)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_ag_rmap	*x;
+
+	if (!rmap_needs_work(mp) || !xfs_has_reflink(mp))
+		return 0;
+
+	x = &ag_rmaps[pag->pag_agno];
+	if (!x->ar_refcount_items)
+		return 0;
+
+	return libxfs_refcountbt_calc_size(mp,
+			slab_count(x->ar_refcount_items));
+}
diff --git a/repair/rmap.h b/repair/rmap.h
index 8d176cb3..6004e9f6 100644
--- a/repair/rmap.h
+++ b/repair/rmap.h
@@ -48,4 +48,7 @@ extern int fix_inode_reflink_flags(struct xfs_mount *, xfs_agnumber_t);
 extern void fix_freelist(struct xfs_mount *, xfs_agnumber_t, bool);
 extern void rmap_store_agflcount(struct xfs_mount *, xfs_agnumber_t, int);
 
+xfs_extlen_t estimate_rmapbt_blocks(struct xfs_perag *pag);
+xfs_extlen_t estimate_refcountbt_blocks(struct xfs_perag *pag);
+
 #endif /* RMAP_H_ */



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux