[RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



In production, we found that sometimes xfs_repair phase 5
rebuilds freespace node block with pointers less than minrecs
and if we trigger xfs_repair again it would report such
the following message:

bad btree nrecs (39, min=40, max=80) in btbno block 0/7882

The background is that xfs_repair starts to rebuild AGFL
after the freespace btree is settled in phase 5 so we may
need to leave necessary room in advance for each btree
leaves in order to avoid freespace btree split and then
result in AGFL rebuild fails. The old mathematics uses
ceil(num_extents / maxrecs) to decide the number of node
blocks. That would be fine without leaving extra space
since minrecs = maxrecs / 2 but if some slack was decreased
from maxrecs, the result would be larger than what is
expected and cause num_recs_pb less than minrecs, i.e:

num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78

so we'd get

num_blocks = ceil(79 / 78) = 2,
num_recs_pb = 79 / 2 = 39, which is less than
minrecs = 80 / 2 = 40

OTOH, btree bulk loading code behaves in a different way.
As in xfs_btree_bload_level_geometry it wrote

num_blocks = floor(num_extents / maxrecs)

which will never go below minrecs. And when it goes
above maxrecs, just increment num_blocks and recalculate
so we can get the reasonable results.

In the long term, btree bulk loader will replace the current
repair code as well as to resolve AGFL dependency issue.
But we may still want to look for a backportable solution
for stable versions. Hence, use the same logic to avoid the
freespace btree minrecs underflow for now.

Cc: "Darrick J. Wong" <darrick.wong@xxxxxxxxxx>
Cc: Dave Chinner <dchinner@xxxxxxxxxx>
Cc: Eric Sandeen <sandeen@xxxxxxxxxxx>
Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
Signed-off-by: Gao Xiang <hsiangkao@xxxxxxxxxx>
---
not heavy tested yet..

 repair/phase5.c | 101 +++++++++++++++++++++---------------------------
 1 file changed, 45 insertions(+), 56 deletions(-)

diff --git a/repair/phase5.c b/repair/phase5.c
index abae8a08..997804a5 100644
--- a/repair/phase5.c
+++ b/repair/phase5.c
@@ -348,11 +348,29 @@ finish_cursor(bt_status_t *curs)
  * failure at runtime. Hence leave a couple of records slack space in
  * each block to allow immediate modification of the tree without
  * requiring splits to be done.
- *
- * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
  */
-#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
-	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
+static void
+compute_level_geometry(xfs_mount_t *mp, bt_stat_level_t *lptr,
+		       uint64_t nr_this_level, bool leaf)
+{
+	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
+	int			slack = leaf ? 2 : 0;
+	unsigned int		desired_npb;
+
+	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
+	lptr->num_recs_tot = nr_this_level;
+	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
+
+	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+	lptr->modulo = nr_this_level % lptr->num_blocks;
+	if (lptr->num_recs_pb > maxrecs || (lptr->num_recs_pb == maxrecs &&
+			lptr->modulo)) {
+		lptr->num_blocks++;
+
+		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+		lptr->modulo = nr_this_level % lptr->num_blocks;
+	}
+}
 
 /*
  * this calculates a freespace cursor for an ag.
@@ -370,6 +388,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	int			i;
 	int			extents_used;
 	int			extra_blocks;
+	uint64_t		old_blocks;
 	bt_stat_level_t		*lptr;
 	bt_stat_level_t		*p_lptr;
 	extent_tree_node_t	*ext_ptr;
@@ -388,10 +407,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * of the tree and set up the cursor for the leaf level
 	 * (note that the same code is duplicated further down)
 	 */
-	lptr->num_blocks = howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-	lptr->num_recs_pb = num_extents / lptr->num_blocks;
-	lptr->modulo = num_extents % lptr->num_blocks;
-	lptr->num_recs_tot = num_extents;
+	compute_level_geometry(mp, lptr, num_extents, true);
 	level = 1;
 
 #ifdef XR_BLD_FREE_TRACE
@@ -406,18 +422,12 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * per level is the # of blocks in the level below it
 	 */
 	if (lptr->num_blocks > 1)  {
-		for (; btree_curs->level[level - 1].num_blocks > 1
-				&& level < XFS_BTREE_MAXLEVELS;
-				level++)  {
+		do {
+			p_lptr = lptr;
 			lptr = &btree_curs->level[level];
-			p_lptr = &btree_curs->level[level - 1];
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-			lptr->modulo = p_lptr->num_blocks
-					% lptr->num_blocks;
-			lptr->num_recs_pb = p_lptr->num_blocks
-					/ lptr->num_blocks;
-			lptr->num_recs_tot = p_lptr->num_blocks;
+
+			compute_level_geometry(mp, lptr,
+					p_lptr->num_blocks, false);
 #ifdef XR_BLD_FREE_TRACE
 			fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
 					level,
@@ -426,7 +436,9 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 					lptr->modulo,
 					lptr->num_recs_tot);
 #endif
-		}
+			level++;
+		} while (lptr->num_blocks > 1);
+		ASSERT (level < XFS_BTREE_MAXLEVELS);
 	}
 
 	ASSERT(lptr->num_blocks == 1);
@@ -496,8 +508,11 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * see if the number of leaf blocks will change as a result
 	 * of the number of extents changing
 	 */
-	if (howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0))
-			!= btree_curs->level[0].num_blocks)  {
+	old_blocks = btree_curs->level[0].num_blocks;
+	compute_level_geometry(mp, &btree_curs->level[0], num_extents, true);
+	extra_blocks = 0;
+
+	if (old_blocks != btree_curs->level[0].num_blocks)  {
 		/*
 		 * yes -- recalculate the cursor.  If the number of
 		 * excess (overallocated) blocks is < xfs_agfl_size/2, we're ok.
@@ -553,30 +568,20 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 		}
 
 		lptr = &btree_curs->level[0];
-		lptr->num_blocks = howmany(num_extents,
-					XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-		lptr->num_recs_pb = num_extents / lptr->num_blocks;
-		lptr->modulo = num_extents % lptr->num_blocks;
-		lptr->num_recs_tot = num_extents;
 		level = 1;
 
 		/*
 		 * if we need more levels, set them up
 		 */
 		if (lptr->num_blocks > 1)  {
-			for (level = 1; btree_curs->level[level-1].num_blocks
-					> 1 && level < XFS_BTREE_MAXLEVELS;
-					level++)  {
-				lptr = &btree_curs->level[level];
-				p_lptr = &btree_curs->level[level-1];
-				lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-				lptr->modulo = p_lptr->num_blocks
-						% lptr->num_blocks;
-				lptr->num_recs_pb = p_lptr->num_blocks
-						/ lptr->num_blocks;
-				lptr->num_recs_tot = p_lptr->num_blocks;
-			}
+			do {
+				p_lptr = lptr;
+				lptr = &btree_curs->level[level++];
+
+				compute_level_geometry(mp, lptr,
+						p_lptr->num_blocks, false);
+			} while (lptr->num_blocks > 1);
+			ASSERT (level < XFS_BTREE_MAXLEVELS);
 		}
 		ASSERT(lptr->num_blocks == 1);
 		btree_curs->num_levels = level;
@@ -591,22 +596,6 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 
 		ASSERT(blocks_allocated_total >= blocks_needed);
 		extra_blocks = blocks_allocated_total - blocks_needed;
-	} else  {
-		if (extents_used > 0) {
-			/*
-			 * reset the leaf level geometry to account
-			 * for consumed extents.  we can leave the
-			 * rest of the cursor alone since the number
-			 * of leaf blocks hasn't changed.
-			 */
-			lptr = &btree_curs->level[0];
-
-			lptr->num_recs_pb = num_extents / lptr->num_blocks;
-			lptr->modulo = num_extents % lptr->num_blocks;
-			lptr->num_recs_tot = num_extents;
-		}
-
-		extra_blocks = 0;
 	}
 
 	btree_curs->num_tot_blocks = blocks_allocated_pt;
-- 
2.18.1




[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux