On 18 Sep 2021 at 07:00, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@xxxxxxxxxx> > > Instead of assuming that the hardcoded XFS_BTREE_MAXLEVELS value is big > enough to handle the maximally tall rmap btree when all blocks are in > use and maximally shared, let's compute the maximum height assuming the > rmapbt consumes as many blocks as possible. Maximum rmap btree height calculations look good to me. Reviewed-by: Chandan Babu R <chandan.babu@xxxxxxxxxx> > > Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> > --- > fs/xfs/libxfs/xfs_btree.c | 34 +++++++++++++++++++++++++++++++++ > fs/xfs/libxfs/xfs_btree.h | 2 ++ > fs/xfs/libxfs/xfs_rmap_btree.c | 40 ++++++++++++++++++++------------------- > fs/xfs/libxfs/xfs_rmap_btree.h | 2 +- > fs/xfs/libxfs/xfs_trans_resv.c | 12 ++++++++++++ > fs/xfs/libxfs/xfs_trans_space.h | 7 +++++++ > fs/xfs/xfs_mount.c | 2 +- > 7 files changed, 78 insertions(+), 21 deletions(-) > > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c > index 6cf49f7e1299..005bc42cf0bd 100644 > --- a/fs/xfs/libxfs/xfs_btree.c > +++ b/fs/xfs/libxfs/xfs_btree.c > @@ -4526,6 +4526,40 @@ xfs_btree_compute_maxlevels( > return level; > } > > +/* > + * Compute the maximum height of a btree that is allowed to consume up to the > + * given number of blocks. > + */ > +unsigned int > +xfs_btree_compute_maxlevels_size( > + unsigned long long max_btblocks, > + unsigned int leaf_mnr) > +{ > + unsigned long long leaf_blocks = leaf_mnr; > + unsigned long long blocks_left; > + unsigned int maxlevels; > + > + if (max_btblocks < 1) > + return 0; > + > + /* > + * The loop increments maxlevels as long as there would be enough > + * blocks left in the reservation to handle each node block at the > + * current level pointing to the minimum possible number of leaf blocks > + * at the next level down. We start the loop assuming a single-level > + * btree consuming one block. > + */ > + maxlevels = 1; > + blocks_left = max_btblocks - 1; > + while (leaf_blocks < blocks_left) { > + maxlevels++; > + blocks_left -= leaf_blocks; > + leaf_blocks *= leaf_mnr; > + } > + > + return maxlevels; > +} > + > /* > * Query a regular btree for all records overlapping a given interval. > * Start with a LE lookup of the key of low_rec and return all records > diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h > index 106760c540c7..d256d869f0af 100644 > --- a/fs/xfs/libxfs/xfs_btree.h > +++ b/fs/xfs/libxfs/xfs_btree.h > @@ -476,6 +476,8 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, > unsigned int max_recs); > > uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); > +unsigned int xfs_btree_compute_maxlevels_size(unsigned long long max_btblocks, > + unsigned int leaf_mnr); > unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); > > /* > diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c > index f3c4d0965cc9..85caeb14e4db 100644 > --- a/fs/xfs/libxfs/xfs_rmap_btree.c > +++ b/fs/xfs/libxfs/xfs_rmap_btree.c > @@ -535,30 +535,32 @@ xfs_rmapbt_maxrecs( > } > > /* Compute the maximum height of an rmap btree. */ > -void > +unsigned int > xfs_rmapbt_compute_maxlevels( > - struct xfs_mount *mp) > + struct xfs_mount *mp) > { > + if (!xfs_has_reflink(mp)) { > + /* > + * If there's no block sharing, compute the maximum rmapbt > + * height assuming one rmap record per AG block. > + */ > + return xfs_btree_compute_maxlevels(mp->m_rmap_mnr, > + mp->m_sb.sb_agblocks); > + } > + > /* > - * On a non-reflink filesystem, the maximum number of rmap > - * records is the number of blocks in the AG, hence the max > - * rmapbt height is log_$maxrecs($agblocks). However, with > - * reflink each AG block can have up to 2^32 (per the refcount > - * record format) owners, which means that theoretically we > - * could face up to 2^64 rmap records. > + * Compute the asymptotic maxlevels for an rmapbt on a reflink fs. > * > - * That effectively means that the max rmapbt height must be > - * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG > - * blocks to feed the rmapbt long before the rmapbt reaches > - * maximum height. The reflink code uses ag_resv_critical to > - * disallow reflinking when less than 10% of the per-AG metadata > - * block reservation since the fallback is a regular file copy. > + * On a reflink filesystem, each AG block can have up to 2^32 (per the > + * refcount record format) owners, which means that theoretically we > + * could face up to 2^64 rmap records. However, we're likely to run > + * out of blocks in the AG long before that happens, which means that > + * we must compute the max height based on what the btree will look > + * like if it consumes almost all the blocks in the AG due to maximal > + * sharing factor. > */ > - if (xfs_has_reflink(mp)) > - mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; > - else > - mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( > - mp->m_rmap_mnr, mp->m_sb.sb_agblocks); > + return xfs_btree_compute_maxlevels_size(mp->m_sb.sb_agblocks, > + mp->m_rmap_mnr[1]); > } > > /* Calculate the refcount btree size for some records. */ > diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h > index f2eee6572af4..5aaecf755abd 100644 > --- a/fs/xfs/libxfs/xfs_rmap_btree.h > +++ b/fs/xfs/libxfs/xfs_rmap_btree.h > @@ -49,7 +49,7 @@ struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp, > void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur, > struct xfs_trans *tp, struct xfs_buf *agbp); > int xfs_rmapbt_maxrecs(int blocklen, int leaf); > -extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); > +unsigned int xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); > > extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, > unsigned long long len); > diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c > index 5e300daa2559..679f10e08f31 100644 > --- a/fs/xfs/libxfs/xfs_trans_resv.c > +++ b/fs/xfs/libxfs/xfs_trans_resv.c > @@ -814,6 +814,15 @@ xfs_trans_resv_calc( > struct xfs_mount *mp, > struct xfs_trans_resv *resp) > { > + unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; > + > + /* > + * In the early days of rmap+reflink, we hardcoded the rmap maxlevels > + * to 9 even if the AG size was smaller. > + */ > + if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) > + mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; > + > /* > * The following transactions are logged in physical format and > * require a permanent reservation on space. > @@ -916,4 +925,7 @@ xfs_trans_resv_calc( > resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp); > resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); > resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); > + > + /* Put everything back the way it was. This goes at the end. */ > + mp->m_rmap_maxlevels = rmap_maxlevels; > } > diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h > index 50332be34388..440c9c390b86 100644 > --- a/fs/xfs/libxfs/xfs_trans_space.h > +++ b/fs/xfs/libxfs/xfs_trans_space.h > @@ -17,6 +17,13 @@ > /* Adding one rmap could split every level up to the top of the tree. */ > #define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels) > > +/* > + * Note that we historically set m_rmap_maxlevels to 9 when reflink was > + * enabled, so we must preserve this behavior to avoid changing the transaction > + * space reservations. > + */ > +#define XFS_OLD_REFLINK_RMAP_MAXLEVELS (9) > + > /* Blocks we might need to add "b" rmaps to a tree. */ > #define XFS_NRMAPADD_SPACE_RES(mp, b)\ > (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c > index 06dac09eddbd..e600a0b781c8 100644 > --- a/fs/xfs/xfs_mount.c > +++ b/fs/xfs/xfs_mount.c > @@ -635,7 +635,7 @@ xfs_mountfs( > xfs_bmap_compute_maxlevels(mp, XFS_DATA_FORK); > xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK); > xfs_mount_setup_inode_geom(mp); > - xfs_rmapbt_compute_maxlevels(mp); > + mp->m_rmap_maxlevels = xfs_rmapbt_compute_maxlevels(mp); > xfs_refcountbt_compute_maxlevels(mp); > > /* -- chandan