On Tue, May 24, 2022 at 12:21:56PM +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > Commit dc04db2aa7c9 has caused a small aim7 regression, showing a > small increase in CPU usage in __xfs_btree_check_sblock() as a > result of the extra checking. > > This is likely due to the endian conversion of the sibling poitners > being unconditional instead of relying on the compiler to endian > convert the NULL pointer at compile time and avoiding the runtime > conversion for this common case. > > Rework the checks so that endian conversion of the sibling pointers > is only done if they are not null as the original code did. > > .... and these need to be "inline" because the compiler completely > fails to inline them automatically like it should be doing. > > $ size fs/xfs/libxfs/xfs_btree.o* > text data bss dec hex filename > 51874 240 0 52114 cb92 fs/xfs/libxfs/xfs_btree.o.orig > 51562 240 0 51802 ca5a fs/xfs/libxfs/xfs_btree.o.inline > > Just when you think the tools have advanced sufficiently we don't > have to care about stuff like this anymore, along comes a reminder > that *our tools still suck*. > > Fixes: dc04db2aa7c9 ("xfs: detect self referencing btree sibling pointers") > Reported-by: kernel test robot <oliver.sang@xxxxxxxxx> > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/libxfs/xfs_btree.c | 47 +++++++++++++++++++++++++++------------ > 1 file changed, 33 insertions(+), 14 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c > index 2aa300f7461f..786ec1cb1bba 100644 > --- a/fs/xfs/libxfs/xfs_btree.c > +++ b/fs/xfs/libxfs/xfs_btree.c > @@ -51,16 +51,31 @@ xfs_btree_magic( > return magic; > } > > -static xfs_failaddr_t > +/* > + * These sibling pointer checks are optimised for null sibling pointers. This > + * happens a lot, and we don't need to byte swap at runtime if the sibling > + * pointer is NULL. > + * > + * These are explicitly marked at inline because the cost of calling them as > + * functions instead of inlining them is about 36 bytes extra code per call site > + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these > + * two sibling check functions reduces the compiled code size by over 300 > + * bytes. > + */ > +static inline xfs_failaddr_t > xfs_btree_check_lblock_siblings( > struct xfs_mount *mp, > struct xfs_btree_cur *cur, > int level, > xfs_fsblock_t fsb, > - xfs_fsblock_t sibling) > + __be64 dsibling) > { > - if (sibling == NULLFSBLOCK) > + xfs_fsblock_t sibling; > + > + if (dsibling == cpu_to_be64(NULLFSBLOCK)) > return NULL; > + > + sibling = be64_to_cpu(dsibling); > if (sibling == fsb) > return __this_address; > if (level >= 0) { > @@ -74,17 +89,21 @@ xfs_btree_check_lblock_siblings( > return NULL; > } > > -static xfs_failaddr_t > +static inline xfs_failaddr_t > xfs_btree_check_sblock_siblings( > struct xfs_mount *mp, > struct xfs_btree_cur *cur, > int level, > xfs_agnumber_t agno, > xfs_agblock_t agbno, > - xfs_agblock_t sibling) > + __be32 dsibling) > { > - if (sibling == NULLAGBLOCK) > + xfs_agblock_t sibling; > + > + if (dsibling == cpu_to_be32(NULLAGBLOCK)) > return NULL; > + > + sibling = be32_to_cpu(dsibling); > if (sibling == agbno) > return __this_address; > if (level >= 0) { > @@ -136,10 +155,10 @@ __xfs_btree_check_lblock( > fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); > > fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, > - be64_to_cpu(block->bb_u.l.bb_leftsib)); > + block->bb_u.l.bb_leftsib); > if (!fa) > fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb, > - be64_to_cpu(block->bb_u.l.bb_rightsib)); > + block->bb_u.l.bb_rightsib); > return fa; > } > > @@ -204,10 +223,10 @@ __xfs_btree_check_sblock( > } > > fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno, > - be32_to_cpu(block->bb_u.s.bb_leftsib)); > + block->bb_u.s.bb_leftsib); > if (!fa) > fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, > - agbno, be32_to_cpu(block->bb_u.s.bb_rightsib)); > + agbno, block->bb_u.s.bb_rightsib); > return fa; > } > > @@ -4523,10 +4542,10 @@ xfs_btree_lblock_verify( > /* sibling pointer verification */ > fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); > fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, > - be64_to_cpu(block->bb_u.l.bb_leftsib)); > + block->bb_u.l.bb_leftsib); > if (!fa) > fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb, > - be64_to_cpu(block->bb_u.l.bb_rightsib)); > + block->bb_u.l.bb_rightsib); > return fa; The next thing I wanna do is make __xfs_btree_check_[sl]block actually print out the failaddr_t returned to it. I half wonder if it would be *even faster* to pass in a *pointer* to the sibling fields and use be64_to_cpup, but the time savings will probably be eaten up on regrokking asm code, so Reviewed-by: Darrick J. Wong <djwong@xxxxxxxxxx> --D > } > > @@ -4580,10 +4599,10 @@ xfs_btree_sblock_verify( > agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); > agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp)); > fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, > - be32_to_cpu(block->bb_u.s.bb_leftsib)); > + block->bb_u.s.bb_leftsib); > if (!fa) > fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno, > - be32_to_cpu(block->bb_u.s.bb_rightsib)); > + block->bb_u.s.bb_rightsib); > return fa; > } > > -- > 2.35.1 >