Re: [PATCH 1/3] xfs: avoid unnecessary runtime sibling pointer endian conversions

On Tue, May 24, 2022 at 12:21:56PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Commit dc04db2aa7c9 has caused a small aim7 regression, showing a
> small increase in CPU usage in __xfs_btree_check_sblock() as a
> result of the extra checking.
> 
> This is likely due to the endian conversion of the sibling pointers
> being unconditional instead of relying on the compiler to endian
> convert the NULL pointer constant at compile time, avoiding the
> runtime conversion for this common case.
> 
> Rework the checks so that endian conversion of the sibling pointers
> is only done if they are not null, as the original code did.
> 
> .... and these need to be "inline" because the compiler completely
> fails to inline them automatically like it should be doing.
> 
> $ size fs/xfs/libxfs/xfs_btree.o*
>    text	   data	    bss	    dec	    hex	filename
>   51874	    240	      0	  52114	   cb92 fs/xfs/libxfs/xfs_btree.o.orig
>   51562	    240	      0	  51802	   ca5a fs/xfs/libxfs/xfs_btree.o.inline
> 
> Just when you think the tools have advanced sufficiently that we
> don't have to care about stuff like this anymore, along comes a
> reminder that *our tools still suck*.
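
(For illustration only: a condensed view of the change made by the hunks
below, using the same names as the patch. The first form is what
dc04db2aa7c9 introduced, the second is what this patch switches to.)

	/* dc04db2aa7c9: the caller byte-swaps before every call, so even
	 * the common NULL sibling case pays for a runtime be64_to_cpu() */
	if (sibling == NULLFSBLOCK)
		return NULL;

	/* this patch: compare in disk byte order; cpu_to_be64(NULLFSBLOCK)
	 * is a compile-time constant, so only non-NULL siblings are ever
	 * byte swapped at runtime */
	if (dsibling == cpu_to_be64(NULLFSBLOCK))
		return NULL;
	sibling = be64_to_cpu(dsibling);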
> 
> Fixes: dc04db2aa7c9 ("xfs: detect self referencing btree sibling pointers")
> Reported-by: kernel test robot <oliver.sang@xxxxxxxxx>
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  fs/xfs/libxfs/xfs_btree.c | 47 +++++++++++++++++++++++++++------------
>  1 file changed, 33 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
> index 2aa300f7461f..786ec1cb1bba 100644
> --- a/fs/xfs/libxfs/xfs_btree.c
> +++ b/fs/xfs/libxfs/xfs_btree.c
> @@ -51,16 +51,31 @@ xfs_btree_magic(
>  	return magic;
>  }
>  
> -static xfs_failaddr_t
> +/*
> + * These sibling pointer checks are optimised for null sibling pointers. This
> + * happens a lot, and we don't need to byte swap at runtime if the sibling
> + * pointer is NULL.
> + *
> + * These are explicitly marked as inline because the cost of calling them as
> + * functions instead of inlining them is about 36 bytes extra code per call site
> + * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
> + * two sibling check functions reduces the compiled code size by over 300
> + * bytes.
> + */
> +static inline xfs_failaddr_t
>  xfs_btree_check_lblock_siblings(
>  	struct xfs_mount	*mp,
>  	struct xfs_btree_cur	*cur,
>  	int			level,
>  	xfs_fsblock_t		fsb,
> -	xfs_fsblock_t		sibling)
> +	__be64			dsibling)
>  {
> -	if (sibling == NULLFSBLOCK)
> +	xfs_fsblock_t		sibling;
> +
> +	if (dsibling == cpu_to_be64(NULLFSBLOCK))
>  		return NULL;
> +
> +	sibling = be64_to_cpu(dsibling);
>  	if (sibling == fsb)
>  		return __this_address;
>  	if (level >= 0) {
> @@ -74,17 +89,21 @@ xfs_btree_check_lblock_siblings(
>  	return NULL;
>  }
>  
> -static xfs_failaddr_t
> +static inline xfs_failaddr_t
>  xfs_btree_check_sblock_siblings(
>  	struct xfs_mount	*mp,
>  	struct xfs_btree_cur	*cur,
>  	int			level,
>  	xfs_agnumber_t		agno,
>  	xfs_agblock_t		agbno,
> -	xfs_agblock_t		sibling)
> +	__be32			dsibling)
>  {
> -	if (sibling == NULLAGBLOCK)
> +	xfs_agblock_t		sibling;
> +
> +	if (dsibling == cpu_to_be32(NULLAGBLOCK))
>  		return NULL;
> +
> +	sibling = be32_to_cpu(dsibling);
>  	if (sibling == agbno)
>  		return __this_address;
>  	if (level >= 0) {
> @@ -136,10 +155,10 @@ __xfs_btree_check_lblock(
>  		fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
>  
>  	fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
> -			be64_to_cpu(block->bb_u.l.bb_leftsib));
> +			block->bb_u.l.bb_leftsib);
>  	if (!fa)
>  		fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
> -				be64_to_cpu(block->bb_u.l.bb_rightsib));
> +				block->bb_u.l.bb_rightsib);
>  	return fa;
>  }
>  
> @@ -204,10 +223,10 @@ __xfs_btree_check_sblock(
>  	}
>  
>  	fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno, agbno,
> -			be32_to_cpu(block->bb_u.s.bb_leftsib));
> +			block->bb_u.s.bb_leftsib);
>  	if (!fa)
>  		fa = xfs_btree_check_sblock_siblings(mp, cur, level, agno,
> -				 agbno, be32_to_cpu(block->bb_u.s.bb_rightsib));
> +				 agbno, block->bb_u.s.bb_rightsib);
>  	return fa;
>  }
>  
> @@ -4523,10 +4542,10 @@ xfs_btree_lblock_verify(
>  	/* sibling pointer verification */
>  	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
>  	fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
> -			be64_to_cpu(block->bb_u.l.bb_leftsib));
> +			block->bb_u.l.bb_leftsib);
>  	if (!fa)
>  		fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
> -				be64_to_cpu(block->bb_u.l.bb_rightsib));
> +				block->bb_u.l.bb_rightsib);
>  	return fa;

The next thing I wanna do is make __xfs_btree_check_[sl]block actually
print out the failaddr_t returned to it.

I half wonder if it would be *even faster* to pass in a *pointer* to the
sibling fields and use be64_to_cpup, but the time savings would probably
be eaten up by re-grokking the generated asm, so
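
(A rough sketch of that pointer-based alternative, purely for
illustration -- untested and not part of this patch. The const __be64 *
parameter and the be64_to_cpup() call are the only differences from the
version in the patch; the rest of the body is elided.)

	static inline xfs_failaddr_t
	xfs_btree_check_lblock_siblings(
		struct xfs_mount	*mp,
		struct xfs_btree_cur	*cur,
		int			level,
		xfs_fsblock_t		fsb,
		const __be64		*dsibling)
	{
		xfs_fsblock_t		sibling;

		/* still compares against a compile-time constant */
		if (*dsibling == cpu_to_be64(NULLFSBLOCK))
			return NULL;

		sibling = be64_to_cpup(dsibling);
		if (sibling == fsb)
			return __this_address;
		/* ... remaining checks unchanged from the patch ... */
		return NULL;
	}

	/* call sites would then pass the field's address: */
	fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
			&block->bb_u.l.bb_leftsib);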

Reviewed-by: Darrick J. Wong <djwong@xxxxxxxxxx>

--D

>  }
>  
> @@ -4580,10 +4599,10 @@ xfs_btree_sblock_verify(
>  	agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp));
>  	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
>  	fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
> -			be32_to_cpu(block->bb_u.s.bb_leftsib));
> +			block->bb_u.s.bb_leftsib);
>  	if (!fa)
>  		fa = xfs_btree_check_sblock_siblings(mp, NULL, -1, agno, agbno,
> -				be32_to_cpu(block->bb_u.s.bb_rightsib));
> +				block->bb_u.s.bb_rightsib);
>  	return fa;
>  }
>  
> -- 
> 2.35.1
> 


