Re: [PATCH] arm64: enable GENERIC_FIND_FIRST_BIT

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Yury Norov <yury.norov@xxxxxxxxx>
Date: Sat, 5 Dec 2020 08:54:06 -0800

Hi,

> ARM64 doesn't implement find_first_{zero}_bit in arch code and doesn't
> enable it in config. It leads to using find_next_bit() which is less
> efficient:
>
> 0000000000000000 <find_first_bit>:
>    0:	aa0003e4 	mov	x4, x0
>    4:	aa0103e0 	mov	x0, x1
>    8:	b4000181 	cbz	x1, 38 <find_first_bit+0x38>
>    c:	f9400083 	ldr	x3, [x4]
>   10:	d2800802 	mov	x2, #0x40                  	// #64
>   14:	91002084 	add	x4, x4, #0x8
>   18:	b40000c3 	cbz	x3, 30 <find_first_bit+0x30>
>   1c:	14000008 	b	3c <find_first_bit+0x3c>
>   20:	f8408483 	ldr	x3, [x4], #8
>   24:	91010045 	add	x5, x2, #0x40
>   28:	b50000c3 	cbnz	x3, 40 <find_first_bit+0x40>
>   2c:	aa0503e2 	mov	x2, x5
>   30:	eb02001f 	cmp	x0, x2
>   34:	54ffff68 	b.hi	20 <find_first_bit+0x20>  // b.pmore
>   38:	d65f03c0 	ret
>   3c:	d2800002 	mov	x2, #0x0                   	// #0
>   40:	dac00063 	rbit	x3, x3
>   44:	dac01063 	clz	x3, x3
>   48:	8b020062 	add	x2, x3, x2
>   4c:	eb02001f 	cmp	x0, x2
>   50:	9a829000 	csel	x0, x0, x2, ls  // ls = plast
>   54:	d65f03c0 	ret
>
>   ...
>
> 0000000000000118 <_find_next_bit.constprop.1>:
>  118:	eb02007f 	cmp	x3, x2
>  11c:	540002e2 	b.cs	178 <_find_next_bit.constprop.1+0x60>  // b.hs, b.nlast
>  120:	d346fc66 	lsr	x6, x3, #6
>  124:	f8667805 	ldr	x5, [x0, x6, lsl #3]
>  128:	b4000061 	cbz	x1, 134 <_find_next_bit.constprop.1+0x1c>
>  12c:	f8667826 	ldr	x6, [x1, x6, lsl #3]
>  130:	8a0600a5 	and	x5, x5, x6
>  134:	ca0400a6 	eor	x6, x5, x4
>  138:	92800005 	mov	x5, #0xffffffffffffffff    	// #-1
>  13c:	9ac320a5 	lsl	x5, x5, x3
>  140:	927ae463 	and	x3, x3, #0xffffffffffffffc0
>  144:	ea0600a5 	ands	x5, x5, x6
>  148:	54000120 	b.eq	16c <_find_next_bit.constprop.1+0x54>  // b.none
>  14c:	1400000e 	b	184 <_find_next_bit.constprop.1+0x6c>
>  150:	d346fc66 	lsr	x6, x3, #6
>  154:	f8667805 	ldr	x5, [x0, x6, lsl #3]
>  158:	b4000061 	cbz	x1, 164 <_find_next_bit.constprop.1+0x4c>
>  15c:	f8667826 	ldr	x6, [x1, x6, lsl #3]
>  160:	8a0600a5 	and	x5, x5, x6
>  164:	eb05009f 	cmp	x4, x5
>  168:	540000c1 	b.ne	180 <_find_next_bit.constprop.1+0x68>  // b.any
>  16c:	91010063 	add	x3, x3, #0x40
>  170:	eb03005f 	cmp	x2, x3
>  174:	54fffee8 	b.hi	150 <_find_next_bit.constprop.1+0x38>  // b.pmore
>  178:	aa0203e0 	mov	x0, x2
>  17c:	d65f03c0 	ret
>  180:	ca050085 	eor	x5, x4, x5
>  184:	dac000a5 	rbit	x5, x5
>  188:	dac010a5 	clz	x5, x5
>  18c:	8b0300a3 	add	x3, x5, x3
>  190:	eb03005f 	cmp	x2, x3
>  194:	9a839042 	csel	x2, x2, x3, ls  // ls = plast
>  198:	aa0203e0 	mov	x0, x2
>  19c:	d65f03c0 	ret
>
>  ...
>
> 0000000000000238 <find_next_bit>:
>  238:	a9bf7bfd 	stp	x29, x30, [sp, #-16]!
>  23c:	aa0203e3 	mov	x3, x2
>  240:	d2800004 	mov	x4, #0x0                   	// #0
>  244:	aa0103e2 	mov	x2, x1
>  248:	910003fd 	mov	x29, sp
>  24c:	d2800001 	mov	x1, #0x0                   	// #0
>  250:	97ffffb2 	bl	118 <_find_next_bit.constprop.1>
>  254:	a8c17bfd 	ldp	x29, x30, [sp], #16
>  258:	d65f03c0 	ret
>
> Enabling this functions would also benefit for_each_{set,clear}_bit().
> Would it make sense to enable this config for all such architectures by
> default?

I confirm that GENERIC_FIND_FIRST_BIT also produces more optimized and
fast code on MIPS (32 R2) where there is also no architecture-specific
bitsearching routines.
So, if it's okay for other folks, I'd suggest to go for it and enable
for all similar arches.

(otherwise, I'll publish a separate entry for mips-next after 5.12-rc1
 release and mention you in "Suggested-by:")

> Signed-off-by: Yury Norov <yury.norov@xxxxxxxxx>
>
> ---
>  arch/arm64/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 1515f6f153a0..2b90ef1f548e 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -106,6 +106,7 @@ config ARM64
>  	select GENERIC_CPU_AUTOPROBE
>  	select GENERIC_CPU_VULNERABILITIES
>  	select GENERIC_EARLY_IOREMAP
> +	select GENERIC_FIND_FIRST_BIT
>  	select GENERIC_IDLE_POLL_SETUP
>  	select GENERIC_IRQ_IPI
>  	select GENERIC_IRQ_MULTI_HANDLER
> --
> 2.25.1

Thanks,
Al





[Index of Archives]     [LKML Archive]     [Linux ARM Kernel]     [Linux ARM]     [Git]     [Yosemite News]     [Linux SCSI]     [Linux Hams]

  Powered by Linux