> -----Original Message----- > From: Charlie Jenkins <charlie@xxxxxxxxxxxx> > Sent: Friday, October 27, 2023 2:25 PM > To: Wang, Xiao W <xiao.w.wang@xxxxxxxxx> > Cc: paul.walmsley@xxxxxxxxxx; palmer@xxxxxxxxxxx; > aou@xxxxxxxxxxxxxxxxx; ardb@xxxxxxxxxx; anup@xxxxxxxxxxxxxx; Li, Haicheng > <haicheng.li@xxxxxxxxx>; ajones@xxxxxxxxxxxxxxxx; Liu, Yujie > <yujie.liu@xxxxxxxxx>; linux-riscv@xxxxxxxxxxxxxxxxxxx; linux- > efi@xxxxxxxxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx > Subject: Re: [PATCH v3 2/2] riscv: Optimize bitops with Zbb extension > > On Tue, Sep 26, 2023 at 05:46:55PM +0800, Xiao Wang wrote: > > This patch leverages the alternative mechanism to dynamically optimize > > bitops (including __ffs, __fls, ffs, fls) with Zbb instructions. When > > Zbb ext is not supported by the runtime CPU, legacy implementation is > > used. If Zbb is supported, then the optimized variants will be selected > > via alternative patching. > > > > The legacy bitops support is taken from the generic C implementation as > > fallback. > > > > If the parameter is a build-time constant, we leverage compiler builtin to > > calculate the result directly, this approach is inspired by x86 bitops > > implementation. > > > > EFI stub runs before the kernel, so alternative mechanism should not be > > used there, this patch introduces a macro NO_ALTERNATIVE for this > purpose. > > > > Signed-off-by: Xiao Wang <xiao.w.wang@xxxxxxxxx> > > --- > > arch/riscv/include/asm/bitops.h | 266 +++++++++++++++++++++++++- > > drivers/firmware/efi/libstub/Makefile | 2 +- > > 2 files changed, 264 insertions(+), 4 deletions(-) > > > > diff --git a/arch/riscv/include/asm/bitops.h > b/arch/riscv/include/asm/bitops.h > > index 3540b690944b..c97e774cb647 100644 > > --- a/arch/riscv/include/asm/bitops.h > > +++ b/arch/riscv/include/asm/bitops.h > > @@ -15,13 +15,273 @@ > > #include <asm/barrier.h> > > #include <asm/bitsperlong.h> > > > > +#if !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE) > > #include <asm-generic/bitops/__ffs.h> > > -#include <asm-generic/bitops/ffz.h> > > -#include <asm-generic/bitops/fls.h> > > #include <asm-generic/bitops/__fls.h> > > +#include <asm-generic/bitops/ffs.h> > > +#include <asm-generic/bitops/fls.h> > > + > > +#else > > +#include <asm/alternative-macros.h> > > +#include <asm/hwcap.h> > > + > > +#if (BITS_PER_LONG == 64) > > +#define CTZW "ctzw " > > +#define CLZW "clzw " > > +#elif (BITS_PER_LONG == 32) > > +#define CTZW "ctz " > > +#define CLZW "clz " > > +#else > > +#error "Unexpected BITS_PER_LONG" > > +#endif > > + > > +static __always_inline unsigned long variable__ffs(unsigned long word) > > +{ > > + int num; > > + > > + asm_volatile_goto( > > + ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) > > + : : : : legacy); > > + > > + asm volatile ( > > + ".option push\n" > > + ".option arch,+zbb\n" > > + "ctz %0, %1\n" > > + ".option pop\n" > > + : "=r" (word) : "r" (word) :); > > + > > + return word; > > That's so clean! > > > + > > +legacy: > > + num = 0; > > +#if BITS_PER_LONG == 64 > > + if ((word & 0xffffffff) == 0) { > > + num += 32; > > + word >>= 32; > > + } > > +#endif > > + if ((word & 0xffff) == 0) { > > + num += 16; > > + word >>= 16; > > + } > > + if ((word & 0xff) == 0) { > > + num += 8; > > + word >>= 8; > > + } > > + if ((word & 0xf) == 0) { > > + num += 4; > > + word >>= 4; > > + } > > + if ((word & 0x3) == 0) { > > + num += 2; > > + word >>= 2; > > + } > > + if ((word & 0x1) == 0) > > + num += 1; > > + return num; > > +} > > + > > +/** > > + * __ffs - find first set bit in a long word > > + * @word: The word to search > > + * > > + * Undefined if no set bit exists, so code should check against 0 first. > > + */ > > +#define __ffs(word) \ > > + (__builtin_constant_p(word) ? \ > > + (unsigned long)__builtin_ctzl(word) : \ > > + variable__ffs(word)) > > + > > +static __always_inline unsigned long variable__fls(unsigned long word) > > +{ > > + int num; > > + > > + asm_volatile_goto( > > + ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) > > + : : : : legacy); > > + > > + asm volatile ( > > + ".option push\n" > > + ".option arch,+zbb\n" > > + "clz %0, %1\n" > > + ".option pop\n" > > + : "=r" (word) : "r" (word) :); > > + > > + return BITS_PER_LONG - 1 - word; > > + > > +legacy: > > + num = BITS_PER_LONG - 1; > > +#if BITS_PER_LONG == 64 > > + if (!(word & (~0ul << 32))) { > > + num -= 32; > > + word <<= 32; > > + } > > +#endif > > + if (!(word & (~0ul << (BITS_PER_LONG-16)))) { > > + num -= 16; > > + word <<= 16; > > + } > > + if (!(word & (~0ul << (BITS_PER_LONG-8)))) { > > + num -= 8; > > + word <<= 8; > > + } > > + if (!(word & (~0ul << (BITS_PER_LONG-4)))) { > > + num -= 4; > > + word <<= 4; > > + } > > + if (!(word & (~0ul << (BITS_PER_LONG-2)))) { > > + num -= 2; > > + word <<= 2; > > + } > > + if (!(word & (~0ul << (BITS_PER_LONG-1)))) > > + num -= 1; > > + return num; > > +} > > + > > +/** > > + * __fls - find last set bit in a long word > > + * @word: the word to search > > + * > > + * Undefined if no set bit exists, so code should check against 0 first. > > + */ > > +#define __fls(word) \ > > + (__builtin_constant_p(word) ? \ > > + (unsigned long)(BITS_PER_LONG - 1 - __builtin_clzl(word)) : \ > > + variable__fls(word)) > > + > > +static __always_inline int variable_ffs(int x) > > +{ > > + int r; > > + > > + asm_volatile_goto( > > + ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) > > + : : : : legacy); > > + > > + asm volatile ( > > + ".option push\n" > > + ".option arch,+zbb\n" > > + "bnez %1, 1f\n" > > + "li %0, 0\n" > > + "j 2f\n" > > + "1:\n" > > + CTZW "%0, %1\n" > > + "addi %0, %0, 1\n" > > + "2:\n" > > + ".option pop\n" > > + : "=r" (r) : "r" (x) :); > > + > > + return r; > > + > > I generally like to move as much code out of asm as possible. You are > also able to remove one of the branches if you rearrange as follows: > > if (!x) > return 0; > > asm volatile ( > ".option push\n" > ".option arch,+zbb\n" > CTZW "%0, %1\n" > ".option pop\n" > : "=r" (r) : "r" (x) :); > > return r + 1; > > You could then also extract out the "if (!x)" at the beginning of the > legacy section at put it at the top of the function. > Agree, less asm code will be more reader friendly. And your sample code looks nice, thanks for the suggestion. Will do the adjustment in next version. > > +legacy: > > + r = 1; > > + if (!x) > > + return 0; > > + if (!(x & 0xffff)) { > > + x >>= 16; > > + r += 16; > > + } > > + if (!(x & 0xff)) { > > + x >>= 8; > > + r += 8; > > + } > > + if (!(x & 0xf)) { > > + x >>= 4; > > + r += 4; > > + } > > + if (!(x & 3)) { > > + x >>= 2; > > + r += 2; > > + } > > + if (!(x & 1)) { > > + x >>= 1; > > + r += 1; > > + } > > + return r; > > +} > > + > > +/** > > + * ffs - find first set bit in a word > > + * @x: the word to search > > + * > > + * This is defined the same way as the libc and compiler builtin ffs routines. > > + * > > + * ffs(value) returns 0 if value is 0 or the position of the first set bit if > > + * value is nonzero. The first (least significant) bit is at position 1. > > + */ > > +#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x)) > > + > > +static __always_inline int variable_fls(unsigned int x) > > +{ > > + int r; > > + > > + asm_volatile_goto( > > + ALTERNATIVE("j %l[legacy]", "nop", 0, RISCV_ISA_EXT_ZBB, 1) > > + : : : : legacy); > > + > > + asm volatile ( > > + ".option push\n" > > + ".option arch,+zbb\n" > > + "bnez %1, 1f\n" > > + "li %0, 0\n" > > + "j 2f\n" > > + "1:\n" > > + CLZW "%0, %1\n" > > + "neg %0, %0\n" > > + "addi %0, %0, 32\n" > > + "2:\n" > > + ".option pop\n" > > + : "=r" (r) : "r" (x) :); > > + > > + return r; > > + > > Same comment as above but with appropriate changes. Will refine it in next version. > > > +legacy: > > + r = 32; > > + if (!x) > > + return 0; > > + if (!(x & 0xffff0000u)) { > > + x <<= 16; > > + r -= 16; > > + } > > + if (!(x & 0xff000000u)) { > > + x <<= 8; > > + r -= 8; > > + } > > + if (!(x & 0xf0000000u)) { > > + x <<= 4; > > + r -= 4; > > + } > > + if (!(x & 0xc0000000u)) { > > + x <<= 2; > > + r -= 2; > > + } > > + if (!(x & 0x80000000u)) { > > + x <<= 1; > > + r -= 1; > > + } > > + return r; > > +} > > + > > +/** > > + * fls - find last set bit in a word > > + * @x: the word to search > > + * > > + * This is defined in a similar way as ffs, but returns the position of the most > > + * significant set bit. > > + * > > + * fls(value) returns 0 if value is 0 or the position of the last set bit if > > + * value is nonzero. The last (most significant) bit is at position 32. > > + */ > > +#define fls(x) > \ > > + (__builtin_constant_p(x) ? \ > > + (int)(((x) != 0) ? \ > > + (sizeof(unsigned int) * 8 - __builtin_clz(x)) : 0) : \ > > + variable_fls(x)) > > + > > +#endif > > As a nit, it's nice when large #ifdef blocks have a comment like > > /* !defined(CONFIG_RISCV_ISA_ZBB) || defined(NO_ALTERNATIVE) */ > Yes, I will add it in next version. Thanks for the comments. BRs, Xiao > Overall great changes, just that small optimization to get rid of the > jump. > > - Charlie > > > + > > +#include <asm-generic/bitops/ffz.h> > > #include <asm-generic/bitops/fls64.h> > > #include <asm-generic/bitops/sched.h> > > -#include <asm-generic/bitops/ffs.h> > > > > #include <asm-generic/bitops/hweight.h> > >