Re: [PATCH bpf-next v4 2/3] bpf,x64: use shrx/sarx/shlx when available

KP Singh <kpsingh@xxxxxxxxxx> · Wed, 5 Oct 2022 21:11:01 -0700

On Sat, Oct 1, 2022 at 10:12 PM Jie Meng <jmeng@xxxxxx> wrote:
>
> Instead of shr/sar/shl that implicitly use %cl, emit their more flexible
> alternatives provided in BMI2 when advantageous; keep using the non BMI2
> instructions when shift count is already in BPF_REG_4/rcx as non BMI2
> instructions are shorter.

This is confusing, you mention %CL in the first sentence and then RCX in the
second sentence. Can you clarify this more here?

Also, It would be good to have some explanations about the
performance benefits here as well.

i.e. a load + store + non vector instruction v/s a single vector instruction
omitting the load. How many cycles do we expect in each case, I do expect the
latter to be lesser, but mentioning it in the commit removes any ambiguity.

>
> To summarize, when BMI2 is available:
> -------------------------------------------------
>             |   arbitrary dst
> =================================================
> src == ecx  |   shl dst, cl
> -------------------------------------------------
> src != ecx  |   shlx dst, dst, src
> -------------------------------------------------
>
> A concrete example between non BMI2 and BMI2 codegen.  To shift %rsi by
> %rdi:
>
> Without BMI2:
>
>  ef3:   push   %rcx
>         51
>  ef4:   mov    %rdi,%rcx
>         48 89 f9
>  ef7:   shl    %cl,%rsi
>         48 d3 e6
>  efa:   pop    %rcx
>         59
>
> With BMI2:
>
>  f0b:   shlx   %rdi,%rsi,%rsi
>         c4 e2 c1 f7 f6
>
> Signed-off-by: Jie Meng <jmeng@xxxxxx>
> ---
>  arch/x86/net/bpf_jit_comp.c | 64 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 64 insertions(+)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index d9ba997c5891..d09c54f3d2e0 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -889,6 +889,48 @@ static void emit_nops(u8 **pprog, int len)
>         *pprog = prog;
>  }
>
> +/* emit the 3-byte VEX prefix */
> +static void emit_3vex(u8 **pprog, bool r, bool x, bool b, u8 m,
> +                     bool w, u8 src_reg2, bool l, u8 p)

Can you please use somewhat more descriptive variable names here?

or add more information about what x, b, m, w, l and p mean?

> +{
> +       u8 *prog = *pprog;
> +       u8 b0 = 0xc4, b1, b2;
> +       u8 src2 = reg2hex[src_reg2];
> +
> +       if (is_ereg(src_reg2))
> +               src2 |= 1 << 3;
> +
> +       /*
> +        *    7                           0
> +        *  +---+---+---+---+---+---+---+---+
> +        *  |~R |~X |~B |         m         |
> +        *  +---+---+---+---+---+---+---+---+
> +        */
> +       b1 = (!r << 7) | (!x << 6) | (!b << 5) | (m & 0x1f);

Some explanation here would help, not everyone is aware of x86 vex encoding :)

> +       /*
> +        *    7                           0
> +        *  +---+---+---+---+---+---+---+---+
> +        *  | W |     ~vvvv     | L |   pp  |
> +        *  +---+---+---+---+---+---+---+---+
> +        */
> +       b2 = (w << 7) | ((~src2 & 0xf) << 3) | (l << 2) | (p & 3);
> +
> +       EMIT3(b0, b1, b2);
> +       *pprog = prog;
> +}
> +
> +/* emit BMI2 shift instruction */
> +static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
> +{
> +       u8 *prog = *pprog;
> +       bool r = is_ereg(dst_reg);
> +       u8 m = 2; /* escape code 0f38 */
> +
> +       emit_3vex(&prog, r, false, r, m, is64, src_reg, false, op);
> +       EMIT2(0xf7, add_2reg(0xC0, dst_reg, dst_reg));
> +       *pprog = prog;
> +}
> +
>  #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
>
>  static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
> @@ -1135,6 +1177,28 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
>                 case BPF_ALU64 | BPF_LSH | BPF_X:
>                 case BPF_ALU64 | BPF_RSH | BPF_X:
>                 case BPF_ALU64 | BPF_ARSH | BPF_X:
> +                       /* BMI2 shifts aren't better when shift count is already in rcx */
> +                       if (boot_cpu_has(X86_FEATURE_BMI2) && src_reg != BPF_REG_4) {
> +                               /* shrx/sarx/shlx dst_reg, dst_reg, src_reg */
> +                               bool w = (BPF_CLASS(insn->code) == BPF_ALU64);
> +                               u8 op;
> +
> +                               switch (BPF_OP(insn->code)) {
> +                               case BPF_LSH:
> +                                       op = 1; /* prefix 0x66 */
> +                                       break;
> +                               case BPF_RSH:
> +                                       op = 3; /* prefix 0xf2 */
> +                                       break;
> +                               case BPF_ARSH:
> +                                       op = 2; /* prefix 0xf3 */
> +                                       break;
> +                               }
> +
> +                               emit_shiftx(&prog, dst_reg, src_reg, w, op);
> +
> +                               break;
> +                       }
>
>                         if (src_reg != BPF_REG_4) { /* common case */
>                                 /* Check for bad case when dst_reg == rcx */
> --
> 2.30.2
>