Re: [RFC v2 6/9] netfilter: add bpf base hook program generator

On Wed, Oct 05, 2022 at 04:13:06PM +0200, Florian Westphal wrote:
>  
> @@ -254,11 +269,24 @@ static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
>  
>  	if (hook_head) {
>  		struct nf_hook_state state;
> +#if IS_ENABLED(CONFIG_NF_HOOK_BPF)
> +		const struct bpf_prog *p = READ_ONCE(hook_head->hook_prog);
> +
> +		nf_hook_state_init(&state, hook, pf, indev, outdev,
> +				   sk, net, okfn);
> +
> +		state.priv = (void *)hook_head;
> +		state.skb = skb;
>  
> +		migrate_disable();
> +		ret = bpf_prog_run_nf(p, &state);
> +		migrate_enable();

Since the generated prog doesn't do any per-cpu work and isn't using any
maps, there is no need for migrate_disable.
There is cant_migrate() in __bpf_prog_run(), but it's probably better
to silence that instead of adding migrate_disable/enable overhead.
I guess it's ok for now.
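
One way to avoid both the overhead and the cant_migrate() check would be to
not go through bpf_prog_run() at all for this caller, e.g. (just a sketch;
it also skips the stats accounting that bpf_prog_run() would do):

	ret = p->bpf_func(&state, p->insnsi);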

> +static bool emit_mov_ptr_reg(struct nf_hook_prog *p, u8 dreg, u8 sreg)
> +{
> +	if (sizeof(void *) == sizeof(u64))
> +		return emit(p, BPF_MOV64_REG(dreg, sreg));
> +	if (sizeof(void *) == sizeof(u32))
> +		return emit(p, BPF_MOV32_REG(dreg, sreg));

I bet that was never tested :) because... see below.

> +
> +	return false;
> +}
> +
> +static bool do_prologue(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	if (WARN_ON_ONCE(width < 0))
> +		return false;
> +
> +	/* argument to program is a pointer to struct nf_hook_state, in BPF_REG_1. */
> +	if (!emit_mov_ptr_reg(p, BPF_REG_6, BPF_REG_1))
> +		return false;
> +
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_7, BPF_REG_1,
> +				 offsetof(struct nf_hook_state, priv))))
> +		return false;
> +
> +	/* could load state->hook_index, but we don't support index > 0 for bpf call. */
> +	if (!emit(p, BPF_MOV32_IMM(BPF_REG_8, 0)))
> +		return false;
> +
> +	return true;
> +}
> +
> +static void patch_hook_jumps(struct nf_hook_prog *p)
> +{
> +	unsigned int i;
> +
> +	if (!p->insns)
> +		return;
> +
> +	for (i = 0; i < p->pos; i++) {
> +		if (BPF_CLASS(p->insns[i].code) != BPF_JMP)
> +			continue;
> +
> +		if (p->insns[i].code == (BPF_EXIT | BPF_JMP))
> +			continue;
> +		if (p->insns[i].code == (BPF_CALL | BPF_JMP))
> +			continue;
> +
> +		if (p->insns[i].off != JMP_INVALID)
> +			continue;
> +		p->insns[i].off = p->pos - i - 1;

Pls add a check that it fits in 16 bits.
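
Something like this (rough sketch against the hunk above; error handling is
left open since patch_hook_jumps() currently returns void):

		int off = p->pos - i - 1;

		if (WARN_ON_ONCE(off > S16_MAX))
			return;

		p->insns[i].off = off;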

> +	}
> +}
> +
> +static bool emit_retval(struct nf_hook_prog *p, int retval)
> +{
> +	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, retval)))
> +		return false;
> +
> +	return emit(p, BPF_EXIT_INSN());
> +}
> +
> +static bool emit_nf_hook_slow(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	/* restore the original state->priv. */
> +	if (!emit(p, BPF_STX_MEM(width, BPF_REG_6, BPF_REG_7,
> +				 offsetof(struct nf_hook_state, priv))))
> +		return false;
> +
> +	/* arg1 is state->skb */
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
> +				 offsetof(struct nf_hook_state, skb))))
> +		return false;
> +
> +	/* arg2 is "struct nf_hook_state *" */
> +	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> +		return false;
> +
> +	/* arg3 is nf_hook_entries (original state->priv) */
> +	if (!emit(p, BPF_MOV64_REG(BPF_REG_3, BPF_REG_7)))
> +		return false;
> +
> +	if (!emit(p, BPF_EMIT_CALL(nf_hook_slow)))
> +		return false;
> +
> +	/* No further action needed, return retval provided by nf_hook_slow */
> +	return emit(p, BPF_EXIT_INSN());
> +}
> +
> +static bool emit_nf_queue(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	if (width < 0) {
> +		WARN_ON_ONCE(1);
> +		return false;
> +	}
> +
> +	/* int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int verdict) */
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
> +				 offsetof(struct nf_hook_state, skb))))
> +		return false;
> +	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> +				 offsetof(struct nf_hook_state, hook_index))))
> +		return false;
> +	/* arg2: struct nf_hook_state * */
> +	if (!emit(p, BPF_MOV64_REG(BPF_REG_2, BPF_REG_6)))
> +		return false;
> +	/* arg3: original hook return value: (NUM << NF_VERDICT_QBITS | NF_QUEUE) */
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> +		return false;
> +	if (!emit(p, BPF_EMIT_CALL(nf_queue)))
> +		return false;

This and the other CALLs work by accident on x86-64.
You need to wrap them with BPF_CALL_ and point BPF_EMIT_CALL to that wrapper.
On x86-64 it will be a nop.
On x86-32 it will do quite a bit of work.
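
Roughly like this (untested sketch; 'nf_hook_bpf_queue' is a made-up name,
the prototype is the one quoted above):

	BPF_CALL_3(nf_hook_bpf_queue, struct sk_buff *, skb,
		   struct nf_hook_state *, state, unsigned int, verdict)
	{
		return nf_queue(skb, state, verdict);
	}

and then:

	if (!emit(p, BPF_EMIT_CALL(nf_hook_bpf_queue)))
		return false;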

> +
> +	/* Check nf_queue return value.  Abnormal case: nf_queue returned != 0.
> +	 *
> +	 * Fall back to nf_hook_slow().
> +	 */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2)))
> +		return false;
> +
> +	/* Normal case: skb was stolen. Return 0. */
> +	return emit_retval(p, 0);
> +}
> +
> +static bool do_epilogue_base_hooks(struct nf_hook_prog *p)
> +{
> +	int width = bytes_to_bpf_size(sizeof(void *));
> +
> +	if (WARN_ON_ONCE(width < 0))
> +		return false;
> +
> +	/* last 'hook'. We arrive here if previous hook returned ACCEPT,
> +	 * i.e. all hooks passed -- we are done.
> +	 *
> +	 * Return 1, skb can continue traversing network stack.
> +	 */
> +	if (!emit_retval(p, 1))
> +		return false;
> +
> +	/* Patch all hook jumps, in case any of these are taken
> +	 * we need to jump to this location.
> +	 *
> +	 * This happens when verdict is != ACCEPT.
> +	 */
> +	patch_hook_jumps(p);
> +
> +	/* need to ignore upper 24 bits, might contain errno or queue number */
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_3, BPF_REG_0)))
> +		return false;
> +	if (!emit(p, BPF_ALU32_IMM(BPF_AND, BPF_REG_3, 0xff)))
> +		return false;
> +
> +	/* ACCEPT handled, check STOLEN. */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_STOLEN, 2)))
> +		return false;
> +
> +	if (!emit_retval(p, 0))
> +		return false;
> +
> +	/* ACCEPT and STOLEN handled.  Check DROP next */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_3, NF_DROP, 1 + 2 + 2 + 2 + 2)))
> +		return false;
> +
> +	/* First step. Extract the errno number. 1 insn. */
> +	if (!emit(p, BPF_ALU32_IMM(BPF_RSH, BPF_REG_0, NF_VERDICT_QBITS)))
> +		return false;
> +
> +	/* Second step: replace errno with EPERM if it was 0. 2 insns. */
> +	if (!emit(p, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1)))
> +		return false;
> +	if (!emit(p, BPF_MOV32_IMM(BPF_REG_0, EPERM)))
> +		return false;
> +
> +	/* Third step: negate reg0: Caller expects -EFOO and stash the result.  2 insns. */
> +	if (!emit(p, BPF_ALU32_IMM(BPF_NEG, BPF_REG_0, 0)))
> +		return false;
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_8, BPF_REG_0)))
> +		return false;
> +
> +	/* Fourth step: free the skb. 2 insns. */
> +	if (!emit(p, BPF_LDX_MEM(width, BPF_REG_1, BPF_REG_6,
> +				 offsetof(struct nf_hook_state, skb))))
> +		return false;
> +	if (!emit(p, BPF_EMIT_CALL(kfree_skb)))
> +		return false;

ditto.
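
I.e. the same treatment here, e.g. (sketch, name made up):

	BPF_CALL_1(nf_hook_bpf_kfree_skb, struct sk_buff *, skb)
	{
		kfree_skb(skb);
		return 0;
	}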

> +
> +	/* Last step: return. 2 insns. */
> +	if (!emit(p, BPF_MOV32_REG(BPF_REG_0, BPF_REG_8)))
> +		return false;
> +	if (!emit(p, BPF_EXIT_INSN()))
> +		return false;
> +
> +	/* ACCEPT, STOLEN and DROP have been handled.
> +	 * REPEAT and STOP are not allowed anymore for individual hook functions.
> +	 * This leaves NFQUEUE as the only remaining return value.
> +	 *
> +	 * In this case BPF_REG_0 still contains the original verdict of
> +	 * '(NUM << NF_VERDICT_QBITS | NF_QUEUE)', so pass it to nf_queue() as-is.
> +	 */
> +	if (!emit_nf_queue(p))
> +		return false;
> +
> +	/* Increment hook index and store it in nf_hook_state so nf_hook_slow will
> +	 * start at the next hook, if any.
> +	 */
> +	if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
> +		return false;
> +	if (!emit(p, BPF_STX_MEM(BPF_H, BPF_REG_6, BPF_REG_8,
> +				 offsetof(struct nf_hook_state, hook_index))))
> +		return false;
> +
> +	return emit_nf_hook_slow(p);
> +}
> +
> +static int nf_hook_prog_init(struct nf_hook_prog *p)
> +{
> +	memset(p, 0, sizeof(*p));
> +
> +	p->insns = kcalloc(BPF_MAXINSNS, sizeof(*p->insns), GFP_KERNEL);
> +	if (!p->insns)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +
> +static void nf_hook_prog_free(struct nf_hook_prog *p)
> +{
> +	kfree(p->insns);
> +}
> +
> +static int xlate_base_hooks(struct nf_hook_prog *p, const struct nf_hook_entries *e)
> +{
> +	unsigned int i, len;
> +
> +	len = e->num_hook_entries;
> +
> +	if (!do_prologue(p))
> +		goto out;
> +
> +	for (i = 0; i < len; i++) {
> +		if (!xlate_one_hook(p, e, &e->hooks[i]))
> +			goto out;
> +
> +		if (i + 1 < len) {
> +			if (!emit(p, BPF_MOV64_REG(BPF_REG_1, BPF_REG_6)))
> +				goto out;
> +
> +			if (!emit(p, BPF_ALU32_IMM(BPF_ADD, BPF_REG_8, 1)))
> +				goto out;
> +		}
> +	}
> +
> +	if (!do_epilogue_base_hooks(p))
> +		goto out;
> +
> +	return 0;
> +out:
> +	return -EINVAL;
> +}
> +
> +static struct bpf_prog *nf_hook_jit_compile(struct bpf_insn *insns, unsigned int len)
> +{
> +	struct bpf_prog *prog;
> +	int err = 0;
> +
> +	prog = bpf_prog_alloc(bpf_prog_size(len), 0);
> +	if (!prog)
> +		return NULL;
> +
> +	prog->len = len;
> +	prog->type = BPF_PROG_TYPE_SOCKET_FILTER;

lol. Just say BPF_PROG_TYPE_UNSPEC ?

> +	memcpy(prog->insnsi, insns, prog->len * sizeof(struct bpf_insn));
> +
> +	prog = bpf_prog_select_runtime(prog, &err);
> +	if (err) {
> +		bpf_prog_free(prog);
> +		return NULL;
> +	}

Would be good to do bpf_prog_alloc_id() so it can be seen in
bpftool prog show.
and bpf_prog_kallsyms_add() to make 'perf report' and
stack traces readable.
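
E.g. (sketch; if bpf_prog_alloc_id() is still static in kernel/bpf/syscall.c
it would need to be exposed to this caller first, and both steps have to be
undone again when the prog is released):

	err = bpf_prog_alloc_id(prog);		/* shows up in 'bpftool prog show' */
	if (err < 0) {
		bpf_prog_free(prog);
		return NULL;
	}
	bpf_prog_kallsyms_add(prog);		/* readable symbol in perf/stack traces */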

Overall I don't hate it, but don't like it either.
Please provide performance numbers.
It's a lot of tricky code, and it's not clear what the benefits are.
Who will maintain this body of code long term?
How are we going to deal with refactoring that will touch generic bpf bits
and this generated prog?

> Purpose of this is to eventually add a 'netfilter prog type' to bpf and
> permit attachment of (userspace generated) bpf programs to the netfilter
> machinery, e.g.  'attach bpf prog id 1234 to ipv6 PREROUTING at prio -300'.
> 
> This will require exposing the context structure (program argument,
> '__nf_hook_state'), with rewriting of accesses to match the nf_hook_state
> layout.

This part is orthogonal, right? I don't see how this work is connected
to the above idea.
I'm still convinced that xt_bpf was a bad choice for many reasons.
"Add a 'netfilter prog type' to bpf" would repeat the same mistakes.
Let's evaluate this set independently.


