Re: [PATCH bpf-next 13/15] libbpf: Generate loader program out of BPF ELF file.


On 4/20/21 9:46 PM, Alexei Starovoitov wrote:
On Tue, Apr 20, 2021 at 06:34:11PM -0700, Yonghong Song wrote:

kconfig, typeless ksym, struct_ops and CO-RE are not supported yet.

Beyond this, libbpf currently has a lot of flexibility between prog open
and load: changing the program type, key/value size, max_entries, pinning
maps, reusing maps, etc. It is worthwhile to mention this in the cover letter.
It is possible that these changes may defeat the purpose of signing the
program, though.

Right. We'd need to decide which ones are ok to change after signature
verification. I think max_entries has to be allowed, since tools
actively change it. The selftests change the other fields too, but I'm not
sure it's a good thing to allow for signed progs. TBD.

+	if (gen->error)
+		return -ENOMEM;

return gen->error?

right

+	if (off + size > UINT32_MAX) {
+		gen->error = -ERANGE;
+		return -ERANGE;
+	}
+	gen->insn_start = realloc(gen->insn_start, off + size);
+	if (!gen->insn_start) {
+		gen->error = -ENOMEM;
+		return -ENOMEM;
+	}
+	gen->insn_cur = gen->insn_start + off;
+	return 0;
+}
+
+static int bpf_gen__realloc_data_buf(struct bpf_gen *gen, __u32 size)

Maybe change the return type to size_t? Especially since below we
have off + size > UINT32_MAX.

Return type? It's 0 or error. You mean the argument type?
I think u32 is better. The prog size and all the other call sites
pass 32-bit values to bpf_gen__add_data.

Sorry, I mean

+static int bpf_gen__add_data(struct bpf_gen *gen, const void *data, __u32 size)

Since we allow off + size to be close to UINT32_MAX,
maybe bpf_gen__add_data should return __u32 instead of int.


+{
+	size_t off = gen->data_cur - gen->data_start;
+
+	if (gen->error)
+		return -ENOMEM;

return gen->error?

right

+	if (off + size > UINT32_MAX) {
+		gen->error = -ERANGE;
+		return -ERANGE;
+	}
+	gen->data_start = realloc(gen->data_start, off + size);
+	if (!gen->data_start) {
+		gen->error = -ENOMEM;
+		return -ENOMEM;
+	}
+	gen->data_cur = gen->data_start + off;
+	return 0;
+}
+
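For reference, a minimal sketch of the data-buf helper with the agreed
"return gen->error" change applied (same struct bpf_gen fields as in the
patch; untested):

static int bpf_gen__realloc_data_buf(struct bpf_gen *gen, __u32 size)
{
	size_t off = gen->data_cur - gen->data_start;

	/* propagate the first recorded error instead of a hardcoded -ENOMEM */
	if (gen->error)
		return gen->error;
	if (off + size > UINT32_MAX) {
		gen->error = -ERANGE;
		return -ERANGE;
	}
	gen->data_start = realloc(gen->data_start, off + size);
	if (!gen->data_start) {
		gen->error = -ENOMEM;
		return -ENOMEM;
	}
	gen->data_cur = gen->data_start + off;
	return 0;
}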
+static void bpf_gen__emit(struct bpf_gen *gen, struct bpf_insn insn)
+{
+	if (bpf_gen__realloc_insn_buf(gen, sizeof(insn)))
+		return;
+	memcpy(gen->insn_cur, &insn, sizeof(insn));
+	gen->insn_cur += sizeof(insn);
+}
+
+static void bpf_gen__emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn insn2)
+{
+	bpf_gen__emit(gen, insn1);
+	bpf_gen__emit(gen, insn2);
+}
+
+void bpf_gen__init(struct bpf_gen *gen, int log_level)
+{
+	gen->log_level = log_level;
+	bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_6, BPF_REG_1));
+	bpf_gen__emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_10, stack_off(last_attach_btf_obj_fd), 0));

Here we initialize last_attach_btf_obj_fd; do we need to initialize
last_btf_id too?

Not sure why I inited it. Probably left over. I'll remove it.

+}
+
+static int bpf_gen__add_data(struct bpf_gen *gen, const void *data, __u32 size)
+{
+	void *prev;
+
+	if (bpf_gen__realloc_data_buf(gen, size))
+		return 0;
+	prev = gen->data_cur;
+	memcpy(gen->data_cur, data, size);
+	gen->data_cur += size;
+	return prev - gen->data_start;
+}
+
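If the return type change suggested above is adopted, the helper might look
roughly like this (a sketch; on failure it still returns offset 0 and the
actual error stays parked in gen->error):

static __u32 bpf_gen__add_data(struct bpf_gen *gen, const void *data, __u32 size)
{
	void *prev;

	if (bpf_gen__realloc_data_buf(gen, size))
		return 0; /* gen->error already records the failure */
	prev = gen->data_cur;
	memcpy(gen->data_cur, data, size);
	gen->data_cur += size;
	return prev - gen->data_start;
}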
+static int insn_bytes_to_bpf_size(__u32 sz)
+{
+	switch (sz) {
+	case 8: return BPF_DW;
+	case 4: return BPF_W;
+	case 2: return BPF_H;
+	case 1: return BPF_B;
+	default: return -1;
+	}
+}
+
[...]
+
+static void __bpf_gen__debug(struct bpf_gen *gen, int reg1, int reg2, const char *fmt, va_list args)
+{
+	char buf[1024];
+	int addr, len, ret;
+
+	if (!gen->log_level)
+		return;
+	ret = vsnprintf(buf, sizeof(buf), fmt, args);
+	if (ret < 1024 - 7 && reg1 >= 0 && reg2 < 0)
+		strcat(buf, " r=%d");

Why only for reg1 >= 0 && reg2 < 0?

To avoid specifying BPF_REG_7 and adding " r=%%d" to the printks explicitly.
Just to keep bpf_gen__debug_ret() short and less verbose.

+	len = strlen(buf) + 1;
+	addr = bpf_gen__add_data(gen, buf, len);
+
+	bpf_gen__emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, addr));
+	bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_2, len));
+	if (reg1 >= 0)
+		bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_3, reg1));
+	if (reg2 >= 0)
+		bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_4, reg2));
+	bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_trace_printk));
+}
+
+static void bpf_gen__debug_regs(struct bpf_gen *gen, int reg1, int reg2, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	__bpf_gen__debug(gen, reg1, reg2, fmt, args);
+	va_end(args);
+}
+
+static void bpf_gen__debug_ret(struct bpf_gen *gen, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	__bpf_gen__debug(gen, BPF_REG_7, -1, fmt, args);
+	va_end(args);
+}
+
+static void bpf_gen__emit_sys_close(struct bpf_gen *gen, int stack_off)
+{
+	bpf_gen__emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, stack_off));
+	bpf_gen__emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, 2 + (gen->log_level ? 6 : 0)));

The number "6" is magic. It refers to the number of insns generated below by
    bpf_gen__debug_regs(gen, BPF_REG_9, BPF_REG_0, "close(%%d) = %%d");
At least a comment would be better.

good point. will add a comment.


+	bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_1));
+	bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close));
+	bpf_gen__debug_regs(gen, BPF_REG_9, BPF_REG_0, "close(%%d) = %%d");
+}
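For example, the agreed-upon comment might read something like this
(sketch only, wording not from the patch):

static void bpf_gen__emit_sys_close(struct bpf_gen *gen, int stack_off)
{
	bpf_gen__emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, stack_off));
	/* If the fd is <= 0, skip the mov + call below (2 insns) and, when
	 * log_level is set, also the 6 insns emitted by bpf_gen__debug_regs().
	 */
	bpf_gen__emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, 2 + (gen->log_level ? 6 : 0)));
	bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_1));
	bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close));
	bpf_gen__debug_regs(gen, BPF_REG_9, BPF_REG_0, "close(%%d) = %%d");
}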
+
+int bpf_gen__finish(struct bpf_gen *gen)
+{
+	int i;
+
+	bpf_gen__emit_sys_close(gen, stack_off(btf_fd));
+	for (i = 0; i < gen->nr_progs; i++)
+		bpf_gen__move_stack2ctx(gen, offsetof(struct bpf_loader_ctx,
+						      u[gen->nr_maps + i].map_fd), 4,

Maybe u[gen->nr_maps + i].prog_fd?
u[..] is a union, but prog_fd better reflects what it is.

ohh. right.

+					stack_off(prog_fd[i]));
+	for (i = 0; i < gen->nr_maps; i++)
+		bpf_gen__move_stack2ctx(gen, offsetof(struct bpf_loader_ctx,
+						      u[i].prog_fd), 4,

u[i].map_fd?

right.
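With the union members swapped as discussed, the two loops would read
roughly like this (a sketch; the stack_off(map_fd[i]) slot in the second
loop is assumed by analogy with prog_fd[i], since that argument is not
visible in the quoted hunk):

	for (i = 0; i < gen->nr_progs; i++)
		bpf_gen__move_stack2ctx(gen,
					offsetof(struct bpf_loader_ctx,
						 u[gen->nr_maps + i].prog_fd), 4,
					stack_off(prog_fd[i]));
	for (i = 0; i < gen->nr_maps; i++)
		bpf_gen__move_stack2ctx(gen,
					offsetof(struct bpf_loader_ctx,
						 u[i].map_fd), 4,
					stack_off(map_fd[i]));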

+	/* remember map_fd in the stack, if successful */
+	if (map_idx < 0) {
+		bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(inner_map_fd)));

A comment here indicating that map_idx < 0 means inner map creation
would help in understanding the code.

will do.

+	/* store btf_id into insn[insn_idx].imm */
+	insn = (int)(long)&((struct bpf_insn *)(long)insns)[relo->insn_idx].imm;

This is really fancy. Maybe something like
	insn = insns + sizeof(struct bpf_insn) * relo->insn_idx +
	       offsetof(struct bpf_insn, imm);
Does this sound better?

yeah. much better.

+static void mark_feat_supported(enum kern_feature_id last_feat)
+{
+	struct kern_feature_desc *feat;
+	int i;
+
+	for (i = 0; i <= last_feat; i++) {
+		feat = &feature_probes[i];
+		WRITE_ONCE(feat->res, FEAT_SUPPORTED);
+	}

This assumes all features earlier than FD_IDX are supported. This is
probably fine, although it may not work for some weird backports.
Did you see any issues if we don't explicitly mark the previous features
as supported?

This helper is only used as mark_feat_supported(FEAT_FD_IDX)
to tell libbpf that it shouldn't probe anything.
Otherwise probing via prog_load screws up gen_trace completely.
Maybe it could be mark_all_feat_supported(void), but that seems less flexible.

Maybe add a comment here to explain why the features are explicitly
marked as supported instead of being probed?
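For example, something along these lines (a sketch reflecting the
rationale given above, not wording from the patch):

/* Used only as mark_feat_supported(FEAT_FD_IDX) while generating a loader
 * program: feature probing issues prog_load syscalls which would interfere
 * with gen_trace, so pretend every feature up to 'last_feat' is supported
 * and skip the probes entirely.
 */
static void mark_feat_supported(enum kern_feature_id last_feat)
{
	struct kern_feature_desc *feat;
	int i;

	for (i = 0; i <= last_feat; i++) {
		feat = &feature_probes[i];
		WRITE_ONCE(feat->res, FEAT_SUPPORTED);
	}
}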


@@ -9383,7 +9512,13 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd,
   	}
   	/* kernel/module BTF ID */
-	err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id);
+	if (prog->obj->gen_trace) {
+		bpf_gen__record_find_name(prog->obj->gen_trace, attach_name, attach_type);
+		*btf_obj_fd = 0;
+		*btf_type_id = 1;

We have quite a bit of code like this and may add more to support more
features. I am wondering whether we could have some kind of callback
to make the code more streamlined, but I am not sure how easy that is.

You mean find_kernel_btf_id() in general?
This 'find' operation is translated differently depending on the context:
for a prog name, as seen in this hunk, via bpf_gen__record_find_name(),
and via bpf_gen__record_extern() in another place.
For libbpf it's all find_kernel_btf_id(), but semantically they are different,
so they cannot map as-is to a gen-trace bpf_gen__find_kernel_btf_id() (if there
were such a thing), because such a 'generic' callback wouldn't convey what to
do with the result of the find.

I mean something like calling
    err = obj->ops->find_kernel_btf_id(...)
where gen_trace and normal libbpf each register their own callback for
find_kernel_btf_id(). Similar ideas could be applied in other places.
I'm not 100% sure this is the best approach; just wanted to bring it up
for discussion.
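To make the suggestion concrete, a rough illustration of the kind of ops
table being proposed (struct bpf_object_ops and the obj->ops field are
hypothetical, not part of the patch):

struct bpf_object_ops {
	int (*find_kernel_btf_id)(struct bpf_object *obj, const char *attach_name,
				  enum bpf_attach_type attach_type,
				  int *btf_obj_fd, int *btf_type_id);
};

static int gen_trace_find_kernel_btf_id(struct bpf_object *obj,
					const char *attach_name,
					enum bpf_attach_type attach_type,
					int *btf_obj_fd, int *btf_type_id)
{
	/* record the lookup for the loader program instead of doing it now */
	bpf_gen__record_find_name(obj->gen_trace, attach_name, attach_type);
	*btf_obj_fd = 0;
	*btf_type_id = 1;
	return 0;
}

The call site in libbpf_find_attach_btf_id() would then shrink to a single
	err = obj->ops->find_kernel_btf_id(obj, attach_name, attach_type,
					   btf_obj_fd, btf_type_id);
regardless of whether a loader program is being generated.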


+	} else {
+		err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id);
+	}
   	if (err) {
   		pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err);
   		return err;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index b9b29baf1df8..a5dffc0a3369 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -361,4 +361,5 @@ LIBBPF_0.4.0 {
   		bpf_linker__new;
   		bpf_map__inner_map;
   		bpf_object__set_kversion;
+		bpf_load;

Based on alphabetical ordering, this should move a few places earlier.

I will need to go through the patch again for better understanding ...

Thanks a lot for the review.

I'll address these comments and those that I got offline and will post v2.
This gen stuff will look quite different.
I hope bpf_load will not be a part of uapi anymore.
And 'struct bpf_gen' will not be exposed to bpftool directly.

Sounds good.


