Re: [PATCH bpf-next 13/15] libbpf: Generate loader program out of BPF ELF file.

Yonghong Song <yhs@xxxxxx> · Tue, 20 Apr 2021 18:34:11 -0700

On 4/16/21 8:32 PM, Alexei Starovoitov wrote:
From: Alexei Starovoitov <ast@xxxxxxxxxx>

The BPF program loading process performed by libbpf is quite complex
and consists of the following steps:
"open" phase:
- parse elf file and remember relocations, sections
- collect externs and ksyms including their btf_ids in prog's BTF
- patch BTF datasec (since llvm couldn't do it)
- init maps (old style map_def, BTF based, global data map, kconfig map)
- collect relocations against progs and maps
"load" phase:
- probe kernel features
- load vmlinux BTF
- resolve externs (kconfig and ksym)
- load program BTF
- init struct_ops
- create maps
- apply CO-RE relocations
- patch ld_imm64 insns with src_reg=PSEUDO_MAP, PSEUDO_MAP_VALUE, PSEUDO_BTF_ID
- reposition subprograms and adjust call insns
- sanitize and load progs

During this process libbpf does sys_bpf() calls to load BTF, create maps,
populate maps and finally load programs.
Instead of actually doing the syscalls generate a trace of what libbpf
would have done and represent it as the "loader program".
The "loader program" consists of single map with:
- union bpf_attr(s)
- BTF bytes
- map value bytes
- insns bytes
and single bpf program that passes bpf_attr(s) and data into bpf_sys_bpf() helper.
Executing such "loader program" via bpf_prog_test_run() command will
replay the sequence of syscalls that libbpf would have done which will result
the same maps created and programs loaded as specified in the elf file.
The "loader program" removes libelf and majority of libbpf dependency from
program loading process.

kconfig, typeless ksym, struct_ops and CO-RE are not supported yet.

Beyond this, libbpf currently allows a lot of flexibility between prog open
and load: changing program type, key/value size, max_entries, pinning maps,
reusing maps, etc. It is worthwhile to mention this in the cover letter.
It is possible that these changes may defeat the purpose of signing the
program though.


The order of relocate_data and relocate_calls had to change in order
for trace generation to see all relocations for given program with
correct insn_idx-es.

Signed-off-by: Alexei Starovoitov <ast@xxxxxxxxxx>
---
  tools/lib/bpf/Build              |   2 +-
  tools/lib/bpf/bpf.c              |  61 ++++
  tools/lib/bpf/bpf.h              |  35 ++
  tools/lib/bpf/bpf_gen_internal.h |  38 +++
  tools/lib/bpf/gen_trace.c        | 529 +++++++++++++++++++++++++++++++
  tools/lib/bpf/libbpf.c           | 199 ++++++++++--
  tools/lib/bpf/libbpf.map         |   1 +
  tools/lib/bpf/libbpf_internal.h  |   2 +
  8 files changed, 834 insertions(+), 33 deletions(-)
  create mode 100644 tools/lib/bpf/bpf_gen_internal.h
  create mode 100644 tools/lib/bpf/gen_trace.c

diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
index 9b057cc7650a..d0a1903bcc3c 100644
--- a/tools/lib/bpf/Build
+++ b/tools/lib/bpf/Build
@@ -1,3 +1,3 @@
  libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \
  	    netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \
-	    btf_dump.o ringbuf.o strset.o linker.o
+	    btf_dump.o ringbuf.o strset.o linker.o gen_trace.o
diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index b96a3aba6fcc..517e4f949a73 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -972,3 +972,64 @@ int bpf_prog_bind_map(int prog_fd, int map_fd,
[...]
+/* The layout of bpf_map_prog_desc and bpf_loader_ctx is feature dependent
+ * and will change from one version of libbpf to another and features
+ * requested during loader program generation.
+ */
+union bpf_map_prog_desc {
+	struct {
+		__u32 map_fd;
+		__u32 max_entries;
+	};
+	struct {
+		__u32 prog_fd;
+		__u32 attach_prog_fd;
+	};
+};
+
+struct bpf_loader_ctx {
+	size_t sz;
+	__u32 log_level;
+	__u32 log_size;
+	__u64 log_buf;
+	union bpf_map_prog_desc u[];
+};
+
+struct bpf_load_opts {
+	size_t sz; /* size of this struct for forward/backward compatibility */
+	struct bpf_loader_ctx *ctx;
+	const void *data;
+	const void *insns;
+	__u32 data_sz;
+	__u32 insns_sz;
+};
+#define bpf_load_opts__last_field insns_sz
+
+LIBBPF_API int bpf_load(const struct bpf_load_opts *opts);
+
  #ifdef __cplusplus
  } /* extern "C" */
  #endif
diff --git a/tools/lib/bpf/bpf_gen_internal.h b/tools/lib/bpf/bpf_gen_internal.h
new file mode 100644
index 000000000000..a79f2e4ad980
--- /dev/null
+++ b/tools/lib/bpf/bpf_gen_internal.h
@@ -0,0 +1,38 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* Copyright (c) 2021 Facebook */
+#ifndef __BPF_GEN_INTERNAL_H
+#define __BPF_GEN_INTERNAL_H
+
+struct relo_desc {
+	const char *name;
+	int kind;
+	int insn_idx;
+};
+
+struct bpf_gen {
+	void *data_start;
+	void *data_cur;
+	void *insn_start;
+	void *insn_cur;
+	__u32 nr_progs;
+	__u32 nr_maps;
+	int log_level;
+	int error;
+	struct relo_desc *relos;
+	int relo_cnt;
+};
+
+void bpf_object__set_gen_trace(struct bpf_object *obj, struct bpf_gen *gen);
+
+void bpf_gen__init(struct bpf_gen *gen, int log_level);
+int bpf_gen__finish(struct bpf_gen *gen);
+void bpf_gen__load_btf(struct bpf_gen *gen, const void *raw_data, __u32 raw_size);
+void bpf_gen__map_create(struct bpf_gen *gen, struct bpf_create_map_attr *map_attr, int map_idx);
+struct bpf_prog_load_params;
+void bpf_gen__prog_load(struct bpf_gen *gen, struct bpf_prog_load_params *load_attr, int prog_idx);
+void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *value, __u32 value_size);
+void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx);
+void bpf_gen__record_find_name(struct bpf_gen *gen, const char *name, enum bpf_attach_type type);
+void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, int kind, int insn_idx);
+
+#endif
diff --git a/tools/lib/bpf/gen_trace.c b/tools/lib/bpf/gen_trace.c
new file mode 100644
index 000000000000..1a80a8dd1c9f
--- /dev/null
+++ b/tools/lib/bpf/gen_trace.c
@@ -0,0 +1,529 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (c) 2021 Facebook */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <linux/filter.h>
+#include "btf.h"
+#include "bpf.h"
+#include "libbpf.h"
+#include "libbpf_internal.h"
+#include "hashmap.h"
+#include "bpf_gen_internal.h"
+
+#define MAX_USED_MAPS 64
+#define MAX_USED_PROGS 32
+
+/* The following structure describes the stack layout of the loader program.
+ * In addition R6 contains the pointer to context.
+ * R7 contains the result of the last sys_bpf command (typically error or FD).
+ */
+struct loader_stack {
+	__u32 btf_fd;
+	__u32 map_fd[MAX_USED_MAPS];
+	__u32 prog_fd[MAX_USED_PROGS];
+	__u32 inner_map_fd;
+	__u32 last_btf_id;
+	__u32 last_attach_btf_obj_fd;
+};
+#define stack_off(field) (__s16)(-sizeof(struct loader_stack) + offsetof(struct loader_stack, field))
+
+static int bpf_gen__realloc_insn_buf(struct bpf_gen *gen, __u32 size)
+{
+	size_t off = gen->insn_cur - gen->insn_start;
+
+	if (gen->error)
+		return -ENOMEM;

Should this return gen->error instead?

+	if (off + size > UINT32_MAX) {
+		gen->error = -ERANGE;
+		return -ERANGE;
+	}
+	gen->insn_start = realloc(gen->insn_start, off + size);
+	if (!gen->insn_start) {
+		gen->error = -ENOMEM;
+		return -ENOMEM;
+	}
+	gen->insn_cur = gen->insn_start + off;
+	return 0;
+}
+
+static int bpf_gen__realloc_data_buf(struct bpf_gen *gen, __u32 size)

Maybe change the return type to size_t? Especially since below
we check off + size > UINT32_MAX.

+{
+	size_t off = gen->data_cur - gen->data_start;
+
+	if (gen->error)
+		return -ENOMEM;

Should this return gen->error instead?

+	if (off + size > UINT32_MAX) {
+		gen->error = -ERANGE;
+		return -ERANGE;
+	}
+	gen->data_start = realloc(gen->data_start, off + size);
+	if (!gen->data_start) {
+		gen->error = -ENOMEM;
+		return -ENOMEM;
+	}
+	gen->data_cur = gen->data_start + off;
+	return 0;
+}
+
+static void bpf_gen__emit(struct bpf_gen *gen, struct bpf_insn insn)
+{
+	if (bpf_gen__realloc_insn_buf(gen, sizeof(insn)))
+		return;
+	memcpy(gen->insn_cur, &insn, sizeof(insn));
+	gen->insn_cur += sizeof(insn);
+}
+
+static void bpf_gen__emit2(struct bpf_gen *gen, struct bpf_insn insn1, struct bpf_insn insn2)
+{
+	bpf_gen__emit(gen, insn1);
+	bpf_gen__emit(gen, insn2);
+}
+
+void bpf_gen__init(struct bpf_gen *gen, int log_level)
+{
+	gen->log_level = log_level;
+	bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_6, BPF_REG_1));
+	bpf_gen__emit(gen, BPF_ST_MEM(BPF_W, BPF_REG_10, stack_off(last_attach_btf_obj_fd), 0));

Here we initialize last_attach_btf_obj_fd. Do we also need to initialize
last_btf_id?

+}
+
+static int bpf_gen__add_data(struct bpf_gen *gen, const void *data, __u32 size)
+{
+	void *prev;
+
+	if (bpf_gen__realloc_data_buf(gen, size))
+		return 0;
+	prev = gen->data_cur;
+	memcpy(gen->data_cur, data, size);
+	gen->data_cur += size;
+	return prev - gen->data_start;
+}
+
+static int insn_bytes_to_bpf_size(__u32 sz)
+{
+	switch (sz) {
+	case 8: return BPF_DW;
+	case 4: return BPF_W;
+	case 2: return BPF_H;
+	case 1: return BPF_B;
+	default: return -1;
+	}
+}
+
[...]
+
+static void __bpf_gen__debug(struct bpf_gen *gen, int reg1, int reg2, const char *fmt, va_list args)
+{
+	char buf[1024];
+	int addr, len, ret;
+
+	if (!gen->log_level)
+		return;
+	ret = vsnprintf(buf, sizeof(buf), fmt, args);
+	if (ret < 1024 - 7 && reg1 >= 0 && reg2 < 0)
+		strcat(buf, " r=%d");

Why only for reg1 >= 0 && reg2 < 0?

+	len = strlen(buf) + 1;
+	addr = bpf_gen__add_data(gen, buf, len);
+
+	bpf_gen__emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_1, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, addr));
+	bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_2, len));
+	if (reg1 >= 0)
+		bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_3, reg1));
+	if (reg2 >= 0)
+		bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_4, reg2));
+	bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_trace_printk));
+}
+
+static void bpf_gen__debug_regs(struct bpf_gen *gen, int reg1, int reg2, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	__bpf_gen__debug(gen, reg1, reg2, fmt, args);
+	va_end(args);
+}
+
+static void bpf_gen__debug_ret(struct bpf_gen *gen, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	__bpf_gen__debug(gen, BPF_REG_7, -1, fmt, args);
+	va_end(args);
+}
+
+static void bpf_gen__emit_sys_close(struct bpf_gen *gen, int stack_off)
+{
+	bpf_gen__emit(gen, BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_10, stack_off));
+	bpf_gen__emit(gen, BPF_JMP_IMM(BPF_JSLE, BPF_REG_1, 0, 2 + (gen->log_level ? 6 : 0)));

The number "6" is magic. It refers to the number of insns generated below
by
   bpf_gen__debug_regs(gen, BPF_REG_9, BPF_REG_0, "close(%%d) = %%d");
At least a comment would help.

+	bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_1));
+	bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_sys_close));
+	bpf_gen__debug_regs(gen, BPF_REG_9, BPF_REG_0, "close(%%d) = %%d");
+}
+
+int bpf_gen__finish(struct bpf_gen *gen)
+{
+	int i;
+
+	bpf_gen__emit_sys_close(gen, stack_off(btf_fd));
+	for (i = 0; i < gen->nr_progs; i++)
+		bpf_gen__move_stack2ctx(gen, offsetof(struct bpf_loader_ctx,
+						      u[gen->nr_maps + i].map_fd), 4,

Maybe u[gen->nr_maps + i].prog_fd?
u[..] is a union, but prog_fd better reflects what it is.

+					stack_off(prog_fd[i]));
+	for (i = 0; i < gen->nr_maps; i++)
+		bpf_gen__move_stack2ctx(gen, offsetof(struct bpf_loader_ctx,
+						      u[i].prog_fd), 4,

u[i].map_fd?

+					stack_off(map_fd[i]));
+	bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_0, 0));
+	bpf_gen__emit(gen, BPF_EXIT_INSN());
+	pr_debug("bpf_gen__finish %d\n", gen->error);
+	return gen->error;
+}
+
+void bpf_gen__load_btf(struct bpf_gen *gen, const void *btf_raw_data, __u32 btf_raw_size)
+{
+	union bpf_attr attr = {};
+	int attr_size = offsetofend(union bpf_attr, btf_log_level);
+	int btf_data, btf_load_attr;
+
+	pr_debug("btf_load: size %d\n", btf_raw_size);
+	btf_data = bpf_gen__add_data(gen, btf_raw_data, btf_raw_size);
+
+	attr.btf_size = btf_raw_size;
+	btf_load_attr = bpf_gen__add_data(gen, &attr, attr_size);
+
+	/* populate union bpf_attr with user provided log details */
+	bpf_gen__move_ctx2blob(gen, btf_load_attr + offsetof(union bpf_attr, btf_log_level), 4,
+			       offsetof(struct bpf_loader_ctx, log_level));
+	bpf_gen__move_ctx2blob(gen, btf_load_attr + offsetof(union bpf_attr, btf_log_size), 4,
+			       offsetof(struct bpf_loader_ctx, log_size));
+	bpf_gen__move_ctx2blob(gen, btf_load_attr + offsetof(union bpf_attr, btf_log_buf), 8,
+			       offsetof(struct bpf_loader_ctx, log_buf));
+	/* populate union bpf_attr with a pointer to the BTF data */
+	bpf_gen__emit_rel_store(gen, btf_load_attr + offsetof(union bpf_attr, btf), btf_data);
+	/* emit BTF_LOAD command */
+	bpf_gen__emit_sys_bpf(gen, BPF_BTF_LOAD, btf_load_attr, attr_size);
+	bpf_gen__debug_ret(gen, "btf_load size %d", btf_raw_size);
+	bpf_gen__emit_check_err(gen);
+	/* remember btf_fd in the stack, if successful */
+	bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(btf_fd)));
+}
+
+void bpf_gen__map_create(struct bpf_gen *gen, struct bpf_create_map_attr *map_attr, int map_idx)
+{
+	union bpf_attr attr = {};
+	int attr_size = offsetofend(union bpf_attr, btf_vmlinux_value_type_id);
+	bool close_inner_map_fd = false;
+	int map_create_attr;
+
+	attr.map_type = map_attr->map_type;
+	attr.key_size = map_attr->key_size;
+	attr.value_size = map_attr->value_size;
+	attr.map_flags = map_attr->map_flags;
+	memcpy(attr.map_name, map_attr->name,
+	       min((unsigned)strlen(map_attr->name), BPF_OBJ_NAME_LEN - 1));
+	attr.numa_node = map_attr->numa_node;
+	attr.map_ifindex = map_attr->map_ifindex;
+	attr.max_entries = map_attr->max_entries;
+	switch (attr.map_type) {
+	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
+	case BPF_MAP_TYPE_CGROUP_ARRAY:
+	case BPF_MAP_TYPE_STACK_TRACE:
+	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+	case BPF_MAP_TYPE_HASH_OF_MAPS:
+	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_HASH:
+	case BPF_MAP_TYPE_CPUMAP:
+	case BPF_MAP_TYPE_XSKMAP:
+	case BPF_MAP_TYPE_SOCKMAP:
+	case BPF_MAP_TYPE_SOCKHASH:
+	case BPF_MAP_TYPE_QUEUE:
+	case BPF_MAP_TYPE_STACK:
+	case BPF_MAP_TYPE_RINGBUF:
+		break;
+	default:
+		attr.btf_key_type_id = map_attr->btf_key_type_id;
+		attr.btf_value_type_id = map_attr->btf_value_type_id;
+	}
+
+	pr_debug("map_create: %s idx %d type %d value_type_id %d\n",
+		 attr.map_name, map_idx, map_attr->map_type, attr.btf_value_type_id);
+
+	map_create_attr = bpf_gen__add_data(gen, &attr, attr_size);
+	if (attr.btf_value_type_id)
+		/* populate union bpf_attr with btf_fd saved in the stack earlier */
+		bpf_gen__move_stack2blob(gen, map_create_attr + offsetof(union bpf_attr, btf_fd), 4,
+					 stack_off(btf_fd));
+	switch (attr.map_type) {
+	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
+	case BPF_MAP_TYPE_HASH_OF_MAPS:
+		bpf_gen__move_stack2blob(gen, map_create_attr + offsetof(union bpf_attr, inner_map_fd),
+					 4, stack_off(inner_map_fd));
+		close_inner_map_fd = true;
+		break;
+	default:;
+	}
+	/* emit MAP_CREATE command */
+	bpf_gen__emit_sys_bpf(gen, BPF_MAP_CREATE, map_create_attr, attr_size);
+	bpf_gen__debug_ret(gen, "map_create %s idx %d type %d value_size %d",
+			   attr.map_name, map_idx, map_attr->map_type, attr.value_size);
+	bpf_gen__emit_check_err(gen);
+	/* remember map_fd in the stack, if successful */
+	if (map_idx < 0) {
+		bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(inner_map_fd)));

A comment here indicating that map_idx < 0 is for inner map creation
would help in understanding the code.

+	} else {
+		if (map_idx != gen->nr_maps) {
+			gen->error = -EDOM; /* internal bug */
+			return;
+		}
+		bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(map_fd[map_idx])));
+		gen->nr_maps++;
+	}
+	if (close_inner_map_fd)
+		bpf_gen__emit_sys_close(gen, stack_off(inner_map_fd));
+}
+
+void bpf_gen__record_find_name(struct bpf_gen *gen, const char *attach_name,
+			       enum bpf_attach_type type)
+{
+	const char *prefix;
+	int kind, len, name;
+
+	btf_get_kernel_prefix_kind(type, &prefix, &kind);
+	pr_debug("find_btf_id '%s%s'\n", prefix, attach_name);
+	len = strlen(prefix);
+	if (len)
+		name = bpf_gen__add_data(gen, prefix, len);
+	name = bpf_gen__add_data(gen, attach_name, strlen(attach_name) + 1);
+	name -= len;
+
+	bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_1, 0));
+	bpf_gen__emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, name));
+	bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_3, kind));
+	bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_4, BPF_REG_10));
+	bpf_gen__emit(gen, BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, stack_off(last_attach_btf_obj_fd)));
+	bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_5, 0));
+	bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_btf_find_by_name_kind));
+	bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0));
+	bpf_gen__debug_ret(gen, "find_by_name_kind(%s%s,%d)", prefix, attach_name, kind);
+	bpf_gen__emit_check_err(gen);
+	/* remember btf_id */
+	bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_7, stack_off(last_btf_id)));
+}
+
+void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, int kind, int insn_idx)
+{
+	struct relo_desc *relo;
+
+	relo = libbpf_reallocarray(gen->relos, gen->relo_cnt + 1, sizeof(*relo));
+	if (!relo) {
+		gen->error = -ENOMEM;
+		return;
+	}
+	gen->relos = relo;
+	relo += gen->relo_cnt;
+	relo->name = name;
+	relo->kind = kind;
+	relo->insn_idx = insn_idx;
+	gen->relo_cnt++;
+}
+
+static void bpf_gen__emit_relo(struct bpf_gen *gen, struct relo_desc *relo, int insns)
+{
+	int name, insn;
+
+	pr_debug("relo: %s at %d\n", relo->name, relo->insn_idx);
+	name = bpf_gen__add_data(gen, relo->name, strlen(relo->name) + 1);
+
+	bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_1, 0));
+	bpf_gen__emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_2, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, name));
+	bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_3, relo->kind));
+	bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_4, BPF_REG_10));
+	bpf_gen__emit(gen, BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, stack_off(last_attach_btf_obj_fd)));
+	bpf_gen__emit(gen, BPF_MOV64_IMM(BPF_REG_5, 0));
+	bpf_gen__emit(gen, BPF_EMIT_CALL(BPF_FUNC_btf_find_by_name_kind));
+	bpf_gen__emit(gen, BPF_MOV64_REG(BPF_REG_7, BPF_REG_0));
+	bpf_gen__debug_ret(gen, "find_by_name_kind(%s,%d)", relo->name, relo->kind);
+	bpf_gen__emit_check_err(gen);
+	/* store btf_id into insn[insn_idx].imm */
+	insn = (int)(long)&((struct bpf_insn *)(long)insns)[relo->insn_idx].imm;

This is really fancy. Maybe something like
	insn = insns + sizeof(struct bpf_insn) * relo->insn_idx +
	       offsetof(struct bpf_insn, imm);
Does this sound better?

+	bpf_gen__emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, insn));
+	bpf_gen__emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_7, 0));
+}
+
[...]
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 083e441d9c5e..a61b4d401527 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -54,6 +54,7 @@
  #include "str_error.h"
  #include "libbpf_internal.h"
  #include "hashmap.h"
+#include "bpf_gen_internal.h"
  
  #ifndef BPF_FS_MAGIC
  #define BPF_FS_MAGIC		0xcafe4a11
@@ -435,6 +436,8 @@ struct bpf_object {
  	bool loaded;
  	bool has_subcalls;
  
+	struct bpf_gen *gen_trace;
+
  	/*
  	 * Information when doing elf related work. Only valid if fd
  	 * is valid.
@@ -2651,7 +2654,15 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj)
  		bpf_object__sanitize_btf(obj, kern_btf);
  	}
  
-	err = btf__load(kern_btf);
+	if (obj->gen_trace) {
+		__u32 raw_size = 0;
+		const void *raw_data = btf__get_raw_data(kern_btf, &raw_size);
+
+		bpf_gen__load_btf(obj->gen_trace, raw_data, raw_size);
+		btf__set_fd(kern_btf, 0);
+	} else {
+		err = btf__load(kern_btf);
+	}
  	if (sanitize) {
  		if (!err) {
  			/* move fd to libbpf's BTF */
@@ -4277,6 +4288,17 @@ static bool kernel_supports(enum kern_feature_id feat_id)
  	return READ_ONCE(feat->res) == FEAT_SUPPORTED;
  }
  
+static void mark_feat_supported(enum kern_feature_id last_feat)
+{
+	struct kern_feature_desc *feat;
+	int i;
+
+	for (i = 0; i <= last_feat; i++) {
+		feat = &feature_probes[i];
+		WRITE_ONCE(feat->res, FEAT_SUPPORTED);
+	}

This assumes all features earlier than FD_IDX are supported. I think
this is probably fine, although it may not work for some weird backport.
Did you see any issues if we don't explicitly mark the previous features
as supported?

+}
+
  static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd)
  {
  	struct bpf_map_info map_info = {};
@@ -4344,6 +4366,13 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
  	char *cp, errmsg[STRERR_BUFSIZE];
  	int err, zero = 0;
  
+	if (obj->gen_trace) {
+		bpf_gen__map_update_elem(obj->gen_trace, map - obj->maps,
+					 map->mmaped, map->def.value_size);
+		if (map_type == LIBBPF_MAP_RODATA || map_type == LIBBPF_MAP_KCONFIG)
+			bpf_gen__map_freeze(obj->gen_trace, map - obj->maps);
+		return 0;
+	}
  	err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0);
  	if (err) {
  		err = -errno;
@@ -4369,7 +4398,7 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map)
  
  static void bpf_map__destroy(struct bpf_map *map);
[...]
@@ -9383,7 +9512,13 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, int *btf_obj_fd,
  	}
  
  	/* kernel/module BTF ID */
-	err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id);
+	if (prog->obj->gen_trace) {
+		bpf_gen__record_find_name(prog->obj->gen_trace, attach_name, attach_type);
+		*btf_obj_fd = 0;
+		*btf_type_id = 1;

We have quite a bit of code like this and may add more to support more
features. I am wondering whether we could have some kind of callbacks
to make the code more streamlined. But I am not sure how easy that is.

+	} else {
+		err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id);
+	}
  	if (err) {
  		pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err);
  		return err;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index b9b29baf1df8..a5dffc0a3369 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -361,4 +361,5 @@ LIBBPF_0.4.0 {
  		bpf_linker__new;
  		bpf_map__inner_map;
  		bpf_object__set_kversion;
+		bpf_load;

Based on alphabetical ordering, this should move a few places earlier.

I will need to go through the patch again for better understanding ...