On 9/10/21 8:36 AM, Juraj Vijtiuk wrote:
Hello,

while developing some cgroup socket programs, we noticed that some BPF_PROG_TYPE_CGROUP_SOCK programs return different values for the same bpf_sock context field, depending on how the field is accessed. One example of the issue is access to ctx->family in programs attached to BPF_CGROUP_INET6_POST_BIND and BPF_CGROUP_INET4_POST_BIND: a direct ctx->family access returns the correct value, while BPF_CORE_READ(ctx, family) returns random values.

The BPF C program and an example userspace C loader are attached below, together with an example trace_pipe output.

So far we have looked at the generated BPF bytecode with llvm-objdump and everything looked fine there; the main difference is, as expected, in the way the access is done. The BPF_CORE_READ macro expands into a bpf_probe_read_kernel() call whose source argument is wrapped in __builtin_preserve_access_index. bpf_probe_read_* helper calls are allowed for BPF_PROG_TYPE_CGROUP_SOCK programs, so that shouldn't be an issue. Next, we looked at the libbpf debug output, where everything looked ok too. The part of the output with the relocations is attached below.
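For reference, for a single-level access like this, BPF_CORE_READ(ctx, family) should boil down to roughly the following (a simplified sketch of the expansion, not the literal macro output):

	__u32 family2 = 0;

	/* the &ctx->family pointer is tagged for CO-RE relocation via
	 * __builtin_preserve_access_index and then read with the probe helper */
	bpf_probe_read_kernel(&family2, sizeof(family2),
			      (const void *)__builtin_preserve_access_index(&ctx->family));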
This is an incorrect usage of CO-RE. See below.
We have tested this with various kernel versions, including 5.10, 5.11 and 5.13 on x86_64, and 5.11 on 32-bit ARM. The issue appeared on all of those kernels and architectures. At this point we're not sure what to look at next, so any ideas on what might cause the issue, or suggestions on what to test next, would be greatly appreciated.

Regards,
Juraj Vijtiuk

example.bpf.c
----------------------------------------
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>

SEC("cgroup/post_bind4")
int cgroup_post_bind4_prog(struct bpf_sock *ctx)
{
	u32 family1 = 0;
	u32 family2 = 0;

	family1 = ctx->family;
	family2 = BPF_CORE_READ(ctx, family);

	bpf_printk("family1 = %u, family2 = %u\n", family1, family2);
We have the assembly code below:

   0:	b7 02 00 00 04 00 00 00	r2 = 4
   1:	bf 13 00 00 00 00 00 00	r3 = r1
   2:	0f 23 00 00 00 00 00 00	r3 += r2
   3:	61 16 04 00 00 00 00 00	r6 = *(u32 *)(r1 + 4)
   4:	bf a1 00 00 00 00 00 00	r1 = r10
   5:	07 01 00 00 e0 ff ff ff	r1 += -32
   6:	b4 02 00 00 04 00 00 00	w2 = 4
   7:	85 00 00 00 71 00 00 00	call 113
   8:	61 a4 e0 ff 00 00 00 00	r4 = *(u32 *)(r10 - 32)

It looks like the two accesses are the same: insn #3 reads context + 4, and insn #7 (the bpf_probe_read_kernel() call) also reads context + 4. But for insn #3, the verifier will rewrite the load to access the proper kernel field:

	case offsetof(struct bpf_sock, family):
		*insn++ = BPF_LDX_MEM(
			BPF_FIELD_SIZEOF(struct sock_common, skc_family),
			si->dst_reg, si->src_reg,
			bpf_target_off(struct sock_common, skc_family,
				       sizeof_field(struct sock_common,
						    skc_family),
				       target_size));
		break;

and no such rewrite happens for insn #7. That is why the two reads produce different results. CO-RE is meant for accessing kernel data structures, and the "ctx" is not really a kernel data structure (it is a UAPI interface) in most cases.
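So for bpf_sock context fields, the plain dereference is the correct way to read them. A minimal corrected version of the program could look like the sketch below (direct context access only, with CO-RE reads reserved for real kernel structures, e.g. in tracing programs):

	SEC("cgroup/post_bind4")
	int cgroup_post_bind4_prog(struct bpf_sock *ctx)
	{
		/* direct context access: the verifier rewrites this load into
		 * an access to the underlying kernel field (skc_family) */
		u32 family = ctx->family;

		bpf_printk("family = %u\n", family);
		return 0;
	}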
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

example.c
----------------------------------------
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/resource.h>
#include <argp.h>

#include <bpf/libbpf.h>
#include <bpf/bpf.h>

#include "example.skel.h"

void read_trace_pipe(void)
{
	int trace_fd;

	trace_fd = open("/sys/kernel/debug/tracing/trace_pipe", O_RDONLY, 0);
	if (trace_fd < 0)
		return;

	while (1) {
		static char buf[4096];
		ssize_t sz;

		sz = read(trace_fd, buf, sizeof(buf) - 1);
		if (sz > 0) {
			buf[sz] = 0;
			puts(buf);
		}
	}
}

int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
{
	return vfprintf(stderr, format, args);
}

int main(int argc, char **argv)
{
	struct example_bpf *obj;
	int err = 0;
	struct rlimit rlim = {
		.rlim_cur = 512UL << 20,
		.rlim_max = 512UL << 20,
	};

	err = setrlimit(RLIMIT_MEMLOCK, &rlim);
	if (err) {
		fprintf(stderr, "failed to change rlimit\n");
		return 1;
	}

	libbpf_set_print(libbpf_print_fn);

	obj = example_bpf__open();
	if (!obj) {
		fprintf(stderr, "failed to open and/or load BPF object\n");
		return 1;
	}

	err = example_bpf__load(obj);
	if (err) {
		fprintf(stderr, "failed to load BPF object %d\n", err);
		goto cleanup;
	}

	const char *cgroup_path = "/sys/fs/cgroup";
	int cgroup_fd = open(cgroup_path, O_DIRECTORY | O_RDONLY);

	struct bpf_program *prog = obj->progs.cgroup_post_bind4_prog;

	obj->links.cgroup_post_bind4_prog = bpf_program__attach_cgroup(prog, cgroup_fd);
	err = libbpf_get_error(obj->links.cgroup_post_bind4_prog);
	if (err) {
		fprintf(stderr, "failed to attach BPF program %d\n", err);
		goto cleanup;
	}

	read_trace_pipe();

cleanup:
	example_bpf__destroy(obj);
	return err != 0;
}

trace_pipe output
----------------------------------------
Chrome_IOThread-26477 [006] d..2 385580.114654: bpf_trace_printk: family1 = 2, family2 = 1747691712
<...>-144100 [004] d..2 385594.936690: bpf_trace_printk: family1 = 2, family2 = 0

libbpf relocation log
----------------------------------------
libbpf: loading kernel BTF '/sys/kernel/btf/vmlinux': 0
libbpf: sec 'cgroup/post_bind4': found 2 CO-RE relocations
libbpf: prog 'cgroup_post_bind4_prog': relo #0: kind <byte_off> (0), spec is [2] struct bpf_sock.family (0:1 @ offset 4)
libbpf: CO-RE relocating [2] struct bpf_sock: found target candidate [24518] struct bpf_sock
libbpf: prog 'cgroup_post_bind4_prog': relo #0: matching candidate #0 [24518] struct bpf_sock.family (0:1 @ offset 4)
libbpf: prog 'cgroup_post_bind4_prog': relo #0: patched insn #9 (ALU/ALU64) imm 4 -> 4
libbpf: prog 'cgroup_post_bind4_prog': relo #1: kind <byte_off> (0), spec is [2] struct bpf_sock.family (0:1 @ offset 4)
libbpf: prog 'cgroup_post_bind4_prog': relo #1: matching candidate #0 [24518] struct bpf_sock.family (0:1 @ offset 4)
libbpf: prog 'cgroup_post_bind4_prog': relo #1: patched insn #12 (LDX/ST/STX) off 4 -> 4