Hi everyone, When examining BPF programs that use perf buffers, I have noticed that the `data_sz` parameter in the sample callback has a different value than the size passed to `bpf_perf_event_output`. Raw samples are padded in `perf_prepare_sample` so their size is a multiple of `sizeof(u64)` (see [0]). The sample includes the size header, a `u32`. The size stored in `raw->size` is then size of the sample, minus the 4 bytes for the size header. This size, however, includes both the data and the padding. What I find confusing is that this size including the padding is eventually passed as `data_sz` to the callback in the userspace, instead of the size of the data that was passed as an argument to `bpf_perf_event_output`. Is this intended behaviour? I have a use-case for getting only the size of the data in the userspace, could this be done? To demonstrate, I have prepared a minimal example by modifying BCC's filelife example. It uses a kprobe on vfs_unlink to print some sizes every time a file is unlinked. The sizes are: * the `sizeof(struct event)` measured in the userspace program, * the `sizeof(struct event)` measured in the BPF program, and * the `data_sz` parameter. The first two are 62, as expected, but `data_sz` is 68. The 62 bytes of the struct and the 4 bytes of the sample header make 66 bytes. This is rounded up to the first multiple of 8, which is 72. The 4 bytes for the size header are then subtracted, and 68 is written as the data size. Any input is much appreciated, Best regards, Borna Cafuk [0] https://github.com/torvalds/linux/blob/6a9dc5fd6170d0a41c8a14eb19e63d94bea5705a/kernel/events/core.c#L7035 example.h -------------------------------- #ifndef __EXAMPLE_H #define __EXAMPLE_H struct __attribute__((__packed__)) event { __u16 size; char filler[60]; }; #endif /* __EXAMPLE_H */ example.bpf.c -------------------------------- #include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> #include "example.h" struct { __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); __uint(key_size, sizeof(u32)); __uint(value_size, sizeof(u32)); } events SEC(".maps"); SEC("kprobe/vfs_unlink") int BPF_KPROBE(kprobe__vfs_unlink, struct inode *dir, struct dentry *dentry) { struct event event = {}; event.size = sizeof(struct event); bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &event, sizeof(struct event)); return 0; } char LICENSE[] SEC("license") = "GPL"; example.c -------------------------------- #include <stdio.h> #include <bpf/libbpf.h> #include <sys/resource.h> #include "example.h" #include "example.skel.h" #define PERF_BUFFER_PAGES 16 #define PERF_POLL_TIMEOUT_MS 100 void handle_event(void *ctx, int cpu, void *data, __u32 data_sz) { const struct event *e = data; printf("Userspace: %u | BPF: %zu | data_sz: %u \n", e->size, sizeof(struct event), data_sz); } void handle_lost_events(void *ctx, int cpu, __u64 lost_cnt) { fprintf(stderr, "lost %llu events on CPU #%d\n", lost_cnt, cpu); } int main(int argc, char **argv) { struct perf_buffer_opts pb_opts; struct perf_buffer *pb = NULL; struct example_bpf *obj; int err; struct rlimit rlim_new = { .rlim_cur = RLIM_INFINITY, .rlim_max = RLIM_INFINITY, }; err = setrlimit(RLIMIT_MEMLOCK, &rlim_new); if (err) { fprintf(stderr, "failed to increase rlimit: %d\n", err); return 1; } obj = example_bpf__open(); if (!obj) { fprintf(stderr, "failed to open and/or load BPF object\n"); return 1; } err = example_bpf__load(obj); if (err) { fprintf(stderr, "failed to load BPF object: %d\n", err); goto cleanup; } err = example_bpf__attach(obj); if (err) { fprintf(stderr, "failed to attach BPF programs\n"); goto cleanup; } pb_opts.sample_cb = handle_event; pb_opts.lost_cb = handle_lost_events; pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES, &pb_opts); err = libbpf_get_error(pb); if (err) { pb = NULL; fprintf(stderr, "failed to open perf buffer: %d\n", err); goto cleanup; } while ((err = perf_buffer__poll(pb, PERF_POLL_TIMEOUT_MS)) >= 0) ; fprintf(stderr, "error polling perf buffer: %d\n", err); cleanup: perf_buffer__free(pb); example_bpf__destroy(obj); return err != 0; }