On Fri, 25 Mar 2022, Andrii Nakryiko wrote: > Implement architecture-agnostic parts of USDT parsing logic. The code is > the documentation in this case, it's futile to try to succinctly > describe how USDT parsing is done in any sort of concreteness. But > still, USDTs are recorded in special ELF notes section (.note.stapsdt), > where each USDT call site is described separately. Along with USDT > provider and USDT name, each such note contains USDT argument > specification, which uses assembly-like syntax to describe how to fetch > value of USDT argument. USDT arg spec could be just a constant, or > a register, or a register dereference (most common cases in x86_64), but > it technically can be much more complicated cases, like offset relative > to global symbol and stuff like that. One of the later patches will > implement most common subset of this for x86 and x86-64 architectures, > which seems to handle a lot of real-world production application. > > USDT arg spec contains a compact encoding allowing usdt.bpf.h from > previous patch to handle the above 3 cases. Instead of recording which > register might be needed, we encode register's offset within struct > pt_regs to simplify BPF-side implementation. USDT argument can be of > different byte sizes (1, 2, 4, and 8) and signed or unsigned. To handle > this, libbpf pre-calculates necessary bit shifts to do proper casting > and sign-extension in a short sequences of left and right shifts. > > The rest is in the code with sometimes extensive comments and references > to external "documentation" for USDTs. > > Signed-off-by: Andrii Nakryiko <andrii@xxxxxxxxxx> Reviewed-by: Alan Maguire <alan.maguire@xxxxxxxxxx> nothing major below, might be no harm to use a common header for some definitions for usdt.bpf.h and usdt.c.. 
> --- > tools/lib/bpf/usdt.c | 581 ++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 580 insertions(+), 1 deletion(-) > > diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c > index 8481e300598e..86d5d8390eb1 100644 > --- a/tools/lib/bpf/usdt.c > +++ b/tools/lib/bpf/usdt.c > @@ -18,10 +18,56 @@ > > #define PERF_UPROBE_REF_CTR_OFFSET_SHIFT 32 > unused? > +#define USDT_BASE_SEC ".stapsdt.base" > +#define USDT_SEMA_SEC ".probes" unused? > +#define USDT_NOTE_SEC ".note.stapsdt" > +#define USDT_NOTE_TYPE 3 > +#define USDT_NOTE_NAME "stapsdt" > + > +/* should match exactly enum __bpf_usdt_arg_type from bpf_usdt.bpf.h */ > +enum usdt_arg_type { > + USDT_ARG_CONST, > + USDT_ARG_REG, > + USDT_ARG_REG_DEREF, > +}; > + > +/* should match exactly struct __bpf_usdt_arg_spec from bpf_usdt.bpf.h */ > +struct usdt_arg_spec { > + __u64 val_off; > + enum usdt_arg_type arg_type; > + short reg_off; > + bool arg_signed; > + char arg_bitshift; > +}; > + > +/* should match BPF_USDT_MAX_ARG_CNT in usdt.bpf.h */ > +#define USDT_MAX_ARG_CNT 12 > + > +/* should match struct __bpf_usdt_spec from usdt.bpf.h */ > +struct usdt_spec { > + struct usdt_arg_spec args[USDT_MAX_ARG_CNT]; > + __u64 usdt_cookie; > + short arg_cnt; > +}; > + Would it be worth having a usdt.h that both usdt.bpf.h and usdt.c could #include, containing the above definitions, avoiding need to sync? 
> +struct usdt_note { > + const char *provider; > + const char *name; > + /* USDT args specification string, e.g.: > + * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx" > + */ > + const char *args; > + long loc_addr; > + long base_addr; > + long sema_addr; > +}; > + > struct usdt_target { > long abs_ip; > long rel_ip; > long sema_off; > + struct usdt_spec spec; > + const char *spec_str; > }; > > struct usdt_manager { > @@ -127,11 +173,449 @@ static int sanity_check_usdt_elf(Elf *elf, const char *path) > return 0; > } > > +static int find_elf_sec_by_name(Elf *elf, const char *sec_name, GElf_Shdr *shdr, Elf_Scn **scn) > +{ > + Elf_Scn *sec = NULL; > + size_t shstrndx; > + > + if (elf_getshdrstrndx(elf, &shstrndx)) > + return -EINVAL; > + > + /* check if ELF is corrupted and avoid calling elf_strptr if yes */ > + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) > + return -EINVAL; > + > + while ((sec = elf_nextscn(elf, sec)) != NULL) { > + char *name; > + > + if (!gelf_getshdr(sec, shdr)) > + return -EINVAL; > + > + name = elf_strptr(elf, shstrndx, shdr->sh_name); > + if (name && strcmp(sec_name, name) == 0) { > + *scn = sec; > + return 0; > + } > + } > + > + return -ENOENT; > +} > + > +struct elf_seg { > + long start; > + long end; > + long offset; > + bool is_exec; > +}; > + > +static int cmp_elf_segs(const void *_a, const void *_b) > +{ > + const struct elf_seg *a = _a; > + const struct elf_seg *b = _b; > + > + return a->start < b->start ? 
-1 : 1; > +} > + > +static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, size_t *seg_cnt) > +{ > + GElf_Phdr phdr; > + size_t n; > + int i, err; > + struct elf_seg *seg; > + void *tmp; > + > + *seg_cnt = 0; > + > + if (elf_getphdrnum(elf, &n)) { > + err = -errno; > + return err; > + } > + > + for (i = 0; i < n; i++) { > + if (!gelf_getphdr(elf, i, &phdr)) { > + err = -errno; > + return err; > + } > + > + pr_debug("usdt: discovered PHDR #%d in '%s': vaddr 0x%lx memsz 0x%lx offset 0x%lx type 0x%lx flags 0x%lx\n", > + i, path, (long)phdr.p_vaddr, (long)phdr.p_memsz, (long)phdr.p_offset, > + (long)phdr.p_type, (long)phdr.p_flags); > + if (phdr.p_type != PT_LOAD) > + continue; > + > + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); > + if (!tmp) > + return -ENOMEM; > + > + *segs = tmp; > + seg = *segs + *seg_cnt; > + (*seg_cnt)++; > + > + seg->start = phdr.p_vaddr; > + seg->end = phdr.p_vaddr + phdr.p_memsz; > + seg->offset = phdr.p_offset; > + seg->is_exec = phdr.p_flags & PF_X; > + } > + > + if (*seg_cnt == 0) { > + pr_warn("usdt: failed to find PT_LOAD program headers in '%s'\n", path); > + return -ESRCH; > + } > + > + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); > + return 0; > +} > + > +static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) > +{ > + char path[PATH_MAX], line[PATH_MAX], mode[16]; > + size_t seg_start, seg_end, seg_off; > + struct elf_seg *seg; > + int tmp_pid, i, err; > + FILE *f; > + > + *seg_cnt = 0; > + > + /* Handle containerized binaries only accessible from > + * /proc/<pid>/root/<path>. They will be reported as just /<path> in > + * /proc/<pid>/maps. 
> + */ > + if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid) > + goto proceed; > + > + if (!realpath(lib_path, path)) { > + pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n", > + lib_path, -errno); > + strcpy(path, lib_path); > + } > + > +proceed: > + sprintf(line, "/proc/%d/maps", pid); > + f = fopen(line, "r"); > + if (!f) { > + err = -errno; > + pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n", > + line, lib_path, err); > + return err; > + } > + > + /* We need to handle lines with no path at the end: > + * > + * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613 /usr/lib64/libc-2.17.so > + * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0 > + * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598 /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so > + */ > + while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n", > + &seg_start, &seg_end, mode, &seg_off, line) == 5) { > + void *tmp; > + > + /* to handle no path case (see above) we need to capture line > + * without skipping any whitespaces. 
So we need to strip > + * leading whitespaces manually here > + */ > + i = 0; > + while (isblank(line[i])) > + i++; > + if (strcmp(line + i, path) != 0) > + continue; > + > + pr_debug("usdt: discovered segment for lib '%s': addrs %zx-%zx mode %s offset %zx\n", > + path, seg_start, seg_end, mode, seg_off); > + > + /* ignore non-executable sections for shared libs */ > + if (mode[2] != 'x') > + continue; > + > + tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs)); > + if (!tmp) { > + err = -ENOMEM; > + goto err_out; > + } > + > + *segs = tmp; > + seg = *segs + *seg_cnt; > + *seg_cnt += 1; > + > + seg->start = seg_start; > + seg->end = seg_end; > + seg->offset = seg_off; > + seg->is_exec = true; > + } > + > + if (*seg_cnt == 0) { > + pr_warn("usdt: failed to find '%s' (resolved to '%s') within PID %d memory mappings\n", > + lib_path, path, pid); > + err = -ESRCH; > + goto err_out; > + } > + > + qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs); > + err = 0; > +err_out: > + fclose(f); > + return err; > +} > + > +static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long addr, bool relative) > +{ > + struct elf_seg *seg; > + int i; > + > + if (relative) { > + /* for shared libraries, address is relative offset and thus > + * should be fall within logical offset-based range of > + * [offset_start, offset_end) > + */ > + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { > + if (seg->offset <= addr && addr < seg->offset + (seg->end - seg->start)) > + return seg; > + } > + } else { > + /* for binaries, address is absolute and thus should be within > + * absolute address range of [seg_start, seg_end) > + */ > + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { > + if (seg->start <= addr && addr < seg->end) > + return seg; > + } > + } > + > + return NULL; > +} > + > +static int parse_usdt_note(Elf *elf, const char *path, long base_addr, > + GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, > + struct usdt_note 
*usdt_note); > + > +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, long usdt_cookie); > + > static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid, > const char *usdt_provider, const char *usdt_name, long usdt_cookie, > struct usdt_target **out_targets, size_t *out_target_cnt) > { > - return -ENOTSUP; > + size_t off, name_off, desc_off, seg_cnt = 0, lib_seg_cnt = 0, target_cnt = 0; > + struct elf_seg *segs = NULL, *lib_segs = NULL; > + struct usdt_target *targets = NULL, *target; > + long base_addr = 0; > + Elf_Scn *notes_scn, *base_scn; > + GElf_Shdr base_shdr, notes_shdr; > + GElf_Ehdr ehdr; > + GElf_Nhdr nhdr; > + Elf_Data *data; > + int err; > + > + *out_targets = NULL; > + *out_target_cnt = 0; > + > + err = find_elf_sec_by_name(elf, USDT_NOTE_SEC, &notes_shdr, &notes_scn); > + if (err) since find_elf_sec_by_name() doesn't log anything, would be good to have a pr_warn("usdt: no " USDT_NOTE_SEC " section in '%s'", path); > + return err; > + > + if (notes_shdr.sh_type != SHT_NOTE) > + return -EINVAL; > + > + if (!gelf_getehdr(elf, &ehdr)) > + return -EINVAL; > + the above two are unlikely, but could perhaps benefit from an error message like below.. 
> + err = parse_elf_segs(elf, path, &segs, &seg_cnt); > + if (err) { > + pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err); > + goto err_out; > + } > + > + /* .stapsdt.base ELF section is optional, but is used for prelink > + * offset compensation (see a big comment further below) > + */ > + if (find_elf_sec_by_name(elf, USDT_BASE_SEC, &base_shdr, &base_scn) == 0) > + base_addr = base_shdr.sh_addr; > + > + data = elf_getdata(notes_scn, 0); > + off = 0; > + while ((off = gelf_getnote(data, off, &nhdr, &name_off, &desc_off)) > 0) { > + long usdt_abs_ip, usdt_rel_ip, usdt_sema_off = 0; > + struct usdt_note note; > + struct elf_seg *seg = NULL; > + void *tmp; > + > + err = parse_usdt_note(elf, path, base_addr, &nhdr, > + data->d_buf, name_off, desc_off, &note); > + if (err) > + goto err_out; > + > + if (strcmp(note.provider, usdt_provider) != 0 || strcmp(note.name, usdt_name) != 0) > + continue; > + > + /* We need to compensate "prelink effect". See [0] for details, > + * relevant parts quoted here: > + * > + * Each SDT probe also expands into a non-allocated ELF note. You can > + * find this by looking at SHT_NOTE sections and decoding the format; > + * see below for details. Because the note is non-allocated, it means > + * there is no runtime cost, and also preserved in both stripped files > + * and .debug files. > + * > + * However, this means that prelink won't adjust the note's contents > + * for address offsets. Instead, this is done via the .stapsdt.base > + * section. This is a special section that is added to the text. We > + * will only ever have one of these sections in a final link and it > + * will only ever be one byte long. Nothing about this section itself > + * matters, we just use it as a marker to detect prelink address > + * adjustments. > + * > + * Each probe note records the link-time address of the .stapsdt.base > + * section alongside the probe PC address. 
The decoder compares the > + * base address stored in the note with the .stapsdt.base section's > + * sh_addr. Initially these are the same, but the section header will > + * be adjusted by prelink. So the decoder applies the difference to > + * the probe PC address to get the correct prelinked PC address; the > + * same adjustment is applied to the semaphore address, if any. > + * > + * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation > + */ ouch. nice explanation! > + usdt_rel_ip = usdt_abs_ip = note.loc_addr; > + if (base_addr) { > + usdt_abs_ip += base_addr - note.base_addr; > + usdt_rel_ip += base_addr - note.base_addr; > + } > + > + if (ehdr.e_type == ET_EXEC) { should we use a bool is_shared_library here; might simplify debug messaging below... > + /* When attaching uprobes (which what USDTs basically > + * are) kernel expects a relative IP to be specified, > + * so if we are attaching to an executable ELF binary > + * (i.e., not a shared library), we need to calculate > + * proper relative IP based on ELF's load address > + */ > + seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip, false /* relative */); > + if (!seg) { > + err = -ESRCH; > + pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", > + usdt_provider, usdt_name, path, usdt_abs_ip); > + goto err_out; > + } > + if (!seg->is_exec) { > + err = -ESRCH; > + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", > + path, seg->start, seg->end, usdt_provider, usdt_name, > + usdt_abs_ip); > + goto err_out; > + } > + > + usdt_rel_ip = usdt_abs_ip - (seg->start - seg->offset); > + } else if (!man->has_bpf_cookie) { /* ehdr.e_type == ET_DYN */ > + /* If we don't have BPF cookie support but need to > + * attach to a shared library, we'll need to know and > + * record absolute addresses of attach points due to > + * the need to lookup USDT spec by absolute IP of > + * triggered uprobe. 
Doing this resolution is only > + * possible when we have a specific PID of the process > + * that's using specified shared library. BPF cookie > + * removes the absolute address limitation as we don't > + * need to do this lookup (we just use BPF cookie as > + * an index of USDT spec), so for newer kernels with > + * BPF cookie support libbpf supports USDT attachment > + * to shared libraries with no PID filter. > + */ > + if (pid < 0) { > + pr_warn("usdt: attaching to shared libaries without specific PID is not supported on current kernel\n"); > + err = -ENOTSUP; > + goto err_out; > + } > + > + /* lib_segs are lazily initialized only if necessary */ > + if (lib_seg_cnt == 0) { > + err = parse_lib_segs(pid, path, &lib_segs, &lib_seg_cnt); > + if (err) { > + pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", > + pid, path, err); > + goto err_out; > + } > + } > + > + seg = find_elf_seg(lib_segs, lib_seg_cnt, usdt_rel_ip, true /* relative */); > + if (!seg) { > + err = -ESRCH; > + pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", > + usdt_provider, usdt_name, path, usdt_rel_ip); > + goto err_out; > + } > + > + usdt_abs_ip = seg->start + (usdt_rel_ip - seg->offset); > + } > + > + pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", > + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", path, > + note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, > + seg ? seg->start : 0, seg ? seg->end : 0, seg ? 
seg->offset : 0); > + > + /* Adjust semaphore address to be a relative offset */ > + if (note.sema_addr) { > + if (!man->has_sema_refcnt) { > + pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", > + usdt_provider, usdt_name, path); > + err = -ENOTSUP; > + goto err_out; > + } > + > + seg = find_elf_seg(segs, seg_cnt, note.sema_addr, false /* relative */); > + if (!seg) { > + err = -ESRCH; > + pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", > + usdt_provider, usdt_name, path, note.sema_addr); > + goto err_out; > + } > + if (seg->is_exec) { > + err = -ESRCH; > + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx] for semaphore of '%s:%s' at 0x%lx is executable\n", > + path, seg->start, seg->end, usdt_provider, usdt_name, > + note.sema_addr); > + goto err_out; > + } > + could have a bool "exec" arg to find_elf_seg() which allows/disallows the segment to be executable I guess. Alan