This commit modifies the tiny proof-of-concept DTrace utility to use the writable-buffer support in BPF along with the new helpers for buffer reservation and commit. The dtrace_finalize_context() helper is updated and is now marked with ctx_update because it sets the buffer pointer to NULL (and size 0). Signed-off-by: Kris Van Hees <kris.van.hees@xxxxxxxxxx> Reviewed-by: Nick Alcock <nick.alcock@xxxxxxxxxx> --- include/uapi/linux/dtrace.h | 4 + kernel/trace/dtrace/bpf.c | 150 ++++++++++++++++++++++++++++++++++++ tools/dtrace/dt_buffer.c | 54 +++++-------- tools/dtrace/probe1_bpf.c | 47 ++++++----- 4 files changed, 198 insertions(+), 57 deletions(-) diff --git a/include/uapi/linux/dtrace.h b/include/uapi/linux/dtrace.h index bbe2562c11f2..3fcc075a429f 100644 --- a/include/uapi/linux/dtrace.h +++ b/include/uapi/linux/dtrace.h @@ -33,6 +33,10 @@ struct dtrace_bpf_context { u32 gid; /* from_kgid(&init_user_ns, current_real_cred()->gid */ u32 euid; /* from_kuid(&init_user_ns, current_real_cred()->euid */ u32 egid; /* from_kgid(&init_user_ns, current_real_cred()->egid */ + + /* General output buffer */ + __bpf_md_ptr(u8 *, buf); + __bpf_md_ptr(u8 *, buf_end); }; /* diff --git a/kernel/trace/dtrace/bpf.c b/kernel/trace/dtrace/bpf.c index 95f4103d749e..93bd2f0319cc 100644 --- a/kernel/trace/dtrace/bpf.c +++ b/kernel/trace/dtrace/bpf.c @@ -7,6 +7,7 @@ #include <linux/filter.h> #include <linux/ptrace.h> #include <linux/sched.h> +#include <linux/perf_event.h> /* * Actual kernel definition of the DTrace BPF context. @@ -16,6 +17,9 @@ struct dtrace_bpf_ctx { u32 ecb_id; u32 probe_id; struct task_struct *task; + struct perf_output_handle handle; + u64 buf_len; + u8 *buf; }; /* @@ -55,6 +59,8 @@ BPF_CALL_2(dtrace_finalize_context, struct dtrace_bpf_ctx *, ctx, ctx->ecb_id = ecb->id; ctx->probe_id = ecb->probe_id; + ctx->buf_len = 0; + ctx->buf = NULL; return 0; } @@ -62,17 +68,119 @@ BPF_CALL_2(dtrace_finalize_context, struct dtrace_bpf_ctx *, ctx, static const struct bpf_func_proto dtrace_finalize_context_proto = { .func = dtrace_finalize_context, .gpl_only = false, + .ctx_update = true, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, /* ctx */ .arg2_type = ARG_CONST_MAP_PTR, /* map */ }; +BPF_CALL_4(dtrace_buffer_reserve, struct dtrace_bpf_ctx *, ctx, + int, id, struct bpf_map *, map, int, size) +{ + struct bpf_array *arr = container_of(map, struct bpf_array, map); + int cpu = smp_processor_id(); + struct bpf_event_entry *ee; + struct perf_event *ev; + int err; + + /* + * Make sure the writable-buffer id is valid. We use the default which + * is the offset of the start-of-buffer pointer in the public context. + */ + if (id != offsetof(struct dtrace_bpf_context, buf)) + return -EINVAL; + + /* + * Verify whether we have an uncommitted reserve. If so, we deny this + * request. + */ + if (ctx->handle.rb) + return -EBUSY; + + /* + * Perform sanity checks. 
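+	 * These mirror the checks bpf_perf_event_output() performs before
+	 * writing to the perf ring buffer: the current CPU must be within
+	 * the event array's range, a perf event must be installed for this
+	 * CPU, and it must be a software PERF_COUNT_SW_BPF_OUTPUT event
+	 * currently scheduled on this CPU.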
+ */ + if (cpu >= arr->map.max_entries) + return -E2BIG; + ee = READ_ONCE(arr->ptrs[cpu]); + if (!ee) + return -ENOENT; + ev = ee->event; + if (unlikely(ev->attr.type != PERF_TYPE_SOFTWARE || + ev->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) + return -EINVAL; + if (unlikely(ev->oncpu != cpu)) + return -EOPNOTSUPP; + + size = round_up(size, sizeof(u64)); + + err = perf_output_begin_forward_in_page(&ctx->handle, ev, size); + if (err < 0) + return err; + + ctx->buf_len = size; + ctx->buf = ctx->handle.addr; + + return 0; +} + +static const struct bpf_func_proto dtrace_buffer_reserve_proto = { + .func = dtrace_buffer_reserve, + .gpl_only = false, + .ctx_update = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, /* ctx */ + .arg2_type = ARG_ANYTHING, /* id */ + .arg3_type = ARG_CONST_MAP_PTR, /* map */ + .arg4_type = ARG_ANYTHING, /* size */ +}; + +BPF_CALL_3(dtrace_buffer_commit, struct dtrace_bpf_ctx *, ctx, + int, id, struct bpf_map *, map) +{ + /* + * Make sure the writable-buffer id is valid. We use the default which + * is the offset of the start-of-buffer pointer in the public context. + */ + if (id != offsetof(struct dtrace_bpf_context, buf)) + return -EINVAL; + + /* + * Verify that we have an uncommitted reserve. If not, there is really + * nothing to be done here. + */ + if (!ctx->handle.rb) + return 0; + + perf_output_end(&ctx->handle); + + ctx->handle.rb = NULL; + ctx->buf_len = 0; + ctx->buf = NULL; + + return 0; +} + +static const struct bpf_func_proto dtrace_buffer_commit_proto = { + .func = dtrace_buffer_commit, + .gpl_only = false, + .ctx_update = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, /* ctx */ + .arg2_type = ARG_ANYTHING, /* id */ + .arg3_type = ARG_CONST_MAP_PTR, /* map */ +}; + static const struct bpf_func_proto * dtrace_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_finalize_context: return &dtrace_finalize_context_proto; + case BPF_FUNC_buffer_reserve: + return &dtrace_buffer_reserve_proto; + case BPF_FUNC_buffer_commit: + return &dtrace_buffer_commit_proto; case BPF_FUNC_perf_event_output: return bpf_get_perf_event_output_proto(); case BPF_FUNC_trace_printk: @@ -131,6 +239,22 @@ static bool dtrace_is_valid_access(int off, int size, enum bpf_access_type type, if (bpf_ctx_narrow_access_ok(off, size, sizeof(u32))) return true; break; + case bpf_ctx_range(struct dtrace_bpf_context, buf): + info->reg_type = PTR_TO_BUFFER; + info->buf_id = offsetof(struct dtrace_bpf_context, buf); + + bpf_ctx_record_field_size(info, sizeof(u64)); + if (bpf_ctx_narrow_access_ok(off, size, sizeof(u64))) + return true; + break; + case bpf_ctx_range(struct dtrace_bpf_context, buf_end): + info->reg_type = PTR_TO_BUFFER_END; + info->buf_id = offsetof(struct dtrace_bpf_context, buf); + + bpf_ctx_record_field_size(info, sizeof(u64)); + if (bpf_ctx_narrow_access_ok(off, size, sizeof(u64))) + return true; + break; default: if (size == sizeof(unsigned long)) return true; @@ -152,6 +276,10 @@ static bool dtrace_is_valid_access(int off, int size, enum bpf_access_type type, * si->dst_reg = ((type *)si->src_reg)->member * target_size = sizeof(((type *)si->src_reg)->member) * + * BPF_LDX_CTX_FIELD_DST(type, member, dst, si, target_size) + * dst = ((type *)si->src_reg)->member + * target_size = sizeof(((type *)si->src_reg)->member) + * * BPF_LDX_LNK_FIELD(type, member, si, target_size) * si->dst_reg = ((type *)si->dst_reg)->member * target_size = sizeof(((type *)si->dst_reg)->member) @@ -172,6 +300,13 @@ static bool 
dtrace_is_valid_access(int off, int size, enum bpf_access_type type,
 					*(target_size) = FIELD_SIZEOF(type, member); \
 					offsetof(type, member); \
 				}))
+#define BPF_LDX_CTX_FIELD_DST(type, member, dst, si, target_size) \
+	BPF_LDX_MEM(BPF_FIELD_SIZEOF(type, member), \
+		    (dst), (si)->src_reg, \
+		    ({ \
+			*(target_size) = FIELD_SIZEOF(type, member); \
+			offsetof(type, member); \
+		    }))
 #define BPF_LDX_LNK_FIELD(type, member, si, target_size) \
 	BPF_LDX_MEM(BPF_FIELD_SIZEOF(type, member), \
 		    (si)->dst_reg, (si)->dst_reg, \
@@ -261,6 +396,18 @@ static u32 dtrace_convert_ctx_access(enum bpf_access_type type,
 		*insn++ = BPF_LDX_LNK_PTR(struct task_struct, cred, si);
 		*insn++ = BPF_LDX_LNK_FIELD(struct cred, egid, si, target_size);
 		break;
+	case offsetof(struct dtrace_bpf_context, buf):
+		*insn++ = BPF_LDX_CTX_FIELD(struct dtrace_bpf_ctx, buf, si,
+					    target_size);
+		break;
+	case offsetof(struct dtrace_bpf_context, buf_end):
+		/* buf_end = ctx->buf + ctx->buf_len */
+		*insn++ = BPF_LDX_CTX_FIELD(struct dtrace_bpf_ctx, buf, si,
+					    target_size);
+		*insn++ = BPF_LDX_CTX_FIELD_DST(struct dtrace_bpf_ctx, buf_len,
+						BPF_REG_AX, si, target_size);
+		*insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
+		break;
 	default:
 		*insn++ = BPF_LDX_CTX_PTR(struct dtrace_bpf_ctx, regs, si);
 		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
@@ -308,6 +455,9 @@ static void *dtrace_convert_ctx(enum bpf_prog_type stype, void *ctx)
 	gctx = this_cpu_ptr(&dtrace_ctx);
 	gctx->regs = (struct pt_regs *)ctx;
 	gctx->task = current;
+	gctx->handle.rb = NULL;
+	gctx->buf_len = 0;
+	gctx->buf = NULL;
 
 	return gctx;
 }
diff --git a/tools/dtrace/dt_buffer.c b/tools/dtrace/dt_buffer.c
index 65c107ca8ac4..28fac9036d69 100644
--- a/tools/dtrace/dt_buffer.c
+++ b/tools/dtrace/dt_buffer.c
@@ -282,33 +282,27 @@ static void write_rb_tail(volatile struct perf_event_mmap_page *rb_page,
  */
 static int output_event(u64 *buf)
 {
-	u8 *data = (u8 *)buf;
-	struct perf_event_header *hdr;
-	u32 size;
-	u64 probe_id, task;
-	u32 pid, ppid, cpu, euid, egid, tag;
+	u8 *data = (u8 *)buf;
+	u32 probe_id;
+	u32 flags;
+	u64 task;
+	u32 pid, ppid, cpu, euid, egid, tag;
 
-	hdr = (struct perf_event_header *)data;
-	data += sizeof(struct perf_event_header);
+	probe_id = *(u32 *)&(data[0]);
 
-	if (hdr->type != PERF_RECORD_SAMPLE)
-		return 1;
+	if (probe_id == PERF_RECORD_LOST) {
+		u16 size;
+		u64 lost;
 
-	size = *(u32 *)data;
-	data += sizeof(u32);
+		size = *(u16 *)&(data[6]);
+		lost = *(u64 *)&(data[16]);
 
-	/*
-	 * The sample should only take up 48 bytes, but as a result of how the
-	 * BPF program stores the data (filling in a struct that resides on the
-	 * stack, and sending that off using bpf_perf_event_output()), there is
-	 * some internal padding
-	 */
-	if (size != 52) {
-		printf("Sample size is wrong (%d vs expected %d)\n", size, 52);
-		goto out;
+		printf("[%ld probes dropped]\n", lost);
+
+		return size;
 	}
 
-	probe_id = *(u64 *)&(data[0]);
+	flags = *(u32 *)&(data[4]);
 	pid = *(u32 *)&(data[8]);
 	ppid = *(u32 *)&(data[12]);
 	cpu = *(u32 *)&(data[16]);
@@ -318,19 +312,14 @@ static int output_event(u64 *buf)
 	tag = *(u32 *)&(data[40]);
 
 	if (probe_id != 123)
-		printf("Corrupted data (probe_id = %ld)\n", probe_id);
+		printf("Corrupted data (probe_id = %d)\n", probe_id);
 	if (tag != 0xdace)
 		printf("Corrupted data (tag = %x)\n", tag);
 
-	printf("CPU-%d: EPID %ld PID %d PPID %d EUID %d EGID %d TASK %08lx\n",
-	       cpu, probe_id, pid, ppid, euid, egid, task);
+	printf("CPU-%d: [%d/%d] PID %d PPID %d EUID %d EGID %d TASK %08lx\n",
+	       cpu, probe_id, flags, pid, ppid, euid, egid, 
task); -out: - /* - * We processed the perf_event_header, the size, and ;size; bytes of - * probe data. - */ - return sizeof(struct perf_event_header) + sizeof(u32) + size; + return 48; } /* @@ -351,10 +340,9 @@ static void process_data(struct dtrace_buffer *buf) /* * Ensure that the buffer contains enough data for at least one - * sample (header + sample size + sample data). + * sample. */ - if (head - tail < sizeof(struct perf_event_header) + - sizeof(u32) + 48) + if (head - tail < 48) break; if (*ptr) diff --git a/tools/dtrace/probe1_bpf.c b/tools/dtrace/probe1_bpf.c index 5b34edb61412..a3196261e66e 100644 --- a/tools/dtrace/probe1_bpf.c +++ b/tools/dtrace/probe1_bpf.c @@ -37,25 +37,16 @@ struct bpf_map_def SEC("maps") buffer_map = { .max_entries = 2, }; -struct sample { - u64 probe_id; - u32 pid; - u32 ppid; - u32 cpu; - u32 euid; - u32 egid; - u64 task; - u32 tag; -}; - #define DPROG(F) SEC("dtrace/"__stringify(F)) int bpf_func_##F +#define BUF_ID offsetof(struct dtrace_bpf_context, buf) /* we jump here when syscall number == __NR_write */ DPROG(__NR_write)(struct dtrace_bpf_context *ctx) { int cpu = bpf_get_smp_processor_id(); struct dtrace_ecb *ecb; - struct sample smpl; + u8 *buf, *buf_end; + int err; bpf_finalize_context(ctx, &probemap); @@ -63,17 +54,25 @@ DPROG(__NR_write)(struct dtrace_bpf_context *ctx) if (!ecb) return 0; - memset(&smpl, 0, sizeof(smpl)); - smpl.probe_id = ecb->probe_id; - smpl.pid = ctx->pid; - smpl.ppid = ctx->ppid; - smpl.cpu = ctx->cpu; - smpl.euid = ctx->euid; - smpl.egid = ctx->egid; - smpl.task = ctx->task; - smpl.tag = 0xdace; - - bpf_perf_event_output(ctx, &buffer_map, cpu, &smpl, sizeof(smpl)); + err = bpf_buffer_reserve(ctx, BUF_ID, &buffer_map, 48); + if (err < 0) + return -1; + buf = ctx->buf; + buf_end = ctx->buf_end; + if (buf + 48 > buf_end) + return -1; + + *(u32 *)(&buf[0]) = ecb->probe_id; + *(u32 *)(&buf[4]) = 0; + *(u32 *)(&buf[8]) = ctx->pid; + *(u32 *)(&buf[12]) = ctx->ppid; + *(u32 *)(&buf[16]) = ctx->cpu; + *(u32 *)(&buf[20]) = ctx->euid; + *(u32 *)(&buf[24]) = ctx->egid; + *(u64 *)(&buf[32]) = ctx->task; + *(u32 *)(&buf[40]) = 0xdace; + + bpf_buffer_commit(ctx, BUF_ID, &buffer_map); return 0; } @@ -84,7 +83,7 @@ int bpf_prog1(struct pt_regs *ctx) struct dtrace_ecb ecb; int cpu = bpf_get_smp_processor_id(); - ecb.id = 1; + ecb.id = 3; ecb.probe_id = 123; bpf_map_update_elem(&probemap, &cpu, &ecb, BPF_ANY); -- 2.20.1
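
For readers following the raw byte offsets: the fixed 48-byte record that
probe1_bpf.c writes through bpf_buffer_reserve()/bpf_buffer_commit(), and
that output_event() decodes in dt_buffer.c, corresponds to the layout
sketched below. The struct and its padding fields are illustrative only and
not part of the patch (both sides deliberately work with raw offsets); they
merely document the contract shared by the BPF producer and the userspace
consumer.

struct dtrace_poc_sample {	/* illustrative; not defined in the patch */
	u32	probe_id;	/* offset  0: 123 in this POC */
	u32	flags;		/* offset  4: always 0 for now */
	u32	pid;		/* offset  8 */
	u32	ppid;		/* offset 12 */
	u32	cpu;		/* offset 16 */
	u32	euid;		/* offset 20 */
	u32	egid;		/* offset 24 */
	u32	pad1;		/* offset 28: aligns task to 8 bytes */
	u64	task;		/* offset 32 */
	u32	tag;		/* offset 40: 0xdace sanity marker */
	u32	pad2;		/* offset 44: pads the record to 48 bytes */
};

Since dtrace_buffer_reserve() rounds the requested size up to a multiple of
sizeof(u64), the 48-byte request is used unchanged; that is why
process_data() can assume each sample occupies exactly 48 bytes, while a
PERF_RECORD_LOST record is skipped by the size output_event() reads from its
perf_event_header (the u16 at offset 6).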