On Mon, 2019-04-01 at 22:40 +0200, Daniel Borkmann wrote: > On 03/29/2019 01:07 AM, Matt Mullins wrote: > > This is an opt-in interface that allows a tracepoint to provide a safe > > buffer that can be written from a BPF_PROG_TYPE_RAW_TRACEPOINT program. > > The size of the buffer must be a compile-time constant, and is checked > > before allowing a BPF program to attach to a tracepoint that uses this > > feature. > > > > The pointer to this buffer will be the first argument of tracepoints > > that opt in; the buffer is readable by both BPF_PROG_TYPE_RAW_TRACEPOINT > > and BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE programs that attach to such a > > tracepoint, but the buffer to which it points may only be written by the > > latter. > > > > bpf_probe: assert that writable tracepoint size is correct > > Maybe also add a kselftest into bpf test suite to i) demo it and ii) make > sure it's continuously been tested by bots running the suite? Will do. > > > Signed-off-by: Matt Mullins <mmullins@xxxxxx> > > --- > > include/linux/bpf.h | 2 ++ > > include/linux/bpf_types.h | 1 + > > include/linux/tracepoint-defs.h | 1 + > > include/trace/bpf_probe.h | 27 +++++++++++++++++++++++++-- > > include/uapi/linux/bpf.h | 1 + > > kernel/bpf/syscall.c | 8 ++++++-- > > kernel/bpf/verifier.c | 11 +++++++++++ > > kernel/trace/bpf_trace.c | 21 +++++++++++++++++++++ > > 8 files changed, 68 insertions(+), 4 deletions(-) > > > > diff --git a/include/linux/bpf.h b/include/linux/bpf.h > > index a2132e09dc1c..d3c71fd67476 100644 > > --- a/include/linux/bpf.h > > +++ b/include/linux/bpf.h > > @@ -263,6 +263,7 @@ enum bpf_reg_type { > > PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ > > PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ > > PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ > > + PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ > > }; > > > > /* The information passed from prog-specific *_is_valid_access > > @@ -352,6 +353,7 @@ 
struct bpf_prog_aux { > > u32 used_map_cnt; > > u32 max_ctx_offset; > > u32 max_pkt_offset; > > + u32 max_tp_access; > > u32 stack_depth; > > u32 id; > > u32 func_cnt; /* used by non-func prog as the number of func progs */ > > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h > > index 08bf2f1fe553..c766108608cb 100644 > > --- a/include/linux/bpf_types.h > > +++ b/include/linux/bpf_types.h > > @@ -25,6 +25,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) > > BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) > > BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) > > BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) > > +BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) > > #endif > > #ifdef CONFIG_CGROUP_BPF > > BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev) > > diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h > > index 49ba9cde7e4b..b29950a19205 100644 > > --- a/include/linux/tracepoint-defs.h > > +++ b/include/linux/tracepoint-defs.h > > @@ -45,6 +45,7 @@ struct bpf_raw_event_map { > > struct tracepoint *tp; > > void *bpf_func; > > u32 num_args; > > + u32 writable_size; > > } __aligned(32); > > > > #endif > > diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h > > index 505dae0bed80..d6e556c0a085 100644 > > --- a/include/trace/bpf_probe.h > > +++ b/include/trace/bpf_probe.h > > @@ -69,8 +69,7 @@ __bpf_trace_##call(void *__data, proto) \ > > * to make sure that if the tracepoint handling changes, the > > * bpf probe will fail to compile unless it too is updated. 
> > */ > > -#undef DEFINE_EVENT > > -#define DEFINE_EVENT(template, call, proto, args) \ > > +#define __DEFINE_EVENT(template, call, proto, args, size) \ > > static inline void bpf_test_probe_##call(void) \ > > { \ > > check_trace_callback_type_##call(__bpf_trace_##template); \ > > @@ -81,12 +80,36 @@ __bpf_trace_tp_map_##call = { \ > > .tp = &__tracepoint_##call, \ > > .bpf_func = (void *)__bpf_trace_##template, \ > > .num_args = COUNT_ARGS(args), \ > > + .writable_size = size, \ > > }; > > > > +#define FIRST(x, ...) x > > + > > +#undef DEFINE_EVENT_WRITABLE > > +#define DEFINE_EVENT_WRITABLE(template, call, proto, args, size) \ > > +static inline void bpf_test_buffer_##call(void) \ > > +{ \ > > + /* BUILD_BUG_ON() is ignored if the code is completely eliminated, but \ > > + * BUILD_BUG_ON_ZERO() uses a different mechanism that is not \ > > + * dead-code-eliminated. \ > > + */ \ > > + FIRST(proto); \ > > + (void)BUILD_BUG_ON_ZERO(size != sizeof(*FIRST(args))); \ > > +} \ > > +__DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), size) > > + > > +#undef DEFINE_EVENT > > +#define DEFINE_EVENT(template, call, proto, args) \ > > + __DEFINE_EVENT(template, call, PARAMS(proto), PARAMS(args), 0) > > > > #undef DEFINE_EVENT_PRINT > > #define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ > > DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) > > > > #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) > > + > > +#undef DEFINE_EVENT_WRITABLE > > +#undef __DEFINE_EVENT > > +#undef FIRST > > + > > #endif /* CONFIG_BPF_EVENTS */ > > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h > > index 3c38ac9a92a7..c5335d53ce82 100644 > > --- a/include/uapi/linux/bpf.h > > +++ b/include/uapi/linux/bpf.h > > @@ -166,6 +166,7 @@ enum bpf_prog_type { > > BPF_PROG_TYPE_LIRC_MODE2, > > BPF_PROG_TYPE_SK_REUSEPORT, > > BPF_PROG_TYPE_FLOW_DISSECTOR, > > + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, > > }; > > > > enum bpf_attach_type { > > diff --git 
a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c > > index 62f6bced3a3c..27e2f22879a4 100644 > > --- a/kernel/bpf/syscall.c > > +++ b/kernel/bpf/syscall.c > > @@ -1720,12 +1720,16 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) > > } > > raw_tp->btp = btp; > > > > - prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, > > - BPF_PROG_TYPE_RAW_TRACEPOINT); > > + prog = bpf_prog_get(attr->raw_tracepoint.prog_fd); > > if (IS_ERR(prog)) { > > err = PTR_ERR(prog); > > goto out_free_tp; > > } > > + if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT && > > + prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) { > > I don't think we'd gain a lot by making this an extra prog type which can do the > same as BPF_PROG_TYPE_RAW_TRACEPOINT modulo optional writing. Why not integrating > this directly into BPF_PROG_TYPE_RAW_TRACEPOINT then? The actual opt-in comes from > the DEFINE_EVENT_WRITABLE(), not from the prog type. I did that to separate the hook into raw_tp_writable_prog_is_valid_access, which (compared to raw_tp_prog_is_valid_access): 1) permits writes, and 2) encodes the assumption that the context begins with the pointer to that writable buffer I'm not sure those are appropriate for all users of BPF_PROG_TYPE_RAW_TRACEPOINT, but I can't immediately point out any harm in doing so -- some dereferences of ctx that have historically returned a SCALAR_VALUE would end up tagged as a PTR_TO_TP_BUFFER, but they still won't be able to access through that pointer unless they're attached to the right tracepoint. I'll try to unify the two and see what I get. 
> > > + err = -EINVAL; > > + goto out_put_prog; > > + } > > > > err = bpf_probe_register(raw_tp->btp, prog); > > if (err) > > diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c > > index ce166a002d16..b6b4a2ca9f0c 100644 > > --- a/kernel/bpf/verifier.c > > +++ b/kernel/bpf/verifier.c > > @@ -2100,6 +2100,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn > > err = check_sock_access(env, insn_idx, regno, off, size, t); > > if (!err && value_regno >= 0) > > mark_reg_unknown(env, regs, value_regno); > > + } else if (reg->type == PTR_TO_TP_BUFFER) { > > + if (off < 0) { > > + verbose(env, > > + "R%d invalid tracepoint buffer access: off=%d, size=%d", > > + value_regno, off, size); > > + return -EACCES; > > + } > > + if (off + size > env->prog->aux->max_tp_access) > > + env->prog->aux->max_tp_access = off + size; > > + if (t == BPF_READ && value_regno >= 0) > > + mark_reg_unknown(env, regs, value_regno); > > This should also disallow variable access into the reg, I presume (see check_ctx_reg())? > Or is there a clear rationale for having it enabled? Nope, that was an oversight from an (incorrect) assumption that arithmetic would be disallowed on a PTR_TO_TP_BUFFER. I'll fix this in v2. 
> > > } else { > > verbose(env, "R%d invalid mem access '%s'\n", regno, > > reg_type_str[reg->type]); > > diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c > > index d64c00afceb5..a2dd79dc6871 100644 > > --- a/kernel/trace/bpf_trace.c > > +++ b/kernel/trace/bpf_trace.c > > @@ -909,6 +909,24 @@ const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { > > const struct bpf_prog_ops raw_tracepoint_prog_ops = { > > }; > > > > +static bool raw_tp_writable_prog_is_valid_access(int off, int size, > > + enum bpf_access_type type, > > + const struct bpf_prog *prog, > > + struct bpf_insn_access_aux *info) > > +{ > > + if (off == 0 && size == sizeof(u64)) > > + info->reg_type = PTR_TO_TP_BUFFER; > > + return raw_tp_prog_is_valid_access(off, size, type, prog, info); > > +} > > + > > +const struct bpf_verifier_ops raw_tracepoint_writable_verifier_ops = { > > + .get_func_proto = raw_tp_prog_func_proto, > > + .is_valid_access = raw_tp_writable_prog_is_valid_access, > > +}; > > + > > +const struct bpf_prog_ops raw_tracepoint_writable_prog_ops = { > > +}; > > + > > static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, > > const struct bpf_prog *prog, > > struct bpf_insn_access_aux *info) > > @@ -1198,6 +1216,9 @@ static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog * > > if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) > > return -EINVAL; > > > > + if (prog->aux->max_tp_access > btp->writable_size) > > + return -EINVAL; > > + > > return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); > > } > > > > > >