From: Joanne Koong <joannelkoong@xxxxxxxxx> Currently, our only way of writing dynamically-sized data into a ring buffer is through bpf_ringbuf_output but this incurs an extra memcpy cost. bpf_ringbuf_reserve + bpf_ringbuf_commit avoids this extra memcpy, but it can only safely support reservation sizes that are statically known since the verifier cannot guarantee that the bpf program won’t access memory outside the reserved space. The bpf_dynptr abstraction allows for dynamically-sized ring buffer reservations without the extra memcpy. There are 3 new APIs: long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr); void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags); void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags); These closely follow the functionalities of the original ringbuf APIs. For example, all ringbuffer dynptrs that have been reserved must be either submitted or discarded before the program exits. Signed-off-by: Joanne Koong <joannelkoong@xxxxxxxxx> --- include/linux/bpf.h | 10 ++++- include/uapi/linux/bpf.h | 30 ++++++++++++++ kernel/bpf/helpers.c | 6 +++ kernel/bpf/ringbuf.c | 71 ++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 17 ++++++-- tools/include/uapi/linux/bpf.h | 30 ++++++++++++++ 6 files changed, 160 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index cded9753fb7f..2672360172c5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -352,7 +352,10 @@ enum bpf_type_flag { /* DYNPTR points to dynamically allocated memory. */ DYNPTR_TYPE_MALLOC = BIT(8 + BPF_BASE_TYPE_BITS), - __BPF_TYPE_LAST_FLAG = DYNPTR_TYPE_MALLOC, + /* DYNPTR points to a ringbuf record. */ + DYNPTR_TYPE_RINGBUF = BIT(9 + BPF_BASE_TYPE_BITS), + + __BPF_TYPE_LAST_FLAG = DYNPTR_TYPE_RINGBUF, }; /* Max number of base types. */ @@ -2255,6 +2258,9 @@ extern const struct bpf_func_proto bpf_ringbuf_reserve_proto; extern const struct bpf_func_proto bpf_ringbuf_submit_proto; extern const struct bpf_func_proto bpf_ringbuf_discard_proto; extern const struct bpf_func_proto bpf_ringbuf_query_proto; +extern const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto; +extern const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto; extern const struct bpf_func_proto bpf_skc_to_tcp6_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_sock_proto; extern const struct bpf_func_proto bpf_skc_to_tcp_timewait_sock_proto; @@ -2418,6 +2424,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_LOCAL, /* Memory allocated dynamically by the kernel for the dynptr */ BPF_DYNPTR_TYPE_MALLOC, + /* Underlying data is a ringbuf record */ + BPF_DYNPTR_TYPE_RINGBUF, }; /* The upper 4 bits of dynptr->size are reserved. Consequently, the diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c835e437cb28..778de0b052c1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -5202,6 +5202,33 @@ union bpf_attr { * Pointer to the underlying dynptr data, NULL if the ptr is * read-only, if the dynptr is invalid, or if the offset and length * is out of bounds. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5404,6 +5431,9 @@ union bpf_attr { FN(dynptr_read), \ FN(dynptr_write), \ FN(dynptr_data), \ + FN(ringbuf_reserve_dynptr), \ + FN(ringbuf_submit_dynptr), \ + FN(ringbuf_discard_dynptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index c1295fb5d9d4..7b1e467f0504 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1580,6 +1580,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ringbuf_discard_proto; case BPF_FUNC_ringbuf_query: return &bpf_ringbuf_query_proto; + case BPF_FUNC_ringbuf_reserve_dynptr: + return &bpf_ringbuf_reserve_dynptr_proto; + case BPF_FUNC_ringbuf_submit_dynptr: + return &bpf_ringbuf_submit_dynptr_proto; + case BPF_FUNC_ringbuf_discard_dynptr: + return &bpf_ringbuf_discard_dynptr_proto; case BPF_FUNC_for_each_map_elem: return &bpf_for_each_map_elem_proto; case BPF_FUNC_loop: diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index a723aa484ce4..cdbeeb4819ae 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -475,3 +475,74 @@ const struct bpf_func_proto bpf_ringbuf_query_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags, + struct bpf_dynptr_kern *, ptr) +{ + void *sample; + int err; + + err = bpf_dynptr_check_size(size); + if (err) { + bpf_dynptr_set_null(ptr); + return err; + } + + sample = (void *)____bpf_ringbuf_reserve(map, size, flags); + + if (!sample) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { + .func = bpf_ringbuf_reserve_dynptr, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, +}; + +BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + ____bpf_ringbuf_submit(ptr->data, flags); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = { + .func = bpf_ringbuf_submit_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_RELEASE, + .arg2_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) +{ + if (!ptr->data) + return 0; + + ____bpf_ringbuf_discard(ptr->data, flags); + + bpf_dynptr_set_null(ptr); + + return 0; +} + +const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { + .func = bpf_ringbuf_discard_dynptr, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_RELEASE, + .arg2_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7352ffb4f9a5..6336476eac7d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -679,7 +679,7 @@ static void mark_verifier_state_scratched(struct bpf_verifier_env *env) static int arg_to_dynptr_type(enum bpf_arg_type arg_type, enum bpf_dynptr_type *dynptr_type) { - int type = arg_type & (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_MALLOC); + int type = arg_type & (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_MALLOC | DYNPTR_TYPE_RINGBUF); switch (type) { case DYNPTR_TYPE_LOCAL: @@ -688,6 +688,9 @@ static int arg_to_dynptr_type(enum bpf_arg_type arg_type, enum bpf_dynptr_type * case DYNPTR_TYPE_MALLOC: *dynptr_type = BPF_DYNPTR_TYPE_MALLOC; break; + case DYNPTR_TYPE_RINGBUF: + *dynptr_type = BPF_DYNPTR_TYPE_RINGBUF; + break; default: /* Can't have more than one type set and can't have no * type set @@ -702,7 +705,7 @@ static bool dynptr_type_refcounted(struct bpf_func_state *state, int spi) { enum bpf_dynptr_type type = state->stack[spi].spilled_ptr.dynptr_type; - return type == BPF_DYNPTR_TYPE_MALLOC; + return type == BPF_DYNPTR_TYPE_MALLOC || type == BPF_DYNPTR_TYPE_RINGBUF; } static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_state *reg, @@ -5842,6 +5845,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, err_extra = "local "; else if (arg_type & DYNPTR_TYPE_MALLOC) err_extra = "malloc "; + else if (arg_type & DYNPTR_TYPE_RINGBUF) + err_extra = "ringbuf "; verbose(env, "Expected an initialized %sdynptr as arg #%d\n", err_extra, arg + 1); return -EINVAL; @@ -5966,7 +5971,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_RINGBUF: if (func_id != BPF_FUNC_ringbuf_output && func_id != BPF_FUNC_ringbuf_reserve && - func_id != BPF_FUNC_ringbuf_query) + func_id != BPF_FUNC_ringbuf_query && + func_id != BPF_FUNC_ringbuf_reserve_dynptr && + func_id != BPF_FUNC_ringbuf_submit_dynptr && + func_id != BPF_FUNC_ringbuf_discard_dynptr) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -6082,6 +6090,9 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_ringbuf_output: case BPF_FUNC_ringbuf_reserve: case BPF_FUNC_ringbuf_query: + case BPF_FUNC_ringbuf_reserve_dynptr: + case BPF_FUNC_ringbuf_submit_dynptr: + case BPF_FUNC_ringbuf_discard_dynptr: if (map->map_type != BPF_MAP_TYPE_RINGBUF) goto error; break; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c835e437cb28..778de0b052c1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5202,6 +5202,33 @@ union bpf_attr { * Pointer to the underlying dynptr data, NULL if the ptr is * read-only, if the dynptr is invalid, or if the offset and length * is out of bounds. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -5404,6 +5431,9 @@ union bpf_attr { FN(dynptr_read), \ FN(dynptr_write), \ FN(dynptr_data), \ + FN(ringbuf_reserve_dynptr), \ + FN(ringbuf_submit_dynptr), \ + FN(ringbuf_discard_dynptr), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- 2.30.2