This helper can only be used by BPF programs attached to a perf_event. If the
perf_event has PEBS entries, calling get_perf_callchain() from a BPF program
may cause unwinder errors. bpf_get_callchain_stackid() serves as an
alternative to bpf_get_stackid() for these programs.

Signed-off-by: Song Liu <songliubraving@xxxxxx>
---
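Not part of the patch -- a minimal usage sketch, for illustration only. It
assumes a vmlinux.h generated from a kernel that carries this change (so that
struct perf_callchain_entry and BPF_FUNC_get_callchain_stackid are visible to
the program), declares the new helper by hand rather than relying on a
regenerated bpf_helper_defs.h, and deliberately leaves open how the perf_event
program obtains the callchain pointer, since this patch does not add that
mechanism. The record_callchain() function, the map name and the map sizes
are made up for the example:

  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  /* Declare the new helper from the UAPI enum value added by this patch. */
  static long (*bpf_get_callchain_stackid)(struct perf_callchain_entry *callchain,
                                           void *map, __u64 flags) =
          (void *) BPF_FUNC_get_callchain_stackid;

  struct {
          __uint(type, BPF_MAP_TYPE_STACK_TRACE);
          __uint(max_entries, 16384);
          __uint(key_size, sizeof(__u32));
          __uint(value_size, 127 * sizeof(__u64));  /* PERF_MAX_STACK_DEPTH frames */
  } stackmap SEC(".maps");

  /* Takes the callchain pointer as an argument; how a perf_event program
   * gets hold of it is outside the scope of this patch.
   */
  static __always_inline long record_callchain(struct perf_callchain_entry *callchain)
  {
          /* Skip no frames, kernel stack, deduplicate by hash only. */
          return bpf_get_callchain_stackid(callchain, &stackmap, BPF_F_FAST_STACK_CMP);
  }

  char LICENSE[] SEC("license") = "GPL";

The returned id can be used as a key into other maps, and user space can
resolve it back to raw instruction pointers with a plain bpf_map_lookup_elem()
on the stack trace map, exactly as with bpf_get_stackid().
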
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 43 +++++++++++++++++++++++
 kernel/bpf/stackmap.c          | 63 ++++++++++++++++++++++++++--------
 kernel/bpf/verifier.c          |  4 ++-
 kernel/trace/bpf_trace.c       |  2 ++
 scripts/bpf_helpers_doc.py     |  2 ++
 tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++
 7 files changed, 142 insertions(+), 16 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0cd7f6884c5cd..45cf12acb0e26 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1628,6 +1628,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto;
 extern const struct bpf_func_proto bpf_get_stackid_proto;
 extern const struct bpf_func_proto bpf_get_stack_proto;
 extern const struct bpf_func_proto bpf_get_task_stack_proto;
+extern const struct bpf_func_proto bpf_get_callchain_stackid_proto;
 extern const struct bpf_func_proto bpf_sock_map_update_proto;
 extern const struct bpf_func_proto bpf_sock_hash_update_proto;
 extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 548a749aebb3e..a808accfbd457 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3319,6 +3319,48 @@ union bpf_attr {
  *		A non-negative value equal to or less than *size* on success,
  *		or a negative error in case of failure.
  *
+ * long bpf_get_callchain_stackid(struct perf_callchain_entry *callchain, struct bpf_map *map, u64 flags)
+ *	Description
+ *		Walk a user or a kernel stack and return its id. To achieve
+ *		this, the helper needs *callchain*, which is a pointer to a
+ *		valid perf_callchain_entry, and a pointer to a *map* of type
+ *		**BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		a combination of the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_FAST_STACK_CMP**
+ *			Compare stacks by hash only.
+ *		**BPF_F_REUSE_STACKID**
+ *			If two different stacks hash into the same *stackid*,
+ *			discard the old one.
+ *
+ *		The stack id retrieved is a 32 bit long integer handle which
+ *		can be further combined with other data (including other stack
+ *		ids) and used as a key into maps. This can be useful for
+ *		generating a variety of graphs (such as flame graphs or off-cpu
+ *		graphs).
+ *
+ *		For walking a stack, this helper is an improvement over
+ *		**bpf_probe_read**\ (), which can be used with unrolled loops
+ *		but is not efficient and consumes a lot of eBPF instructions.
+ *		Instead, **bpf_get_callchain_stackid**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *		::
+ *
+ *			# sysctl kernel.perf_event_max_stack=<new value>
+ *	Return
+ *		The positive or null stack id on success, or a negative error
+ *		in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3463,6 +3505,7 @@ union bpf_attr {
 	FN(skc_to_tcp_request_sock),	\
 	FN(skc_to_udp6_sock),		\
 	FN(get_task_stack),		\
+	FN(get_callchain_stackid),	\
 	/* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index a6c361ed7937b..28acc610f7f94 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -386,11 +386,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr)
 #endif
 }

-BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
-	   u64, flags)
+static long __bpf_get_stackid(struct bpf_map *map, struct perf_callchain_entry *trace,
+			      u64 flags)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
-	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
 	u32 max_depth = map->value_size / stack_map_data_size(map);
 	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
@@ -398,21 +397,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
 	u32 hash, id, trace_nr, trace_len;
 	bool user = flags & BPF_F_USER_STACK;
-	bool kernel = !user;
 	u64 *ips;
 	bool hash_matches;

-	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
-			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
-		return -EINVAL;
-
-	trace = get_perf_callchain(regs, init_nr, kernel, user,
-				   sysctl_perf_event_max_stack, false, false);
-
-	if (unlikely(!trace))
-		/* couldn't fetch the stack trace */
-		return -EFAULT;
-
 	/* get_perf_callchain() guarantees that trace->nr >= init_nr
 	 * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth
 	 */
@@ -477,6 +464,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	return id;
 }

+BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
+	   u64, flags)
+{
+	u32 max_depth = map->value_size / stack_map_data_size(map);
+	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
+	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
+	bool user = flags & BPF_F_USER_STACK;
+	struct perf_callchain_entry *trace;
+	bool kernel = !user;
+
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+
+	trace = get_perf_callchain(regs, init_nr, kernel, user,
+				   sysctl_perf_event_max_stack, false, false);
+
+	if (unlikely(!trace))
+		/* couldn't fetch the stack trace */
+		return -EFAULT;
+
+	return __bpf_get_stackid(map, trace, flags);
+}
+
 const struct bpf_func_proto bpf_get_stackid_proto = {
 	.func		= bpf_get_stackid,
 	.gpl_only	= true,
@@ -486,6 +497,28 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
 	.arg3_type	= ARG_ANYTHING,
 };

+BPF_CALL_3(bpf_get_callchain_stackid, struct perf_callchain_entry *, callchain,
+	   struct bpf_map *, map, u64, flags)
+{
+	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
+			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
+		return -EINVAL;
+	if (!callchain)
+		return -EFAULT;
+	return __bpf_get_stackid(map, callchain, flags);
+}
+
+static int bpf_get_callchain_stackid_btf_ids[5];
+const struct bpf_func_proto bpf_get_callchain_stackid_proto = {
+	.func		= bpf_get_callchain_stackid,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.btf_id		= bpf_get_callchain_stackid_btf_ids,
+};
+
 static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task,
 			    void *buf, u32 size, u64 flags)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1e11b0f6fba31..07be75550ca93 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4094,7 +4094,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
-		if (func_id != BPF_FUNC_get_stackid)
+		if (func_id != BPF_FUNC_get_stackid &&
+		    func_id != BPF_FUNC_get_callchain_stackid)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
@@ -4187,6 +4188,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
 			goto error;
 		break;
 	case BPF_FUNC_get_stackid:
+	case BPF_FUNC_get_callchain_stackid:
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c014846c2723c..7a504f734a025 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1396,6 +1396,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_perf_prog_read_value_proto;
 	case BPF_FUNC_read_branch_records:
 		return &bpf_read_branch_records_proto;
+	case BPF_FUNC_get_callchain_stackid:
+		return &bpf_get_callchain_stackid_proto;
 	default:
 		return bpf_tracing_func_proto(func_id, prog);
 	}
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 6843376733df8..1b99e3618e492 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -427,6 +427,7 @@ class PrinterHelpers(Printer):
             'struct tcp_request_sock',
             'struct udp6_sock',
             'struct task_struct',
+            'struct perf_callchain_entry',

             'struct __sk_buff',
             'struct sk_msg_md',
@@ -470,6 +471,7 @@ class PrinterHelpers(Printer):
             'struct tcp_request_sock',
             'struct udp6_sock',
             'struct task_struct',
+            'struct perf_callchain_entry',
     }

     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 548a749aebb3e..a808accfbd457 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3319,6 +3319,48 @@ union bpf_attr {
  *		A non-negative value equal to or less than *size* on success,
  *		or a negative error in case of failure.
  *
+ * long bpf_get_callchain_stackid(struct perf_callchain_entry *callchain, struct bpf_map *map, u64 flags)
+ *	Description
+ *		Walk a user or a kernel stack and return its id. To achieve
+ *		this, the helper needs *callchain*, which is a pointer to a
+ *		valid perf_callchain_entry, and a pointer to a *map* of type
+ *		**BPF_MAP_TYPE_STACK_TRACE**.
+ *
+ *		The last argument, *flags*, holds the number of stack frames to
+ *		skip (from 0 to 255), masked with
+ *		**BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set
+ *		a combination of the following flags:
+ *
+ *		**BPF_F_USER_STACK**
+ *			Collect a user space stack instead of a kernel stack.
+ *		**BPF_F_FAST_STACK_CMP**
+ *			Compare stacks by hash only.
+ *		**BPF_F_REUSE_STACKID**
+ *			If two different stacks hash into the same *stackid*,
+ *			discard the old one.
+ *
+ *		The stack id retrieved is a 32 bit long integer handle which
+ *		can be further combined with other data (including other stack
+ *		ids) and used as a key into maps. This can be useful for
+ *		generating a variety of graphs (such as flame graphs or off-cpu
+ *		graphs).
+ *
+ *		For walking a stack, this helper is an improvement over
+ *		**bpf_probe_read**\ (), which can be used with unrolled loops
+ *		but is not efficient and consumes a lot of eBPF instructions.
+ *		Instead, **bpf_get_callchain_stackid**\ () can collect up to
+ *		**PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that
+ *		this limit can be controlled with the **sysctl** program, and
+ *		that it should be manually increased in order to profile long
+ *		user stacks (such as stacks for Java programs). To do so, use:
+ *
+ *		::
+ *
+ *			# sysctl kernel.perf_event_max_stack=<new value>
+ *	Return
+ *		The positive or null stack id on success, or a negative error
+ *		in case of failure.
+ *
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3463,6 +3505,7 @@ union bpf_attr {
 	FN(skc_to_tcp_request_sock),	\
 	FN(skc_to_udp6_sock),		\
 	FN(get_task_stack),		\
+	FN(get_callchain_stackid),	\
 	/* */

 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
-- 
2.24.1