On Thu, Dec 05, 2019 at 04:12:26PM -0800, Daniel Xu wrote:
> Last-branch-record is an Intel CPU feature that can be configured to
> record certain branches that are taken during code execution. This data
> is particularly interesting for profile guided optimizations. perf has
> had LBR support for a while but the data collection can be a bit coarse
> grained.
>
> We (Facebook) have recently run a lot of experiments with feeding
> filtered LBR data to various PGO pipelines. We've seen really good
> results (+2.5% throughput with lower cpu util and lower latency) by
> feeding high request latency LBR branches to the compiler on a
> request-oriented service. We used bpf to read a special request context
> ID (which is how we associate branches with latency) from a fixed
> userspace address. Reading from the fixed address is why bpf support is
> useful.
>
> Aside from this particular use case, having LBR data available to bpf
> progs can be useful to get stack traces out of userspace applications
> that omit frame pointers.
>
> This patch adds support for LBR data to bpf perf progs.
>
> Some notes:
> * We use `__u64 entries[BPF_MAX_LBR_ENTRIES * 3]` instead of
>   `struct perf_branch_entry[BPF_MAX_LBR_ENTRIES]` because checkpatch.pl
>   warns about including a uapi header from another uapi header
>
> * We define BPF_MAX_LBR_ENTRIES as 32 (instead of using the value from
>   arch/x86/events/perf_events.h) because including arch specific headers
>   seems wrong and could introduce circular header includes.
>
> Signed-off-by: Daniel Xu <dxu@xxxxxxxxx>
> ---
>  include/uapi/linux/bpf_perf_event.h |  5 ++++
>  kernel/trace/bpf_trace.c            | 39 +++++++++++++++++++++++++++++
>  2 files changed, 44 insertions(+)
>
> diff --git a/include/uapi/linux/bpf_perf_event.h b/include/uapi/linux/bpf_perf_event.h
> index eb1b9d21250c..dc87e3d50390 100644
> --- a/include/uapi/linux/bpf_perf_event.h
> +++ b/include/uapi/linux/bpf_perf_event.h
> @@ -10,10 +10,15 @@
>  
>  #include <asm/bpf_perf_event.h>
>  
> +#define BPF_MAX_LBR_ENTRIES 32
> +
>  struct bpf_perf_event_data {
>  	bpf_user_pt_regs_t regs;
>  	__u64 sample_period;
>  	__u64 addr;
> +	__u64 nr_lbr;
> +	/* Cast to struct perf_branch_entry* before using */
> +	__u64 entries[BPF_MAX_LBR_ENTRIES * 3];
>  };
>  
>  #endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ffc91d4935ac..96ba7995b3d7 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1259,6 +1259,14 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
>  		if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
>  			return false;
>  		break;
> +	case bpf_ctx_range(struct bpf_perf_event_data, nr_lbr):
> +		bpf_ctx_record_field_size(info, size_u64);
> +		if (!bpf_ctx_narrow_access_ok(off, size, size_u64))
> +			return false;
> +		break;
> +	case bpf_ctx_range(struct bpf_perf_event_data, entries):
> +		/* No narrow loads */
> +		break;
>  	default:
>  		if (size != sizeof(long))
>  			return false;
> @@ -1273,6 +1281,7 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
>  				      struct bpf_prog *prog, u32 *target_size)
>  {
>  	struct bpf_insn *insn = insn_buf;
> +	int off;
>  
>  	switch (si->off) {
>  	case offsetof(struct bpf_perf_event_data, sample_period):
> @@ -1291,6 +1300,36 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
>  				      bpf_target_off(struct perf_sample_data, addr, 8,
>  						     target_size));
>  		break;
> +	case offsetof(struct bpf_perf_event_data, nr_lbr):
> +		/* Load struct perf_sample_data* */
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
> +						       data), si->dst_reg, si->src_reg,
> +				      offsetof(struct bpf_perf_event_data_kern, data));
> +		/* Load struct perf_branch_stack* */
> +		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct perf_sample_data, br_stack),
> +				      si->dst_reg, si->dst_reg,
> +				      offsetof(struct perf_sample_data, br_stack));

br_stack can be NULL. A != NULL check has to be emitted here as well.

Otherwise looks good. Please add a selftest and resubmit when bpf-next
reopens next week.
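For the NULL check, something along these lines would work (rough,
untested sketch: it reuses the two loads quoted above, assumes the
conversion finishes by loading br_stack->nr, and leaves the target_size
handling out). With it the prog simply sees nr_lbr == 0 when the event
has no branch stack:

	case offsetof(struct bpf_perf_event_data, nr_lbr):
		/* dst = ((struct bpf_perf_event_data_kern *)ctx)->data */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
						       data), si->dst_reg, si->src_reg,
				      offsetof(struct bpf_perf_event_data_kern, data));
		/* dst = data->br_stack */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct perf_sample_data, br_stack),
				      si->dst_reg, si->dst_reg,
				      offsetof(struct perf_sample_data, br_stack));
		/* if (dst == NULL) skip the next load, so dst stays 0 */
		*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
		/* dst = br_stack->nr */
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct perf_branch_stack, nr),
				      si->dst_reg, si->dst_reg,
				      offsetof(struct perf_branch_stack, nr));
		break;

The entries case needs the same check before dereferencing br_stack.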
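For the selftest, a minimal perf_event prog along these lines could
sanity-check the new fields from an event opened with
PERF_SAMPLE_BRANCH_STACK (again an untested sketch: it assumes the part
of the conversion not quoted above allows 8-byte loads of individual
entries[] elements, and relies on the uapi comment that each record is
packed as 3 * __u64: from, to, flags):

	// SPDX-License-Identifier: GPL-2.0
	#include <linux/bpf.h>
	#include <linux/bpf_perf_event.h>
	#include <bpf/bpf_helpers.h>

	/* globals for the user space part of the test to read back */
	__u64 nr_seen = 0;
	__u64 first_from = 0;
	__u64 first_to = 0;

	SEC("perf_event")
	int check_lbr(struct bpf_perf_event_data *ctx)
	{
		__u64 nr = ctx->nr_lbr;

		nr_seen = nr;
		if (nr > 0) {
			/* entries[] packs each branch as from, to, flags */
			first_from = ctx->entries[0];
			first_to = ctx->entries[1];
		}
		return 0;
	}

	char _license[] SEC("license") = "GPL";

The user space side would then check that nr_seen is non-zero on
hardware with LBR support and that first_from/first_to look like text
addresses.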