> On Mar 29, 2022, at 5:00 PM, Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx> wrote:
>
> On Tue, Mar 29, 2022 at 4:19 PM Song Liu <song@xxxxxxxxxx> wrote:
>>
>> TP_PROTO of sched_switch is updated with a new arg prev_state, which
>> causes runqslower load failure:
>>
>> libbpf: prog 'handle__sched_switch': BPF program load failed: Permission denied
>> libbpf: prog 'handle__sched_switch': -- BEGIN PROG LOAD LOG --
>> R1 type=ctx expected=fp
>> 0: R1=ctx(off=0,imm=0) R10=fp0
>> ; int handle__sched_switch(u64 *ctx)
>> 0: (bf) r7 = r1 ; R1=ctx(off=0,imm=0) R7_w=ctx(off=0,imm=0)
>> ; struct task_struct *next = (struct task_struct *)ctx[2];
>> 1: (79) r6 = *(u64 *)(r7 +16)
>> func 'sched_switch' arg2 has btf_id 186 type STRUCT 'task_struct'
>> 2: R6_w=ptr_task_struct(off=0,imm=0) R7_w=ctx(off=0,imm=0)
>> ; struct task_struct *prev = (struct task_struct *)ctx[1];
>> 2: (79) r2 = *(u64 *)(r7 +8) ; R2_w=scalar() R7_w=ctx(off=0,imm=0)
>> 3: (b7) r1 = 0 ; R1_w=0
>> ; struct runq_event event = {};
>> 4: (7b) *(u64 *)(r10 -8) = r1 ; R1_w=P0 R10=fp0 fp-8_w=00000000
>> 5: (7b) *(u64 *)(r10 -16) = r1 ; R1_w=P0 R10=fp0 fp-16_w=00000000
>> 6: (7b) *(u64 *)(r10 -24) = r1 ; R1_w=P0 R10=fp0 fp-24_w=00000000
>> 7: (7b) *(u64 *)(r10 -32) = r1 ; R1_w=P0 R10=fp0 fp-32_w=00000000
>> ; if (prev->__state == TASK_RUNNING)
>> 8: (61) r1 = *(u32 *)(r2 +24)
>> R2 invalid mem access 'scalar'
>> processed 9 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0
>> -- END PROG LOAD LOG --
>> libbpf: failed to load program 'handle__sched_switch'
>> libbpf: failed to load object 'runqslower_bpf'
>> libbpf: failed to load BPF skeleton 'runqslower_bpf': -13
>> failed to load BPF object: -13
>>
>> Update runqslower to fix this issue. Also, while we are at it, use
>> BPF_PROG in runqslower for cleaner code.
>>
>> Fixes: fa2c3254d7cf ("sched/tracing: Don't re-read p->state when emitting sched_switch event")
>> Signed-off-by: Song Liu <song@xxxxxxxxxx>
>> ---
>>  tools/bpf/runqslower/runqslower.bpf.c | 19 +++++--------------
>>  1 file changed, 5 insertions(+), 14 deletions(-)
>>
>
> It would be much less disruptive if that prev_state was added after
> "next", but oh well...

Maybe we should change that. +Valentin and Steven, how about we change
the order with the attached diff (not the original patch in this
thread, but the one at the end of this email)?

> But anyway, let's handle this in a way that works for both old and
> new kernels, and do the same change in libbpf-tools' runqslower ([0]).
> Can you please follow up there as well?

Yeah, I will also fix that one.

> We can use BPF CO-RE to detect which order of arguments the running
> kernel has by checking prev_state field existence in struct
> trace_event_raw_sched_switch. Can you please try that? Use
> bpf_core_field_exists() for that.

Do you mean something like

	if (bpf_core_field_exists(ctx->prev_state))
		/* use ctx[2] and ctx[3] */
	else
		/* use ctx[1] and ctx[2] */

? I think we will need BTF for the arguments, which doesn't exist yet.
Did I miss something?

I was thinking about adding something like struct tp_sched_switch_args
for all the raw tracepoints, but haven't had time to look into it yet.
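
To spell out how I read the suggestion above, here is an untested
sketch against runqslower.bpf.c. The dummy fmt pointer is only there to
give bpf_core_field_exists() a field access to relocate against, the
file would also need #include <bpf/bpf_core_read.h>, and of course this
only helps if the prev_state check can actually tell the two layouts
apart, which is the part I am unsure about:

	SEC("tp_btf/sched_switch")
	int handle__sched_switch(u64 *ctx)
	{
		/* never dereferenced; only used by the CO-RE existence check */
		struct trace_event_raw_sched_switch *fmt = NULL;
		struct task_struct *prev, *next;

		if (bpf_core_field_exists(fmt->prev_state)) {
			/* assumed new order: preempt, prev_state, prev, next */
			prev = (struct task_struct *)ctx[2];
			next = (struct task_struct *)ctx[3];
		} else {
			/* old order: preempt, prev, next */
			prev = (struct task_struct *)ctx[1];
			next = (struct task_struct *)ctx[2];
		}

		if (prev->__state == TASK_RUNNING)
			trace_enqueue(prev);

		/* the rest of the handler would keep using next (map lookup,
		 * latency calculation, event output) exactly as today
		 */
		return 0;
	}

libbpf resolves bpf_core_field_exists() at load time, so whichever
branch does not apply should be removed as dead code before the
verifier ever sees it.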

Thanks,
Song

>
>
>   [0] https://github.com/iovisor/bcc/blob/master/libbpf-tools/runqslower.bpf.c
>
>
>> diff --git a/tools/bpf/runqslower/runqslower.bpf.c b/tools/bpf/runqslower/runqslower.bpf.c
>> index 9a5c1f008fe6..30e491d8308f 100644
>> --- a/tools/bpf/runqslower/runqslower.bpf.c
>> +++ b/tools/bpf/runqslower/runqslower.bpf.c
>> @@ -2,6 +2,7 @@
>>  // Copyright (c) 2019 Facebook
>>  #include "vmlinux.h"
>>  #include <bpf/bpf_helpers.h>
>> +#include <bpf/bpf_tracing.h>
>>  #include "runqslower.h"
>>
>>  #define TASK_RUNNING 0
>> @@ -43,31 +44,21 @@ static int trace_enqueue(struct task_struct *t)
>>  }
>>
>>  SEC("tp_btf/sched_wakeup")
>> -int handle__sched_wakeup(u64 *ctx)
>> +int BPF_PROG(handle__sched_wakeup, struct task_struct *p)
>>  {
>> -	/* TP_PROTO(struct task_struct *p) */
>> -	struct task_struct *p = (void *)ctx[0];
>> -
>>  	return trace_enqueue(p);
>>  }
>>
>>  SEC("tp_btf/sched_wakeup_new")
>> -int handle__sched_wakeup_new(u64 *ctx)
>> +int BPF_PROG(handle__sched_wakeup_new, struct task_struct *p)
>>  {
>> -	/* TP_PROTO(struct task_struct *p) */
>> -	struct task_struct *p = (void *)ctx[0];
>> -
>>  	return trace_enqueue(p);
>>  }
>>
>>  SEC("tp_btf/sched_switch")
>> -int handle__sched_switch(u64 *ctx)
>> +int BPF_PROG(handle__sched_switch, bool preempt, unsigned long prev_state,
>> +	     struct task_struct *prev, struct task_struct *next)
>>  {
>> -	/* TP_PROTO(bool preempt, struct task_struct *prev,
>> -	 *	    struct task_struct *next)
>> -	 */
>> -	struct task_struct *prev = (struct task_struct *)ctx[1];
>> -	struct task_struct *next = (struct task_struct *)ctx[2];
>>  	struct runq_event event = {};
>>  	u64 *tsp, delta_us;
>>  	long state;
>> --
>> 2.30.2

diff --git i/include/trace/events/sched.h w/include/trace/events/sched.h
index 65e786756321..fbb99a61f714 100644
--- i/include/trace/events/sched.h
+++ w/include/trace/events/sched.h
@@ -222,11 +222,11 @@ static inline long __trace_sched_switch_state(bool preempt,
 TRACE_EVENT(sched_switch,
 
 	TP_PROTO(bool preempt,
-		 unsigned int prev_state,
 		 struct task_struct *prev,
-		 struct task_struct *next),
+		 struct task_struct *next,
+		 unsigned int prev_state),
 
-	TP_ARGS(preempt, prev_state, prev, next),
+	TP_ARGS(preempt, prev, next, prev_state),
 
 	TP_STRUCT__entry(
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
diff --git i/kernel/sched/core.c w/kernel/sched/core.c
index d575b4914925..25784f3efd81 100644
--- i/kernel/sched/core.c
+++ w/kernel/sched/core.c
@@ -6376,7 +6376,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 		migrate_disable_switch(rq, prev);
 		psi_sched_switch(prev, next, !task_on_rq_queued(prev));
 
-		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
+		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
 
 		/* Also unlocks the rq: */
 		rq = context_switch(rq, prev, next, &rf);
diff --git i/kernel/trace/fgraph.c w/kernel/trace/fgraph.c
index 19028e072cdb..d7a81d277f66 100644
--- i/kernel/trace/fgraph.c
+++ w/kernel/trace/fgraph.c
@@ -415,9 +415,10 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 
 static void
 ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
-				unsigned int prev_state,
 				struct task_struct *prev,
-				struct task_struct *next)
+				struct task_struct *next,
+				unsigned int prev_state)
+
 {
 	unsigned long long timestamp;
 	int index;
diff --git i/kernel/trace/ftrace.c w/kernel/trace/ftrace.c
index 4f1d2f5e7263..957354488242 100644
--- i/kernel/trace/ftrace.c
+++ w/kernel/trace/ftrace.c
@@ -7420,9 +7420,9 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 
 static void
 ftrace_filter_pid_sched_switch_probe(void *data, bool preempt,
-				     unsigned int prev_state,
 				     struct task_struct *prev,
-				     struct task_struct *next)
+				     struct task_struct *next,
+				     unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *pid_list;
diff --git i/kernel/trace/trace_events.c w/kernel/trace/trace_events.c
index e11e167b7809..e7b6a2155e96 100644
--- i/kernel/trace/trace_events.c
+++ w/kernel/trace/trace_events.c
@@ -773,9 +773,9 @@ void trace_event_follow_fork(struct trace_array *tr, bool enable)
 
 static void
 event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
-					unsigned int prev_state,
 					struct task_struct *prev,
-					struct task_struct *next)
+					struct task_struct *next,
+					unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
@@ -799,9 +799,9 @@ event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
 
 static void
 event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
-					 unsigned int prev_state,
 					 struct task_struct *prev,
-					 struct task_struct *next)
+					 struct task_struct *next,
+					 unsigned int prev_state)
 {
 	struct trace_array *tr = data;
 	struct trace_pid_list *no_pid_list;
diff --git i/kernel/trace/trace_sched_switch.c w/kernel/trace/trace_sched_switch.c
index 45796d8bd4b2..c9ffdcfe622e 100644
--- i/kernel/trace/trace_sched_switch.c
+++ w/kernel/trace/trace_sched_switch.c
@@ -22,8 +22,8 @@ static DEFINE_MUTEX(sched_register_mutex);
 
 static void
 probe_sched_switch(void *ignore, bool preempt,
-		   unsigned int prev_state,
-		   struct task_struct *prev, struct task_struct *next)
+		   struct task_struct *prev, struct task_struct *next,
+		   unsigned int prev_state)
 {
 	int flags;
 
diff --git i/kernel/trace/trace_sched_wakeup.c w/kernel/trace/trace_sched_wakeup.c
index 46429f9a96fa..330aee1c1a49 100644
--- i/kernel/trace/trace_sched_wakeup.c
+++ w/kernel/trace/trace_sched_wakeup.c
@@ -426,8 +426,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 
 static void notrace
 probe_wakeup_sched_switch(void *ignore, bool preempt,
-			  unsigned int prev_state,
-			  struct task_struct *prev, struct task_struct *next)
+			  struct task_struct *prev, struct task_struct *next,
+			  unsigned int prev_state)
 {
 	struct trace_array_cpu *data;
 	u64 T0, T1, delta;
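
For what it's worth, appending prev_state like this should also keep
the BPF side painless: a tp_btf program written against the old
three-argument prototype keeps working unchanged, because the argument
it does not know about sits after the ones it already reads, and a
program that does want prev_state can simply take it as a trailing
argument. A minimal standalone sketch (untested, not part of the
attached diff):

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	#define TASK_RUNNING 0

	SEC("tp_btf/sched_switch")
	int BPF_PROG(handle__sched_switch, bool preempt, struct task_struct *prev,
		     struct task_struct *next, unsigned int prev_state)
	{
		/* prev_state comes straight from the tracepoint, so there is
		 * no need to re-read prev->__state after the switch
		 */
		if (prev_state == TASK_RUNNING)
			bpf_printk("prev %d still runnable, next %d",
				   prev->pid, next->pid);

		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";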