On Thu, 3 Oct 2024 11:16:33 -0400
Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> wrote:

> In preparation for allowing system call enter/exit instrumentation to
> handle page faults, make sure that perf can handle this change by
> explicitly disabling preemption within the perf system call tracepoint
> probes to respect the current expectations within perf ring buffer code.
> 
> This change does not yet allow perf to take page faults per se within
> its probe, but allows its existing probes to adapt to the upcoming
> change.

Frederic,

Does the perf ring buffer expect preemption to be disabled when used?
In other words, is this patch needed?

-- Steve

> 
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
> Cc: Michael Jeanson <mjeanson@xxxxxxxxxxxx>
> Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
> Cc: Masami Hiramatsu <mhiramat@xxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Cc: Alexei Starovoitov <ast@xxxxxxxxxx>
> Cc: Yonghong Song <yhs@xxxxxx>
> Cc: Paul E. McKenney <paulmck@xxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
> Cc: Mark Rutland <mark.rutland@xxxxxxx>
> Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
> Cc: Namhyung Kim <namhyung@xxxxxxxxxx>
> Cc: Andrii Nakryiko <andrii.nakryiko@xxxxxxxxx>
> Cc: bpf@xxxxxxxxxxxxxxx
> Cc: Joel Fernandes <joel@xxxxxxxxxxxxxxxxx>
> ---
>  include/trace/perf.h          | 41 +++++++++++++++++++++++++++++++----
>  kernel/trace/trace_syscalls.c | 12 ++++++++++
>  2 files changed, 49 insertions(+), 4 deletions(-)
> 
> diff --git a/include/trace/perf.h b/include/trace/perf.h
> index ded997af481e..5650c1bad088 100644
> --- a/include/trace/perf.h
> +++ b/include/trace/perf.h
> @@ -12,10 +12,10 @@
>  #undef __perf_task
>  #define __perf_task(t)	(__task = (t))
>  
> -#undef DECLARE_EVENT_CLASS
> -#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
> +#undef __DECLARE_EVENT_CLASS
> +#define __DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
>  static notrace void \
> -perf_trace_##call(void *__data, proto) \
> +do_perf_trace_##call(void *__data, proto) \
>  { \
>  	struct trace_event_call *event_call = __data; \
>  	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
> @@ -55,8 +55,38 @@ perf_trace_##call(void *__data, proto) \
>  				  head, __task); \
>  }
> +/*
> + * Define unused __count and __task variables to use @args to pass
> + * arguments to do_perf_trace_##call. This is needed because the
> + * macros __perf_count and __perf_task introduce the side-effect to
> + * store copies into those local variables.
> + */
> +#undef DECLARE_EVENT_CLASS
> +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \
> +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
> +		      PARAMS(assign), PARAMS(print)) \
> +static notrace void \
> +perf_trace_##call(void *__data, proto) \
> +{ \
> +	u64 __count __attribute__((unused)); \
> +	struct task_struct *__task __attribute__((unused)); \
> + \
> +	do_perf_trace_##call(__data, args); \
> +}
> +
>  #undef DECLARE_EVENT_SYSCALL_CLASS
> -#define DECLARE_EVENT_SYSCALL_CLASS DECLARE_EVENT_CLASS
> +#define DECLARE_EVENT_SYSCALL_CLASS(call, proto, args, tstruct, assign, print) \
> +__DECLARE_EVENT_CLASS(call, PARAMS(proto), PARAMS(args), PARAMS(tstruct), \
> +		      PARAMS(assign), PARAMS(print)) \
> +static notrace void \
> +perf_trace_##call(void *__data, proto) \
> +{ \
> +	u64 __count __attribute__((unused)); \
> +	struct task_struct *__task __attribute__((unused)); \
> + \
> +	guard(preempt_notrace)(); \
> +	do_perf_trace_##call(__data, args); \
> +}
> 
>  /*
>   * This part is compiled out, it is only here as a build time check
> @@ -76,4 +106,7 @@ static inline void perf_test_probe_##call(void) \
>  	DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args))
>  
>  #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
> +
> +#undef __DECLARE_EVENT_CLASS
> +
>  #endif /* CONFIG_PERF_EVENTS */
> diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
> index ab4db8c23f36..edcfa47446c7 100644
> --- a/kernel/trace/trace_syscalls.c
> +++ b/kernel/trace/trace_syscalls.c
> @@ -596,6 +596,12 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
>  	int rctx;
>  	int size;
>  
> +	/*
> +	 * Syscall probe called with preemption enabled, but the ring
> +	 * buffer and per-cpu data require preemption to be disabled.
> +	 */
> +	guard(preempt_notrace)();
> +
>  	syscall_nr = trace_get_syscall_nr(current, regs);
>  	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
>  		return;
> @@ -698,6 +704,12 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
>  	int rctx;
>  	int size;
>  
> +	/*
> +	 * Syscall probe called with preemption enabled, but the ring
> +	 * buffer and per-cpu data require preemption to be disabled.
> +	 */
> +	guard(preempt_notrace)();
> +
>  	syscall_nr = trace_get_syscall_nr(current, regs);
>  	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
>  		return;
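
As an aside, for readers unfamiliar with guard(): it is the kernel's scope-based
cleanup mechanism (include/linux/cleanup.h), so the preempt-disable taken at the
top of the probe is automatically paired with a preempt-enable on every exit
path, including the early returns above. Below is a minimal stand-alone
user-space sketch of that pattern built on the same compiler "cleanup"
attribute. The fake_preempt_*() helpers and scoped_guard_preempt() are invented
for illustration only and are not kernel APIs.

/*
 * Sketch of a scope-based guard: "disable" on entry, automatic
 * "enable" when the enclosing scope is left, however it is left.
 */
#include <stdio.h>

static int fake_preempt_count;

static void fake_preempt_disable(void) { fake_preempt_count++; }
static void fake_preempt_enable(void)  { fake_preempt_count--; }

/* Runs automatically when the guard variable goes out of scope. */
static void guard_cleanup(int *unused)
{
	(void)unused;
	fake_preempt_enable();
}

/* Declare a guard variable whose initializer "disables preemption". */
#define scoped_guard_preempt() \
	__attribute__((cleanup(guard_cleanup))) \
	int __guard = (fake_preempt_disable(), 0)

static void probe(int id)
{
	scoped_guard_preempt();
	printf("probe(%d): count=%d\n", id, fake_preempt_count);
	if (id < 0)
		return;	/* early return still runs guard_cleanup() */
	printf("probe(%d): still guarded, count=%d\n", id, fake_preempt_count);
}

int main(void)
{
	probe(1);
	probe(-1);
	printf("after probes: count=%d\n", fake_preempt_count);
	return 0;
}

Compiled with gcc or clang, this prints count=1 inside each probe and count=0
afterwards, which is the property the guard(preempt_notrace)() calls in the
patch rely on: no exit path can leave preemption disabled.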