Extend the syscall tracing subsystem by adding a handler for compat tasks. For some architectures, where compat tasks use an exclusive set of syscall numbers, this already works since the removal of syscall_nr. Architectures where the same syscall may use a different syscall number for compat tasks need to define ARCH_COMPAT_SYSCALL_NUMBERS_OVERLAP and define a method arch_trace_is_compat_syscall(struct pt_regs*) that tells whether the current task is a compat one. For architectures that define ARCH_COMPAT_SYSCALL_NUMBERS_OVERLAP the number of trace event files is doubled and compat syscall trace events are identified by the syscall number offset by NR_syscalls. Note that as this patch series is posted as an RFC, this currently only includes arch updates for MIPS and x86 (and has only been tested on MIPS and x86_64). I will work on updating other arch trees after this solution is reviewed. Signed-off-by: Marcin Nowakowski <marcin.nowakowski@xxxxxxxxxx> --- arch/mips/kernel/ftrace.c | 4 +- arch/x86/include/asm/ftrace.h | 10 +--- arch/x86/kernel/ftrace.c | 14 ++++++ include/linux/ftrace.h | 2 +- kernel/trace/trace.h | 11 +++- kernel/trace/trace_syscalls.c | 113 +++++++++++++++++++++++++----------------- 6 files changed, 94 insertions(+), 60 deletions(-) diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c index 937c54b..e150cf6 100644 --- a/arch/mips/kernel/ftrace.c +++ b/arch/mips/kernel/ftrace.c @@ -412,7 +412,7 @@ out: #ifdef CONFIG_FTRACE_SYSCALLS #ifdef CONFIG_32BIT -unsigned long __init arch_syscall_addr(int nr) +unsigned long __init arch_syscall_addr(int nr, int compat) { return (unsigned long)sys_call_table[nr - __NR_O32_Linux]; } @@ -420,7 +420,7 @@ unsigned long __init arch_syscall_addr(int nr) #ifdef CONFIG_64BIT -unsigned long __init arch_syscall_addr(int nr) +unsigned long __init arch_syscall_addr(int nr, int compat) { #ifdef CONFIG_MIPS32_N32 if (nr >= __NR_N32_Linux && nr <= __NR_N32_Linux + __NR_N32_Linux_syscalls) diff --git 
a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index a4820d4..a24a21c 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -47,15 +47,7 @@ int ftrace_int3_handler(struct pt_regs *regs); #if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION) #include <asm/compat.h> -/* - * Because ia32 syscalls do not map to x86_64 syscall numbers - * this screws up the trace output when tracing a ia32 task. - * Instead of reporting bogus syscalls, just do not trace them. - * - * If the user really wants these, then they should use the - * raw syscall tracepoints with filtering. - */ -#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1 +#define ARCH_COMPAT_SYSCALL_NUMBERS_OVERLAP 1 static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs) { if (in_compat_syscall()) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d036cfb..78f3e36 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -28,6 +28,7 @@ #include <asm/kprobes.h> #include <asm/ftrace.h> #include <asm/nops.h> +#include <asm/syscall.h> #ifdef CONFIG_DYNAMIC_FTRACE @@ -1035,3 +1036,16 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, } } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +#ifdef CONFIG_FTRACE_SYSCALLS + +unsigned long arch_syscall_addr(int nr, int compat) +{ +#if defined(CONFIG_X86_64) && defined(CONFIG_IA32_EMULATION) + if (compat) + return (unsigned long)ia32_sys_call_table[nr]; +#endif + return (unsigned long)sys_call_table[nr]; +} + +#endif /* CONFIG_FTRACE_SYSCALLS */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7d565af..110f95d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -938,7 +938,7 @@ static inline void disable_trace_on_warning(void) { } #ifdef CONFIG_FTRACE_SYSCALLS -unsigned long arch_syscall_addr(int nr); +unsigned long arch_syscall_addr(int nr, int compat); #endif /* CONFIG_FTRACE_SYSCALLS */ diff --git a/kernel/trace/trace.h 
b/kernel/trace/trace.h index f783df4..102a41a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -234,8 +234,15 @@ struct trace_array { #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; - struct trace_event_file __rcu *enter_syscall_files[NR_syscalls]; - struct trace_event_file __rcu *exit_syscall_files[NR_syscalls]; + +#ifdef ARCH_COMPAT_SYSCALL_NUMBERS_OVERLAP +#define FTRACE_SYSCALL_CNT (NR_syscalls * (1 + IS_ENABLED(CONFIG_COMPAT))) +#else +#define FTRACE_SYSCALL_CNT (NR_syscalls) +#endif + + struct trace_event_file __rcu *enter_syscall_files[FTRACE_SYSCALL_CNT]; + struct trace_event_file __rcu *exit_syscall_files[FTRACE_SYSCALL_CNT]; #endif int stop_count; int clock_id; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 1da10ca..dc7df38 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -44,37 +44,35 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name } #endif -#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS +#ifdef ARCH_COMPAT_SYSCALL_NUMBERS_OVERLAP /* * Some architectures that allow for 32bit applications * to run on a 64bit kernel, do not map the syscalls for * the 32bit tasks the same as they do for 64bit tasks. * - * *cough*x86*cough* - * - * In such a case, instead of reporting the wrong syscalls, - * simply ignore them. - * - * For an arch to ignore the compat syscalls it needs to - * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as + * If a set of syscall numbers for 32-bit tasks overlaps + * the set of syscall numbers for 64-bit tasks, define + * ARCH_COMPAT_SYSCALL_NUMBERS_OVERLAP as well as * define the function arch_trace_is_compat_syscall() to let - * the tracing system know that it should ignore it. + * the tracing system know that a compat syscall is being handled. 
*/ -static int -trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) +static inline bool trace_is_compat_syscall(struct pt_regs *regs) { - if (unlikely(arch_trace_is_compat_syscall(regs))) - return -1; - - return syscall_get_nr(task, regs); + return arch_trace_is_compat_syscall(regs); } #else +static inline bool trace_is_compat_syscall(struct pt_regs *regs) +{ + return false; +} +#endif /* ARCH_COMPAT_SYSCALL_NUMBERS_OVERLAP */ + static inline int trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs) { return syscall_get_nr(task, regs); } -#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */ + static __init struct syscall_metadata * find_syscall_meta(unsigned long syscall) @@ -98,9 +96,9 @@ find_syscall_meta(unsigned long syscall) return NULL; } -static struct syscall_metadata *syscall_nr_to_meta(int nr) +static struct syscall_metadata *trace_syscall_nr_to_meta(int nr) { - if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + if (!syscalls_metadata || nr >= FTRACE_SYSCALL_CNT || nr < 0) return NULL; return syscalls_metadata[nr]; @@ -110,7 +108,7 @@ const char *get_syscall_name(int syscall) { struct syscall_metadata *entry; - entry = syscall_nr_to_meta(syscall); + entry = trace_syscall_nr_to_meta(syscall); if (!entry) return NULL; @@ -130,7 +128,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags, trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + entry = trace_syscall_nr_to_meta(syscall); if (!entry) goto end; @@ -176,7 +174,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags, trace = (typeof(trace))ent; syscall = trace->nr; - entry = syscall_nr_to_meta(syscall); + entry = trace_syscall_nr_to_meta(syscall); if (!entry) { trace_seq_putc(s, '\n'); @@ -321,6 +319,9 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; + if (trace_is_compat_syscall(regs)) + syscall_nr += NR_syscalls; + /* Here we're 
inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ trace_file = rcu_dereference_sched(tr->enter_syscall_files[syscall_nr]); if (!trace_file) @@ -329,7 +330,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) if (trace_trigger_soft_disabled(trace_file)) return; - sys_data = syscall_nr_to_meta(syscall_nr); + sys_data = trace_syscall_nr_to_meta(syscall_nr); if (!sys_data) return; @@ -368,6 +369,9 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; + if (trace_is_compat_syscall(regs)) + syscall_nr += NR_syscalls; + /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ trace_file = rcu_dereference_sched(tr->exit_syscall_files[syscall_nr]); if (!trace_file) @@ -376,7 +380,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (trace_trigger_soft_disabled(trace_file)) return; - sys_data = syscall_nr_to_meta(syscall_nr); + sys_data = trace_syscall_nr_to_meta(syscall_nr); if (!sys_data) return; @@ -415,7 +419,7 @@ static int reg_event_syscall_enter(struct trace_event_file *file, goto out_unlock; } - for (num = 0; num < NR_syscalls; num++) { + for (num = 0; num < FTRACE_SYSCALL_CNT; num++) { if (syscalls_metadata[num] && arch_syscall_match_sym_name(syscalls_metadata[num]->name, name)) @@ -438,7 +442,7 @@ static void unreg_event_syscall_enter(struct trace_event_file *file, name = ((const struct syscall_metadata *)call->data)->name; mutex_lock(&syscall_trace_lock); tr->sys_refcount_enter--; - for (num = 0; num < NR_syscalls; num++) { + for (num = 0; num < FTRACE_SYSCALL_CNT; num++) { if (syscalls_metadata[num] && arch_syscall_match_sym_name(syscalls_metadata[num]->name, name)) @@ -466,7 +470,7 @@ static int reg_event_syscall_exit(struct trace_event_file *file, goto out_unlock; } - for (num = 0; num < NR_syscalls; num++) { + for (num = 0; num < FTRACE_SYSCALL_CNT; num++) { if (syscalls_metadata[num] && 
arch_syscall_match_sym_name(syscalls_metadata[num]->name, name)) @@ -490,7 +494,7 @@ static void unreg_event_syscall_exit(struct trace_event_file *file, mutex_lock(&syscall_trace_lock); tr->sys_refcount_exit--; - for (num = 0; num < NR_syscalls; num++) { + for (num = 0; num < FTRACE_SYSCALL_CNT; num++) { if (syscalls_metadata[num] && arch_syscall_match_sym_name(syscalls_metadata[num]->name, name)) @@ -542,38 +546,47 @@ struct trace_event_class __refdata event_class_syscall_exit = { .raw_init = init_syscall_trace, }; -unsigned long __init __weak arch_syscall_addr(int nr) +unsigned long __init __weak arch_syscall_addr(int nr, int compat) { return (unsigned long)sys_call_table[nr]; } -void __init init_ftrace_syscalls(void) +void __init init_ftrace_syscalls_meta(int compat) { struct syscall_metadata *meta; unsigned long addr; int i; - syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), - GFP_KERNEL); - if (!syscalls_metadata) { - WARN_ON(1); - return; - } - for (i = 0; i < NR_syscalls; i++) { - addr = arch_syscall_addr(i); + addr = arch_syscall_addr(i, compat); meta = find_syscall_meta(addr); if (!meta) continue; - syscalls_metadata[i] = meta; + syscalls_metadata[compat * NR_syscalls + i] = meta; } } +void __init init_ftrace_syscalls(void) +{ + syscalls_metadata = kcalloc(FTRACE_SYSCALL_CNT, + sizeof(*syscalls_metadata), GFP_KERNEL); + if (!syscalls_metadata) { + WARN_ON(1); + return; + } + + init_ftrace_syscalls_meta(0); +#ifdef ARCH_COMPAT_SYSCALL_NUMBERS_OVERLAP + if (IS_ENABLED(CONFIG_COMPAT)) + init_ftrace_syscalls_meta(1); +#endif +} + #ifdef CONFIG_PERF_EVENTS -static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_perf_enter_syscalls, FTRACE_SYSCALL_CNT); +static DECLARE_BITMAP(enabled_perf_exit_syscalls, FTRACE_SYSCALL_CNT); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; @@ -589,10 +602,14 @@ static void 
perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; + + if (trace_is_compat_syscall(regs)) + syscall_nr += NR_syscalls; + if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; - sys_data = syscall_nr_to_meta(syscall_nr); + sys_data = trace_syscall_nr_to_meta(syscall_nr); if (!sys_data) return; @@ -635,7 +652,7 @@ static int perf_sysenter_enable(struct trace_event_call *call) } } - for (num = 0; num < NR_syscalls; num++) { + for (num = 0; num < FTRACE_SYSCALL_CNT; num++) { if (syscalls_metadata[num] && arch_syscall_match_sym_name(syscalls_metadata[num]->name, name)) @@ -657,7 +674,7 @@ static void perf_sysenter_disable(struct trace_event_call *call) mutex_lock(&syscall_trace_lock); sys_perf_refcount_enter--; - for (num = 0; num < NR_syscalls; num++) { + for (num = 0; num < FTRACE_SYSCALL_CNT; num++) { if (syscalls_metadata[num] && arch_syscall_match_sym_name(syscalls_metadata[num]->name, name)) @@ -680,10 +697,14 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0 || syscall_nr >= NR_syscalls) return; + + if (trace_is_compat_syscall(regs)) + syscall_nr += NR_syscalls; + if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; - sys_data = syscall_nr_to_meta(syscall_nr); + sys_data = trace_syscall_nr_to_meta(syscall_nr); if (!sys_data) return; @@ -723,7 +744,7 @@ static int perf_sysexit_enable(struct trace_event_call *call) } } - for (num = 0; num < NR_syscalls; num++) { + for (num = 0; num < FTRACE_SYSCALL_CNT; num++) { if (syscalls_metadata[num] && arch_syscall_match_sym_name(syscalls_metadata[num]->name, name)) @@ -745,7 +766,7 @@ static void perf_sysexit_disable(struct trace_event_call *call) mutex_lock(&syscall_trace_lock); sys_perf_refcount_exit--; - for (num = 0; num < NR_syscalls; num++) { + for (num = 0; num < FTRACE_SYSCALL_CNT; 
num++) { if (syscalls_metadata[num] && arch_syscall_match_sym_name(syscalls_metadata[num]->name, name)) -- 2.7.4 -- To unsubscribe from this list: send the line "unsubscribe linux-api" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html