On Thu, Aug 22, 2019 at 5:56 PM David Abdurachmanov <david.abdurachmanov@xxxxxxxxx> wrote: > > This patch was extensively tested on Fedora/RISCV (applied by default on > top of 5.2-rc7 kernel for <2 months). The patch was also tested with 5.3-rc > on QEMU and SiFive Unleashed board. > > libseccomp (userspace) was rebased: > https://github.com/seccomp/libseccomp/pull/134 > > Fully passes libseccomp regression testing (simulation and live). > > There is one failing kernel selftest: global.user_notification_signal > > v1 -> v2: > - return immediatly if secure_computing(NULL) returns -1 > - fixed whitespace issues > - add missing seccomp.h > - remove patch #2 (solved now) > - add riscv to seccomp kernel selftest > > Cc: keescook@xxxxxxxxxxxx > Cc: me@xxxxxxxxxxxxx > > Signed-off-by: David Abdurachmanov <david.abdurachmanov@xxxxxxxxxx> > Tested-by: Carlos de Paula <me@xxxxxxxxxxxxx> > --- > arch/riscv/Kconfig | 14 ++++++++++ > arch/riscv/include/asm/seccomp.h | 10 +++++++ > arch/riscv/include/asm/thread_info.h | 5 +++- > arch/riscv/kernel/entry.S | 27 +++++++++++++++++-- > arch/riscv/kernel/ptrace.c | 10 +++++++ > tools/testing/selftests/seccomp/seccomp_bpf.c | 8 +++++- > 6 files changed, 70 insertions(+), 4 deletions(-) > create mode 100644 arch/riscv/include/asm/seccomp.h > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index 59a4727ecd6c..441e63ff5adc 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -31,6 +31,7 @@ config RISCV > select GENERIC_SMP_IDLE_THREAD > select GENERIC_ATOMIC64 if !64BIT > select HAVE_ARCH_AUDITSYSCALL > + select HAVE_ARCH_SECCOMP_FILTER > select HAVE_MEMBLOCK_NODE_MAP > select HAVE_DMA_CONTIGUOUS > select HAVE_FUTEX_CMPXCHG if FUTEX > @@ -235,6 +236,19 @@ menu "Kernel features" > > source "kernel/Kconfig.hz" > > +config SECCOMP > + bool "Enable seccomp to safely compute untrusted bytecode" > + help > + This kernel feature is useful for number crunching applications > + that may need to compute untrusted bytecode during their > + execution. By using pipes or other transports made available to > + the process as file descriptors supporting the read/write > + syscalls, it's possible to isolate those applications in > + their own address space using seccomp. Once seccomp is > + enabled via prctl(PR_SET_SECCOMP), it cannot be disabled > + and the task is only allowed to execute a few safe syscalls > + defined by each seccomp mode. > + > endmenu > > menu "Boot options" > diff --git a/arch/riscv/include/asm/seccomp.h b/arch/riscv/include/asm/seccomp.h > new file mode 100644 > index 000000000000..bf7744ee3b3d > --- /dev/null > +++ b/arch/riscv/include/asm/seccomp.h > @@ -0,0 +1,10 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > + > +#ifndef _ASM_SECCOMP_H > +#define _ASM_SECCOMP_H > + > +#include <asm/unistd.h> > + > +#include <asm-generic/seccomp.h> > + > +#endif /* _ASM_SECCOMP_H */ > diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h > index 905372d7eeb8..a0b2a29a0da1 100644 > --- a/arch/riscv/include/asm/thread_info.h > +++ b/arch/riscv/include/asm/thread_info.h > @@ -75,6 +75,7 @@ struct thread_info { > #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ > #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ > #define TIF_SYSCALL_AUDIT 7 /* syscall auditing */ > +#define TIF_SECCOMP 8 /* syscall secure computing */ > > #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) > #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) > @@ -82,11 +83,13 @@ struct thread_info { > #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) > #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) > #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) > +#define _TIF_SECCOMP (1 << TIF_SECCOMP) > > #define _TIF_WORK_MASK \ > (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | _TIF_NEED_RESCHED) > > #define _TIF_SYSCALL_WORK \ > - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT) > + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_AUDIT | \ > + _TIF_SECCOMP ) > > #endif /* _ASM_RISCV_THREAD_INFO_H */ > diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S > index bc7a56e1ca6f..0bbedfa3e47d 100644 > --- a/arch/riscv/kernel/entry.S > +++ b/arch/riscv/kernel/entry.S > @@ -203,8 +203,25 @@ check_syscall_nr: > /* Check to make sure we don't jump to a bogus syscall number. */ > li t0, __NR_syscalls > la s0, sys_ni_syscall > - /* Syscall number held in a7 */ > - bgeu a7, t0, 1f > + /* > + * The tracer can change syscall number to valid/invalid value. > + * We use syscall_set_nr helper in syscall_trace_enter thus we > + * cannot trust the current value in a7 and have to reload from > + * the current task pt_regs. > + */ > + REG_L a7, PT_A7(sp) > + /* > + * Syscall number held in a7. > + * If syscall number is above allowed value, redirect to ni_syscall. > + */ > + bge a7, t0, 1f > + /* > + * Check if syscall is rejected by tracer or seccomp, i.e., a7 == -1. > + * If yes, we pretend it was executed. > + */ > + li t1, -1 > + beq a7, t1, ret_from_syscall_rejected > + /* Call syscall */ > la s0, sys_call_table > slli t0, a7, RISCV_LGPTR > add s0, s0, t0 > @@ -215,6 +232,12 @@ check_syscall_nr: > ret_from_syscall: > /* Set user a0 to kernel a0 */ > REG_S a0, PT_A0(sp) > + /* > + * We didn't execute the actual syscall. > + * Seccomp already set return value for the current task pt_regs. > + * (If it was configured with SECCOMP_RET_ERRNO/TRACE) > + */ > +ret_from_syscall_rejected: > /* Trace syscalls, but only if requested by the user. */ > REG_L t0, TASK_TI_FLAGS(tp) > andi t0, t0, _TIF_SYSCALL_WORK > diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c > index 368751438366..63e47c9f85f0 100644 > --- a/arch/riscv/kernel/ptrace.c > +++ b/arch/riscv/kernel/ptrace.c > @@ -154,6 +154,16 @@ void do_syscall_trace_enter(struct pt_regs *regs) > if (tracehook_report_syscall_entry(regs)) > syscall_set_nr(current, regs, -1); > > + /* > + * Do the secure computing after ptrace; failures should be fast. > + * If this fails we might have return value in a0 from seccomp > + * (via SECCOMP_RET_ERRNO/TRACE). > + */ > + if (secure_computing(NULL) == -1) { > + syscall_set_nr(current, regs, -1); > + return; > + } > + > #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS > if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) > trace_sys_enter(regs, syscall_get_nr(current, regs)); > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c > index 6ef7f16c4cf5..492e0adad9d3 100644 > --- a/tools/testing/selftests/seccomp/seccomp_bpf.c > +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c > @@ -112,6 +112,8 @@ struct seccomp_data { > # define __NR_seccomp 383 > # elif defined(__aarch64__) > # define __NR_seccomp 277 > +# elif defined(__riscv) > +# define __NR_seccomp 277 > # elif defined(__hppa__) > # define __NR_seccomp 338 > # elif defined(__powerpc__) > @@ -1582,6 +1584,10 @@ TEST_F(TRACE_poke, getpid_runs_normally) > # define ARCH_REGS struct user_pt_regs > # define SYSCALL_NUM regs[8] > # define SYSCALL_RET regs[0] > +#elif defined(__riscv) && __riscv_xlen == 64 > +# define ARCH_REGS struct user_regs_struct > +# define SYSCALL_NUM a7 > +# define SYSCALL_RET a0 > #elif defined(__hppa__) > # define ARCH_REGS struct user_regs_struct > # define SYSCALL_NUM gr[20] > @@ -1671,7 +1677,7 @@ void change_syscall(struct __test_metadata *_metadata, > EXPECT_EQ(0, ret) {} > > #if defined(__x86_64__) || defined(__i386__) || defined(__powerpc__) || \ > - defined(__s390__) || defined(__hppa__) > + defined(__s390__) || defined(__hppa__) || defined(__riscv) > { > regs.SYSCALL_NUM = syscall; > } > -- > 2.21.0 > Kernel selftests results: ➜ uname -a Linux fedora-unleashed 5.2.0-rc7-30159-g2d072d4-dirty #3 SMP Thu Jul 4 20:18:21 -03 2019 riscv64 riscv64 riscv64 GNU/Linux ➜ sudo ./seccomp_bpf [==========] Running 74 tests from 1 test cases. [ RUN ] global.mode_strict_support [ OK ] global.mode_strict_support [ RUN ] global.mode_strict_cannot_call_prctl [ OK ] global.mode_strict_cannot_call_prctl [ RUN ] global.no_new_privs_support [ OK ] global.no_new_privs_support [ RUN ] global.mode_filter_support [ OK ] global.mode_filter_support [ RUN ] global.mode_filter_without_nnp [ OK ] global.mode_filter_without_nnp [ RUN ] global.filter_size_limits [ OK ] global.filter_size_limits [ RUN ] global.filter_chain_limits [ OK ] global.filter_chain_limits [ RUN ] global.mode_filter_cannot_move_to_strict [ OK ] global.mode_filter_cannot_move_to_strict [ RUN ] global.mode_filter_get_seccomp [ OK ] global.mode_filter_get_seccomp [ RUN ] global.ALLOW_all [ OK ] global.ALLOW_all [ RUN ] global.empty_prog [ OK ] global.empty_prog [ RUN ] global.log_all [ OK ] global.log_all [ RUN ] global.unknown_ret_is_kill_inside [ OK ] global.unknown_ret_is_kill_inside [ RUN ] global.unknown_ret_is_kill_above_allow [ OK ] global.unknown_ret_is_kill_above_allow [ RUN ] global.KILL_all [ OK ] global.KILL_all [ RUN ] global.KILL_one [ OK ] global.KILL_one [ RUN ] global.KILL_one_arg_one [ OK ] global.KILL_one_arg_one [ RUN ] global.KILL_one_arg_six [ OK ] global.KILL_one_arg_six [ RUN ] global.KILL_thread [ OK ] global.KILL_thread [ RUN ] global.KILL_process [ OK ] global.KILL_process [ RUN ] global.arg_out_of_range [ OK ] global.arg_out_of_range [ RUN ] global.ERRNO_valid [ OK ] global.ERRNO_valid [ RUN ] global.ERRNO_zero [ OK ] global.ERRNO_zero [ RUN ] global.ERRNO_capped [ OK ] global.ERRNO_capped [ RUN ] global.ERRNO_order [ OK ] global.ERRNO_order [ RUN ] TRAP.dfl [ OK ] TRAP.dfl [ RUN ] TRAP.ign [ OK ] TRAP.ign [ RUN ] TRAP.handler [ OK ] TRAP.handler [ RUN ] precedence.allow_ok [ OK ] precedence.allow_ok [ RUN ] precedence.kill_is_highest [ OK ] precedence.kill_is_highest [ RUN ] precedence.kill_is_highest_in_any_order [ OK ] precedence.kill_is_highest_in_any_order [ RUN ] precedence.trap_is_second [ OK ] precedence.trap_is_second [ RUN ] precedence.trap_is_second_in_any_order [ OK ] precedence.trap_is_second_in_any_order [ RUN ] precedence.errno_is_third [ OK ] precedence.errno_is_third [ RUN ] precedence.errno_is_third_in_any_order [ OK ] precedence.errno_is_third_in_any_order [ RUN ] precedence.trace_is_fourth [ OK ] precedence.trace_is_fourth [ RUN ] precedence.trace_is_fourth_in_any_order [ OK ] precedence.trace_is_fourth_in_any_order [ RUN ] precedence.log_is_fifth [ OK ] precedence.log_is_fifth [ RUN ] precedence.log_is_fifth_in_any_order [ OK ] precedence.log_is_fifth_in_any_order [ RUN ] TRACE_poke.read_has_side_effects [ OK ] TRACE_poke.read_has_side_effects [ RUN ] TRACE_poke.getpid_runs_normally [ OK ] TRACE_poke.getpid_runs_normally [ RUN ] TRACE_syscall.ptrace_syscall_redirected [ OK ] TRACE_syscall.ptrace_syscall_redirected [ RUN ] TRACE_syscall.ptrace_syscall_errno [ OK ] TRACE_syscall.ptrace_syscall_errno [ RUN ] TRACE_syscall.ptrace_syscall_faked [ OK ] TRACE_syscall.ptrace_syscall_faked [ RUN ] TRACE_syscall.syscall_allowed [ OK ] TRACE_syscall.syscall_allowed [ RUN ] TRACE_syscall.syscall_redirected [ OK ] TRACE_syscall.syscall_redirected [ RUN ] TRACE_syscall.syscall_errno [ OK ] TRACE_syscall.syscall_errno [ RUN ] TRACE_syscall.syscall_faked [ OK ] TRACE_syscall.syscall_faked [ RUN ] TRACE_syscall.skip_after_RET_TRACE [ OK ] TRACE_syscall.skip_after_RET_TRACE [ RUN ] TRACE_syscall.kill_after_RET_TRACE [ OK ] TRACE_syscall.kill_after_RET_TRACE [ RUN ] TRACE_syscall.skip_after_ptrace [ OK ] TRACE_syscall.skip_after_ptrace [ RUN ] TRACE_syscall.kill_after_ptrace [ OK ] TRACE_syscall.kill_after_ptrace [ RUN ] global.seccomp_syscall [ OK ] global.seccomp_syscall [ RUN ] global.seccomp_syscall_mode_lock [ OK ] global.seccomp_syscall_mode_lock [ RUN ] global.detect_seccomp_filter_flags [ OK ] global.detect_seccomp_filter_flags [ RUN ] global.TSYNC_first [ OK ] global.TSYNC_first [ RUN ] TSYNC.siblings_fail_prctl [ OK ] TSYNC.siblings_fail_prctl [ RUN ] TSYNC.two_siblings_with_ancestor [ OK ] TSYNC.two_siblings_with_ancestor [ RUN ] TSYNC.two_sibling_want_nnp [ OK ] TSYNC.two_sibling_want_nnp [ RUN ] TSYNC.two_siblings_with_no_filter [ OK ] TSYNC.two_siblings_with_no_filter [ RUN ] TSYNC.two_siblings_with_one_divergence [ OK ] TSYNC.two_siblings_with_one_divergence [ RUN ] TSYNC.two_siblings_not_under_filter [ OK ] TSYNC.two_siblings_not_under_filter [ RUN ] global.syscall_restart [ OK ] global.syscall_restart [ RUN ] global.filter_flag_log [ OK ] global.filter_flag_log [ RUN ] global.get_action_avail [ OK ] global.get_action_avail [ RUN ] global.get_metadata [ OK ] global.get_metadata [ RUN ] global.user_notification_basic [ OK ] global.user_notification_basic [ RUN ] global.user_notification_kill_in_middle [ OK ] global.user_notification_kill_in_middle [ RUN ] global.user_notification_signal [1] 5951 alarm sudo ./seccomp_bpf carlosedp in ~ at fedora-unleashed ➜ sudo ./seccomp_benchmark Calibrating reasonable sample size... 1564584448.964538790 - 1564584448.964529687 = 9103 1564584448.964588859 - 1564584448.964575204 = 13655 1564584448.964631342 - 1564584448.964604790 = 26552 1564584448.964710239 - 1564584448.964644997 = 65242 1564584448.964842239 - 1564584448.964726928 = 115311 1564584448.965072859 - 1564584448.964857411 = 215448 1564584448.965513618 - 1564584448.965089549 = 424069 1564584448.966417894 - 1564584448.965532584 = 885310 1564584448.968286377 - 1564584448.966443687 = 1842690 1564584448.971667549 - 1564584448.968314446 = 3353103 1564584448.978288790 - 1564584448.971694101 = 6594689 1564584448.991803618 - 1564584448.978313066 = 13490552 1564584449.017692308 - 1564584448.991836239 = 25856069 1564584449.069651756 - 1564584449.017713549 = 51938207 1564584449.173110928 - 1564584449.069673756 = 103437172 1564584449.380001204 - 1564584449.173132928 = 206868276 1564584449.793857618 - 1564584449.380041411 = 413816207 1564584450.625367342 - 1564584449.793898584 = 831468758 1564584452.299529411 - 1564584450.625426514 = 1674102897 1564584455.665938307 - 1564584452.299592376 = 3366345931 1564584462.331777479 - 1564584455.665973962 = 6665803517 Benchmarking 33554432 samples... 18.107882743 - 12.075641371 = 6032241372 getpid native: 179 ns 34.720410331 - 18.107978605 = 16612431726 getpid RET_ALLOW: 495 ns Estimated seccomp overhead per syscall: 316 n -- ________________________________________ Carlos Eduardo de Paula me@xxxxxxxxxxxxx http://carlosedp.com http://twitter.com/carlosedp Linkedin ________________________________________