On Mon, Oct 29, 2018 at 09:25:15AM -0700, Kees Cook wrote: > On Mon, Oct 29, 2018 at 4:23 AM, Michael Sammler <msammler@xxxxxxxxxxx> wrote: > > Add the current value of an architecture specific protection keys > > register (currently PKRU on x86) to data available for seccomp-bpf > > programs to work on. This allows filters based on the currently > > enabled protection keys. > > > > Support for protection keys on the POWER architecture is not part of > > this patch since I do not have access to a PowerPC, but adding support > > for it can be achieved by setting sd->pkeys to the AMR register in > > populate_seccomp_data. Maybe you can use a generic read_pkey() function, and each arch can provide their own implementation. In the case of powerpc it is mfspr(SPRN_AMR); something like the code below might help? diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h index 92a9962..d79efe3 100644 --- a/arch/powerpc/include/asm/pkeys.h +++ b/arch/powerpc/include/asm/pkeys.h @@ -89,6 +89,11 @@ static inline u16 pte_to_pkey_bits(u64 pteflags) #define __mm_pkey_is_reserved(pkey) (reserved_allocation_mask & \ pkey_alloc_mask(pkey)) +static inline u64 read_pkey() +{ + return mfspr(SPRN_AMR); +} + static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { if (pkey < 0 || pkey >= arch_max_pkey()) diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index 19b137f..feea114 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -9,6 +9,11 @@ extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); +static inline u64 read_pkey() +{ + return read_pkru(); +} + static inline bool arch_pkeys_enabled(void) { return boot_cpu_has(X86_FEATURE_OSPKE); diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h index 2955ba97..f1538c7 100644 --- a/include/linux/pkeys.h +++ b/include/linux/pkeys.h @@ -13,6 +13,11 @@ #define PKEY_DEDICATED_EXECUTE_ONLY 0 #define ARCH_VM_PKEY_FLAGS 0 +static inline u64 read_pkey() +{ + return 0; +} + static inline int vma_pkey(struct vm_area_struct *vma) { return 0; RP > > > > One use case for this patch is disabling unnecessary system calls for a > > library (e.g. network i/o for a crypto library) while the library runs > > without disabling the system calls for the whole program (by changing > > the protection keys before and after the library executes). Using this > > one could ensure that the library behaves a expected (e.g. the crypto > > library not sending private keys to a malicious server). > > > > This patch also enables lightweight sandboxing of untrusted code using > > memory protection keys: Protection keys provide memory isolation but > > for a sandbox system call isolation is needed as well. This patch > > allows writing a seccomp filter to prevent system calls by the > > untrusted code while still allowing system calls for the trusted code. > > Isn't PKU instruction based? Couldn't a malicious library just change > the state of the MPK registers? This seems like an easy way to bypass > any filters that used PKU. I'm not convinced this is a meaningful > barrier that should be enforced by seccomp. > > This can also be done with a signal handler with SECCOMP_RET_TRAP and > check instruction pointer vs PKU state. > > -Kees > > > > > An alternative design would be to extend (c)BPF with a new instruction > > to read the state of a protection key. This alternate design would > > provide a simpler interface to the user space since the BPF program > > would not need to deal with the architecture specific pkeys field in > > seccomp_data, but the question is, how much of an advantage this would > > be as the nr field in seccomp_data is already architecture specific. > > Adding a new instruction for BPF programs is more complicated than > > this patch and might be a breaking change. > > > > Results of selftests/seccomp_benchmark.c on a x86 machine with pkeys > > support: > > > > With patch: > > Benchmarking 33554432 samples... > > 28.019505558 - 18.676858522 = 9342647036 > > getpid native: 278 ns > > 42.279109885 - 28.019657031 = 14259452854 > > getpid RET_ALLOW: 424 ns > > Estimated seccomp overhead per syscall: 146 ns > > > > Without patch: > > Benchmarking 33554432 samples... > > 28.059619466 - 18.706769155 = 9352850311 > > getpid native: 278 ns > > 42.299228279 - 28.059761804 = 14239466475 > > getpid RET_ALLOW: 424 ns > > Estimated seccomp overhead per syscall: 146 ns > > > > Cc: Kees Cook <keescook@xxxxxxxxxxxx> > > Cc: Andy Lutomirski <luto@xxxxxxxxxxxxxx> > > Cc: Will Drewry <wad@xxxxxxxxxxxx> > > Cc: Ram Pai <linuxram@xxxxxxxxxx> > > Cc: linuxppc-dev@xxxxxxxxxxxxxxxx > > Cc: linux-api@xxxxxxxxxxxxxxx > > Signed-off-by: Michael Sammler <msammler@xxxxxxxxxxx> > > --- > > Changes to the previous version: > > - added motivation, notes about POWER, alternative design and benchmark results to the commit log > > - renamed pkru field in seccomp_data to pkeys > > - changed size of pkru field to __u64 and removed reserved field > > - added test for x86 > > > > arch/mips/kernel/ptrace.c | 1 + > > arch/x86/entry/common.c | 1 + > > include/uapi/linux/seccomp.h | 3 + > > kernel/seccomp.c | 1 + > > tools/testing/selftests/seccomp/seccomp_bpf.c | 107 +++++++++++++++++++++++++- > > 5 files changed, 112 insertions(+), 1 deletion(-) > > > > diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c > > index e5ba56c0..a58dd04d 100644 > > --- a/arch/mips/kernel/ptrace.c > > +++ b/arch/mips/kernel/ptrace.c > > @@ -1277,6 +1277,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall) > > for (i = 0; i < 6; i++) > > sd.args[i] = args[i]; > > sd.instruction_pointer = KSTK_EIP(current); > > + sd.pkeys = 0; > > > > ret = __secure_computing(&sd); > > if (ret == -1) > > diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c > > index 3b2490b8..20c51bf2 100644 > > --- a/arch/x86/entry/common.c > > +++ b/arch/x86/entry/common.c > > @@ -98,6 +98,7 @@ static long syscall_trace_enter(struct pt_regs *regs) > > sd.arch = arch; > > sd.nr = regs->orig_ax; > > sd.instruction_pointer = regs->ip; > > + sd.pkeys = read_pkru(); > > #ifdef CONFIG_X86_64 > > if (arch == AUDIT_ARCH_X86_64) { > > sd.args[0] = regs->di; > > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h > > index 9efc0e73..3aa2d934 100644 > > --- a/include/uapi/linux/seccomp.h > > +++ b/include/uapi/linux/seccomp.h > > @@ -52,12 +52,15 @@ > > * @instruction_pointer: at the time of the system call. > > * @args: up to 6 system call arguments always stored as 64-bit values > > * regardless of the architecture. > > + * @pkeys: value of an architecture specific protection keys register > > + * (currently PKRU on x86) > > */ > > struct seccomp_data { > > int nr; > > __u32 arch; > > __u64 instruction_pointer; > > __u64 args[6]; > > + __u64 pkeys; > > }; > > > > #endif /* _UAPI_LINUX_SECCOMP_H */ > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > > index fd023ac2..dfb8b0d6 100644 > > --- a/kernel/seccomp.c > > +++ b/kernel/seccomp.c > > @@ -91,6 +91,7 @@ static void populate_seccomp_data(struct seccomp_data *sd) > > sd->args[4] = args[4]; > > sd->args[5] = args[5]; > > sd->instruction_pointer = KSTK_EIP(task); > > + sd->pkeys = 0; > > } > > > > /** > > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c > > index e1473234..f7f8fa6f 100644 > > --- a/tools/testing/selftests/seccomp/seccomp_bpf.c > > +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c > > @@ -82,6 +82,7 @@ struct seccomp_data { > > __u32 arch; > > __u64 instruction_pointer; > > __u64 args[6]; > > + __u64 pkeys; > > }; > > #endif > > > > @@ -732,7 +733,9 @@ TEST(KILL_process) > > TEST(arg_out_of_range) > > { > > struct sock_filter filter[] = { > > - BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)), > > + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, > > + offsetof(struct seccomp_data, pkeys) > > + + sizeof(__u64)), > > BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), > > }; > > struct sock_fprog prog = { > > @@ -2933,6 +2936,108 @@ skip: > > ASSERT_EQ(0, kill(pid, SIGKILL)); > > } > > > > +#if defined(__i386__) || defined(__x86_64__) > > +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, > > + unsigned int *ecx, unsigned int *edx) > > +{ > > + /* ecx is often an input as well as an output. */ > > + asm volatile( > > + "cpuid;" > > + : "=a" (*eax), > > + "=b" (*ebx), > > + "=c" (*ecx), > > + "=d" (*edx) > > + : "0" (*eax), "2" (*ecx)); > > +} > > + > > +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ > > +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ > > +#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ > > + > > +static inline int cpu_has_pku(void) > > +{ > > + unsigned int eax; > > + unsigned int ebx; > > + unsigned int ecx; > > + unsigned int edx; > > + > > + eax = 0x7; > > + ecx = 0x0; > > + __cpuid(&eax, &ebx, &ecx, &edx); > > + > > + if (!(ecx & X86_FEATURE_PKU)) > > + return 0; > > + if (!(ecx & X86_FEATURE_OSPKE)) > > + return 0; > > + return 1; > > +} > > + > > +static inline __u32 read_pkru(void) > > +{ > > + if (!cpu_has_pku()) > > + return 0; > > + > > + __u32 ecx = 0; > > + __u32 edx, pkru; > > + > > + /* > > + * "rdpkru" instruction. Places PKRU contents in to EAX, > > + * clears EDX and requires that ecx=0. > > + */ > > + asm volatile(".byte 0x0f,0x01,0xee\n\t" > > + : "=a" (pkru), "=d" (edx) > > + : "c" (ecx)); > > + return pkru; > > +} > > + > > +static inline void write_pkru(__u32 pkru) > > +{ > > + if (!cpu_has_pku()) > > + return; > > + > > + __u32 ecx = 0, edx = 0; > > + > > + /* > > + * "wrpkru" instruction. Loads contents in EAX to PKRU, > > + * requires that ecx = edx = 0. > > + */ > > + asm volatile(".byte 0x0f,0x01,0xef\n\t" > > + : : "a" (pkru), "c"(ecx), "d"(edx)); > > +} > > + > > +#define TEST_PKRU 0x55555550 > > + > > +TEST_SIGNAL(pkeys_set, SIGSYS) > > +{ > > + write_pkru(TEST_PKRU); > > + /* read back the written value because pkru might not be supported */ > > + __u32 pkru = read_pkru(); > > + > > + struct sock_filter filter[] = { > > + BPF_STMT(BPF_LD|BPF_W|BPF_ABS, > > + offsetof(struct seccomp_data, pkeys)), > > + BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, pkru, 1, 0), > > + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW), > > + BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL), > > + }; > > + struct sock_fprog prog = { > > + .len = (unsigned short)ARRAY_SIZE(filter), > > + .filter = filter, > > + }; > > + long ret; > > + > > + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); > > + ASSERT_EQ(0, ret); > > + > > + ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog); > > + ASSERT_EQ(0, ret); > > + > > + /* should never return. */ > > + EXPECT_EQ(0, syscall(__NR_getpid)); > > +} > > +#endif > > + > > + > > /* > > * TODO: > > * - add microbenchmarks > > -- > > 2.11.0 > > > > -- > Kees Cook -- Ram Pai