An alternative design would be to extend (c)BPF with a new instruction
that reads the state of a protection key. That would give user space a
somewhat simpler interface, since a BPF program would not need to deal
with the architecture specific pkeys field in seccomp_data. It is
unclear how much of an advantage that really is, though, because the nr
field in seccomp_data is already architecture specific. Adding a new
(c)BPF instruction is also more invasive than this patch and might be a
breaking change.
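For illustration, a filter using the new field could look like the
following minimal sketch. It assumes a UAPI header that already has the
new pkeys member, and EXPECTED_PKRU is just a placeholder for whatever
PKRU value the application wants to require; the BPF_W load picks up
the low 32 bits of the 64-bit pkeys field, which is where the PKRU
value lives on x86:

    #include <stddef.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>

    #define EXPECTED_PKRU 0x55555550 /* placeholder value */

    /* Allow the syscall only while PKRU matches the expected value. */
    struct sock_filter filter[] = {
            BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
                     offsetof(struct seccomp_data, pkeys)),
            BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, EXPECTED_PKRU, 0, 1),
            BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
            BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
    };

The selftest added at the end of this patch installs essentially the
same filter (with the actions inverted) to check that the field is
visible to seccomp filters.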
Results of selftests/seccomp_benchmark.c on an x86 machine with pkeys
support:
With patch:
Benchmarking 33554432 samples...
28.019505558 - 18.676858522 = 9342647036
getpid native: 278 ns
42.279109885 - 28.019657031 = 14259452854
getpid RET_ALLOW: 424 ns
Estimated seccomp overhead per syscall: 146 ns
Without patch:
Benchmarking 33554432 samples...
28.059619466 - 18.706769155 = 9352850311
getpid native: 278 ns
42.299228279 - 28.059761804 = 14239466475
getpid RET_ALLOW: 424 ns
Estimated seccomp overhead per syscall: 146 ns
Cc: Kees Cook <keescook@xxxxxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
Cc: Will Drewry <wad@xxxxxxxxxxxx>
Cc: Ram Pai <linuxram@xxxxxxxxxx>
Cc: linuxppc-dev@xxxxxxxxxxxxxxxx
Cc: linux-api@xxxxxxxxxxxxxxx
Signed-off-by: Michael Sammler <msammler@xxxxxxxxxxx>
---
Changes to the previous version:
- added motivation, notes about POWER, alternative design and benchmark results to the commit log
- renamed pkru field in seccomp_data to pkeys
- changed size of pkru field to __u64 and removed reserved field
- added test for x86
arch/mips/kernel/ptrace.c | 1 +
arch/x86/entry/common.c | 1 +
include/uapi/linux/seccomp.h | 3 +
kernel/seccomp.c | 1 +
tools/testing/selftests/seccomp/seccomp_bpf.c | 107 +++++++++++++++++++++++++-
5 files changed, 112 insertions(+), 1 deletion(-)
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index e5ba56c0..a58dd04d 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -1277,6 +1277,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
for (i = 0; i < 6; i++)
sd.args[i] = args[i];
sd.instruction_pointer = KSTK_EIP(current);
+ sd.pkeys = 0;
ret = __secure_computing(&sd);
if (ret == -1)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3b2490b8..20c51bf2 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -98,6 +98,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
sd.arch = arch;
sd.nr = regs->orig_ax;
sd.instruction_pointer = regs->ip;
+ sd.pkeys = read_pkru();
#ifdef CONFIG_X86_64
if (arch == AUDIT_ARCH_X86_64) {
sd.args[0] = regs->di;
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 9efc0e73..3aa2d934 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -52,12 +52,15 @@
* @instruction_pointer: at the time of the system call.
* @args: up to 6 system call arguments always stored as 64-bit values
* regardless of the architecture.
+ * @pkeys: value of an architecture specific protection keys register
+ * (currently PKRU on x86)
*/
struct seccomp_data {
int nr;
__u32 arch;
__u64 instruction_pointer;
__u64 args[6];
+ __u64 pkeys;
};
#endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index fd023ac2..dfb8b0d6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -91,6 +91,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
sd->args[4] = args[4];
sd->args[5] = args[5];
sd->instruction_pointer = KSTK_EIP(task);
+ sd->pkeys = 0;
}
/**
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index e1473234..f7f8fa6f 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -82,6 +82,7 @@ struct seccomp_data {
__u32 arch;
__u64 instruction_pointer;
__u64 args[6];
+ __u64 pkeys;
};
#endif
@@ -732,7 +733,9 @@ TEST(KILL_process)
TEST(arg_out_of_range)
{
struct sock_filter filter[] = {
- BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, pkeys)
+ + sizeof(__u64)),
BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
};
struct sock_fprog prog = {
@@ -2933,6 +2936,108 @@ skip:
ASSERT_EQ(0, kill(pid, SIGKILL));
}
+#if defined(__i386__) || defined(__x86_64__)
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile(
+ "cpuid;"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
+#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */
+
+static inline int cpu_has_pku(void)
+{
+ unsigned int eax;
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+
+ eax = 0x7;
+ ecx = 0x0;
+ __cpuid(&eax, &ebx, &ecx, &edx);
+
+ if (!(ecx & X86_FEATURE_PKU))
+ return 0;
+ if (!(ecx & X86_FEATURE_OSPKE))
+ return 0;
+ return 1;
+}
+
+static inline __u32 read_pkru(void)
+{
+ if (!cpu_has_pku())
+ return 0;
+
+ __u32 ecx = 0;
+ __u32 edx, pkru;
+
+ /*
+ * "rdpkru" instruction. Places PKRU contents in to EAX,
+ * clears EDX and requires that ecx=0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xee\n\t"
+ : "=a" (pkru), "=d" (edx)
+ : "c" (ecx));
+ return pkru;
+}
+
+static inline void write_pkru(__u32 pkru)
+{
+ if (!cpu_has_pku())
+ return;
+
+ __u32 ecx = 0, edx = 0;
+
+ /*
+ * "wrpkru" instruction. Loads contents in EAX to PKRU,
+ * requires that ecx = edx = 0.
+ */
+ asm volatile(".byte 0x0f,0x01,0xef\n\t"
+ : : "a" (pkru), "c"(ecx), "d"(edx));
+}
+
+#define TEST_PKRU 0x55555550
+
+TEST_SIGNAL(pkeys_set, SIGSYS)
+{
+ write_pkru(TEST_PKRU);
+ /* read back the written value because pkru might not be supported */
+ __u32 pkru = read_pkru();
+
+ struct sock_filter filter[] = {
+ BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+ offsetof(struct seccomp_data, pkeys)),
+ BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, pkru, 1, 0),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+ BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+ };
+ struct sock_fprog prog = {
+ .len = (unsigned short)ARRAY_SIZE(filter),
+ .filter = filter,
+ };
+ long ret;
+
+ ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+ ASSERT_EQ(0, ret);
+
+ ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+ ASSERT_EQ(0, ret);
+
+ /* should never return. */
+ EXPECT_EQ(0, syscall(__NR_getpid));
+}
+#endif
+
+
/*
* TODO:
* - add microbenchmarks
--
2.11.0