[RFC PATCH] seccomp: Add protection keys into seccomp_data

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Add the current value of an architecture specific protection keys
register (currently PKRU on x86) to data available for seccomp-bpf
programs to work on. This allows filters based on the currently
enabled protection keys.

Support for protection keys on the POWER architecture is not part of
this patch since I do not have access to a PowerPC, but adding support
for it can be achieved by setting sd->pkeys to the AMR register in
populate_seccomp_data.

One use case for this patch is disabling unnecessary system calls for a
library (e.g. network i/o for a crypto library) while the library runs
without disabling the system calls for the whole program (by changing
the protection keys before and after the library executes). Using this
one could ensure that the library behaves a expected (e.g. the crypto
library not sending private keys to a malicious server).

This patch also enables lightweight sandboxing of untrusted code using
memory protection keys: Protection keys provide memory isolation but
for a sandbox system call isolation is needed as well. This patch
allows writing a seccomp filter to prevent system calls by the
untrusted code while still allowing system calls for the trusted code.

An alternative design would be to extend (c)BPF with a new instruction
to read the state of a protection key. This alternate design would
provide a simpler interface to the user space since the BPF program
would not need to deal with the architecture specific pkeys field in
seccomp_data, but the question is, how much of an advantage this would
be as the nr field in seccomp_data is already architecture specific.
Adding a new instruction for BPF programs is more complicated than
this patch and might be a breaking change.

Results of selftests/seccomp_benchmark.c on a x86 machine with pkeys
support:

With patch:
Benchmarking 33554432 samples...
28.019505558 - 18.676858522 = 9342647036
getpid native: 278 ns
42.279109885 - 28.019657031 = 14259452854
getpid RET_ALLOW: 424 ns
Estimated seccomp overhead per syscall: 146 ns

Without patch:
Benchmarking 33554432 samples...
28.059619466 - 18.706769155 = 9352850311
getpid native: 278 ns
42.299228279 - 28.059761804 = 14239466475
getpid RET_ALLOW: 424 ns
Estimated seccomp overhead per syscall: 146 ns

Cc: Kees Cook <keescook@xxxxxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
Cc: Will Drewry <wad@xxxxxxxxxxxx>
Cc: Ram Pai <linuxram@xxxxxxxxxx>
Cc: linuxppc-dev@xxxxxxxxxxxxxxxx
Cc: linux-api@xxxxxxxxxxxxxxx
Signed-off-by: Michael Sammler <msammler@xxxxxxxxxxx>
---
Changes to the previous version:
- added motivation, notes about POWER, alternative design and benchmark results to the commit log
- renamed pkru field in seccomp_data to pkeys
- changed size of pkru field to __u64 and removed reserved field
- added test for x86

 arch/mips/kernel/ptrace.c                     |   1 +
 arch/x86/entry/common.c                       |   1 +
 include/uapi/linux/seccomp.h                  |   3 +
 kernel/seccomp.c                              |   1 +
 tools/testing/selftests/seccomp/seccomp_bpf.c | 107 +++++++++++++++++++++++++-
 5 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index e5ba56c0..a58dd04d 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -1277,6 +1277,7 @@ asmlinkage long syscall_trace_enter(struct pt_regs *regs, long syscall)
 		for (i = 0; i < 6; i++)
 			sd.args[i] = args[i];
 		sd.instruction_pointer = KSTK_EIP(current);
+		sd.pkeys = 0;

 		ret = __secure_computing(&sd);
 		if (ret == -1)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3b2490b8..20c51bf2 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -98,6 +98,7 @@ static long syscall_trace_enter(struct pt_regs *regs)
 		sd.arch = arch;
 		sd.nr = regs->orig_ax;
 		sd.instruction_pointer = regs->ip;
+		sd.pkeys = read_pkru();
 #ifdef CONFIG_X86_64
 		if (arch == AUDIT_ARCH_X86_64) {
 			sd.args[0] = regs->di;
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 9efc0e73..3aa2d934 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -52,12 +52,15 @@
  * @instruction_pointer: at the time of the system call.
  * @args: up to 6 system call arguments always stored as 64-bit values
  *        regardless of the architecture.
+ * @pkeys: value of an architecture specific protection keys register
+ *         (currently PKRU on x86)
  */
 struct seccomp_data {
 	int nr;
 	__u32 arch;
 	__u64 instruction_pointer;
 	__u64 args[6];
+	__u64 pkeys;
 };

 #endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index fd023ac2..dfb8b0d6 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -91,6 +91,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
 	sd->args[4] = args[4];
 	sd->args[5] = args[5];
 	sd->instruction_pointer = KSTK_EIP(task);
+	sd->pkeys = 0;
 }

 /**
diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
index e1473234..f7f8fa6f 100644
--- a/tools/testing/selftests/seccomp/seccomp_bpf.c
+++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
@@ -82,6 +82,7 @@ struct seccomp_data {
 	__u32 arch;
 	__u64 instruction_pointer;
 	__u64 args[6];
+	__u64 pkeys;
 };
 #endif

@@ -732,7 +733,9 @@ TEST(KILL_process)
 TEST(arg_out_of_range)
 {
 	struct sock_filter filter[] = {
-		BPF_STMT(BPF_LD|BPF_W|BPF_ABS, syscall_arg(6)),
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, pkeys)
+				+ sizeof(__u64)),
 		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
 	};
 	struct sock_fprog prog = {
@@ -2933,6 +2936,108 @@ skip:
 	ASSERT_EQ(0, kill(pid, SIGKILL));
 }

+#if defined(__i386__) || defined(__x86_64__)
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+		unsigned int *ecx, unsigned int *edx)
+{
+	/* ecx is often an input as well as an output. */
+	asm volatile(
+		"cpuid;"
+		: "=a" (*eax),
+		  "=b" (*ebx),
+		  "=c" (*ecx),
+		  "=d" (*edx)
+		: "0" (*eax), "2" (*ecx));
+}
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */
+#define X86_FEATURE_PKU        (1<<3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE      (1<<4) /* OS Protection Keys Enable */
+
+static inline int cpu_has_pku(void)
+{
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+
+	eax = 0x7;
+	ecx = 0x0;
+	__cpuid(&eax, &ebx, &ecx, &edx);
+
+	if (!(ecx & X86_FEATURE_PKU))
+		return 0;
+	if (!(ecx & X86_FEATURE_OSPKE))
+		return 0;
+	return 1;
+}
+
+static inline __u32 read_pkru(void)
+{
+	if (!cpu_has_pku())
+		return 0;
+
+	__u32 ecx = 0;
+	__u32 edx, pkru;
+
+	/*
+	 * "rdpkru" instruction.  Places PKRU contents in to EAX,
+	 * clears EDX and requires that ecx=0.
+	 */
+	asm volatile(".byte 0x0f,0x01,0xee\n\t"
+		     : "=a" (pkru), "=d" (edx)
+		     : "c" (ecx));
+	return pkru;
+}
+
+static inline void write_pkru(__u32 pkru)
+{
+	if (!cpu_has_pku())
+		return;
+
+	__u32 ecx = 0, edx = 0;
+
+	/*
+	 * "wrpkru" instruction.  Loads contents in EAX to PKRU,
+	 * requires that ecx = edx = 0.
+	 */
+	asm volatile(".byte 0x0f,0x01,0xef\n\t"
+		     : : "a" (pkru), "c"(ecx), "d"(edx));
+}
+
+#define TEST_PKRU 0x55555550
+
+TEST_SIGNAL(pkeys_set, SIGSYS)
+{
+	write_pkru(TEST_PKRU);
+	/* read back the written value because pkru might not be supported */
+	__u32 pkru = read_pkru();
+
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD|BPF_W|BPF_ABS,
+			offsetof(struct seccomp_data, pkeys)),
+		BPF_JUMP(BPF_JMP|BPF_JEQ|BPF_K, pkru, 1, 0),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_ALLOW),
+		BPF_STMT(BPF_RET|BPF_K, SECCOMP_RET_KILL),
+	};
+	struct sock_fprog prog = {
+		.len = (unsigned short)ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+	long ret;
+
+	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
+	ASSERT_EQ(0, ret);
+
+	ret = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog);
+	ASSERT_EQ(0, ret);
+
+	/* should never return. */
+	EXPECT_EQ(0, syscall(__NR_getpid));
+}
+#endif
+
+
 /*
  * TODO:
  * - add microbenchmarks
--
2.11.0



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux