[RFC PATCH 1/1] seccomp: provide information about the previous syscall

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This patch allows an application to restrict the order in which
its system calls may be requested. To do that, we provide
seccomp-BPF filters with the number of the previously requested
system call.

An example use case is detecting (and stopping) return-oriented
programming (ROP) attacks that disturb the normal execution flow
of a user program.

Signed-off-by: Daniel Sangorrin <daniel.sangorrin@xxxxxxxxxxxxx>
---
 include/linux/seccomp.h      |   2 +
 include/uapi/linux/seccomp.h |   2 +
 kernel/seccomp.c             |  10 +++
 samples/seccomp/.gitignore   |   1 +
 samples/seccomp/Makefile     |   9 ++-
 samples/seccomp/bpf-prev.c   | 160 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 samples/seccomp/bpf-prev.c

diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 2296e6b..8c6de6d 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -16,6 +16,7 @@ struct seccomp_filter;
  *
  * @mode:  indicates one of the valid values above for controlled
  *         system calls available to a process.
+ * @prev_nr: stores the previous system call number.
  * @filter: must always point to a valid seccomp-filter or NULL as it is
  *          accessed without locking during system call entry.
  *
@@ -24,6 +25,7 @@ struct seccomp_filter;
  */
 struct seccomp {
 	int mode;
+	int prev_nr;
 	struct seccomp_filter *filter;
 };
 
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0f238a4..42775dc 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -38,6 +38,7 @@
 /**
  * struct seccomp_data - the format the BPF program executes over.
  * @nr: the system call number
+ * @prev_nr: the previous system call number
  * @arch: indicates system call convention as an AUDIT_ARCH_* value
  *        as defined in <linux/audit.h>.
  * @instruction_pointer: at the time of the system call.
@@ -46,6 +47,7 @@
  */
 struct seccomp_data {
 	int nr;
+	int prev_nr;
 	__u32 arch;
 	__u64 instruction_pointer;
 	__u64 args[6];
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 580ac2d..98b2c9d3 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -190,6 +190,8 @@ static u32 seccomp_run_filters(struct seccomp_data *sd)
 		sd = &sd_local;
 	}
 
+	sd->prev_nr = current->seccomp.prev_nr;
+
 	/*
 	 * All filters in the list are evaluated and the lowest BPF return
 	 * value always takes priority (ignoring the DATA).
@@ -200,6 +202,9 @@ static u32 seccomp_run_filters(struct seccomp_data *sd)
 		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
 			ret = cur_ret;
 	}
+
+	current->seccomp.prev_nr = sd->nr;
+
 	return ret;
 }
 #endif /* CONFIG_SECCOMP_FILTER */
@@ -443,6 +448,11 @@ static long seccomp_attach_filter(unsigned int flags,
 			return ret;
 	}
 
+	/* Initialize the prev_nr field only once */
+	if (current->seccomp.filter == NULL)
+		current->seccomp.prev_nr =
+			syscall_get_nr(current, task_pt_regs(current));
+
 	/*
 	 * If there is an existing filter, make it the prev and don't drop its
 	 * task reference.
diff --git a/samples/seccomp/.gitignore b/samples/seccomp/.gitignore
index 78fb781..11dda7a 100644
--- a/samples/seccomp/.gitignore
+++ b/samples/seccomp/.gitignore
@@ -1,3 +1,4 @@
 bpf-direct
 bpf-fancy
 dropper
+bpf-prev
diff --git a/samples/seccomp/Makefile b/samples/seccomp/Makefile
index 1b4e4b8..b50821c 100644
--- a/samples/seccomp/Makefile
+++ b/samples/seccomp/Makefile
@@ -1,7 +1,7 @@
 # kbuild trick to avoid linker error. Can be omitted if a module is built.
 obj- := dummy.o
 
-hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct
+hostprogs-$(CONFIG_SECCOMP_FILTER) := bpf-fancy dropper bpf-direct bpf-prev
 
 HOSTCFLAGS_bpf-fancy.o += -I$(objtree)/usr/include
 HOSTCFLAGS_bpf-fancy.o += -idirafter $(objtree)/include
@@ -17,6 +17,11 @@ HOSTCFLAGS_bpf-direct.o += -I$(objtree)/usr/include
 HOSTCFLAGS_bpf-direct.o += -idirafter $(objtree)/include
 bpf-direct-objs := bpf-direct.o
 
+HOSTCFLAGS_bpf-prev.o += -I$(objtree)/usr/include
+HOSTCFLAGS_bpf-prev.o += -idirafter $(objtree)/include
+bpf-prev-objs := bpf-prev.o
+
+
 # Try to match the kernel target.
 ifndef CROSS_COMPILE
 ifndef CONFIG_64BIT
@@ -29,10 +34,12 @@ MFLAG = -m31
 endif
 
 HOSTCFLAGS_bpf-direct.o += $(MFLAG)
+HOSTCFLAGS_bpf-prev.o += $(MFLAG)
 HOSTCFLAGS_dropper.o += $(MFLAG)
 HOSTCFLAGS_bpf-helper.o += $(MFLAG)
 HOSTCFLAGS_bpf-fancy.o += $(MFLAG)
 HOSTLOADLIBES_bpf-direct += $(MFLAG)
+HOSTLOADLIBES_bpf-prev += $(MFLAG)
 HOSTLOADLIBES_bpf-fancy += $(MFLAG)
 HOSTLOADLIBES_dropper += $(MFLAG)
 endif
diff --git a/samples/seccomp/bpf-prev.c b/samples/seccomp/bpf-prev.c
new file mode 100644
index 0000000..138c584
--- /dev/null
+++ b/samples/seccomp/bpf-prev.c
@@ -0,0 +1,160 @@
+/*
+ * Seccomp BPF example that uses information about the previous syscall.
+ *
+ * Copyright (C) 2015 TOSHIBA corp.
+ * Author: Daniel Sangorrin <daniel.sangorrin@xxxxxxxxx>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl or seccomp.
+ */
+#if defined(__x86_64__)
+#define SUPPORTED_ARCH 1
+#endif
+
+#if defined(SUPPORTED_ARCH)
+#define __USE_GNU 1
+#define _GNU_SOURCE 1
+
+#include <linux/filter.h>
+/* NOTE: make sure seccomp_data in /usr/include/linux/seccomp.h has prev_nr */
+#include <linux/seccomp.h>
+#include <linux/unistd.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <sys/prctl.h>
+#include <unistd.h>
+#include <sys/msg.h>
+#include <assert.h>
+
+#define MSGPERM		0600
+#define MTEXTSIZE	128
+#define MTYPE		1
+
+struct msg_buf {
+	long mtype;		/* message type; main() sets it to MTYPE */
+	char mtext[MTEXTSIZE];	/* message text, MTEXTSIZE bytes per message */
+};
+
+#define syscall_nr (offsetof(struct seccomp_data, nr))	/* offset of nr */
+#define prev_nr (offsetof(struct seccomp_data, prev_nr)) /* offset of prev_nr */
+
+#define EXAMINE_SYSCALL \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, syscall_nr) /* load seccomp_data.nr */
+
+#define EXAMINE_PREV_SYSCALL \
+	BPF_STMT(BPF_LD+BPF_W+BPF_ABS, prev_nr) /* load seccomp_data.prev_nr */
+
+#define KILL_PROCESS \
+	BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL) /* return SECCOMP_RET_KILL */
+
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#endif
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+static int install_syscall_filter(void)
+{
+	/* allow msgrcv only when prev_nr is prctl, msgsnd or clone */
+	struct sock_filter filter[] = {
+		EXAMINE_SYSCALL,
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_msgrcv, 1, 0),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), /* not msgrcv */
+		EXAMINE_PREV_SYSCALL,
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_prctl, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), /* after prctl */
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_msgsnd, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), /* after msgsnd */
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_clone, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), /* after clone */
+		KILL_PROCESS, /* msgrcv after any other syscall */
+	};
+	struct sock_fprog prog = {
+		.len = ARRAY_SIZE(filter),
+		.filter = filter,
+	};
+
+	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+		perror("prctl(NO_NEW_PRIVS)");
+		return 1; /* non-zero tells the caller installation failed */
+	}
+
+	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {
+		perror("prctl(SECCOMP)");
+		return 1;
+	}
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	long ret;
+	int id;
+	struct msg_buf send, recv;
+
+	id = syscall(__NR_msgget, IPC_PRIVATE, MSGPERM | IPC_CREAT | IPC_EXCL);
+	assert(id >= 0);
+
+	send.mtype = MTYPE;
+	snprintf(send.mtext, MTEXTSIZE, "hello");
+	printf("parent msgsnd: %s\n", send.mtext);
+	ret = syscall(__NR_msgsnd, id, &send, MTEXTSIZE, 0);
+	assert(ret == 0);
+
+	ret = install_syscall_filter();
+	assert(ret == 0); /* the tests below are meaningless without the filter */
+	/* TEST 1: msgrcv can be executed after prctl */
+	ret = syscall(__NR_msgrcv, id, &recv, MTEXTSIZE, MTYPE, 0);
+	assert(ret == MTEXTSIZE);
+	printf("parent msgrcv after prctl: %s (%ld bytes)\n", recv.mtext, ret);
+
+	snprintf(send.mtext, MTEXTSIZE, "world");
+	printf("parent msgsnd: %s\n", send.mtext);
+	ret = syscall(__NR_msgsnd, id, &send, MTEXTSIZE, 0);
+	assert(ret == 0);
+
+	/* TEST 2: msgrcv can be executed after msgsnd */
+	ret = syscall(__NR_msgrcv, id, &recv, MTEXTSIZE, MTYPE, 0);
+	assert(ret == MTEXTSIZE);
+	printf("parent msgrcv after msgsnd: %s (%ld bytes)\n", recv.mtext, ret);
+
+	snprintf(send.mtext, MTEXTSIZE, "this is mars");
+	printf("parent msgsnd: %s\n", send.mtext);
+	ret = syscall(__NR_msgsnd, id, &send, MTEXTSIZE, 0);
+	assert(ret == 0);
+
+	pid_t pid = fork();
+
+	if (pid == 0) {
+		/* TEST 3a: msgrcv can be executed after clone */
+		ret = syscall(__NR_msgrcv, id, &recv, MTEXTSIZE, MTYPE, 0);
+		assert(ret == MTEXTSIZE);
+		printf("child msgrcv after clone: %s (%ld bytes)\n",
+		       recv.mtext, ret);
+		_exit(0);
+	} else if (pid > 0) {
+		int status;
+
+		pid = wait(&status);
+		printf("parent: child %d exited with status %d\n", pid, status);
+		/* TEST 3b: msgrcv can NOT be executed after write (dmesg) */
+		syscall(__NR_write, STDOUT_FILENO, "Should fail: ", 13);
+		syscall(__NR_msgrcv, id, &recv, MTEXTSIZE, MTYPE, 0);
+		return 0;
+	}
+
+	assert(0); /* should never arrive here */
+
+	return 0;
+}
+#else	/* SUPPORTED_ARCH */
+/*
+ * This sample has been tested on x86_64. Other architectures will result in
+ * using only the main() below.
+ */
+int main(void)
+{
+	return 1;	/* unsupported architecture: exit with non-zero status */
+}
+#endif	/* SUPPORTED_ARCH */
-- 
2.1.4


--
To unsubscribe from this list: send the line "unsubscribe linux-api" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux