On Thu, May 31, 2018 at 08:49:46AM -0600, Tycho Andersen wrote: > This patch introduces a means for syscalls matched in seccomp to notify > some other task that a particular filter has been triggered. > > The motivation for this is primarily for use with containers. For example, > if a container does an init_module(), we obviously don't want to load this > untrusted code, which may be compiled for the wrong version of the kernel > anyway. Instead, we could parse the module image, figure out which module > the container is trying to load and load it on the host. > > As another example, containers cannot mknod(), since this checks > capable(CAP_SYS_ADMIN). However, harmless devices like /dev/null or > /dev/zero should be ok for containers to mknod, but we'd like to avoid hard > coding some whitelist in the kernel. Another example is mount(), which has > many security restrictions for good reason, but configuration or runtime > knowledge could potentially be used to relax these restrictions. > > This patch adds functionality that is already possible via at least two > other means that I know about, both of which involve ptrace(): first, one > could ptrace attach, and then iterate through syscalls via PTRACE_SYSCALL. > Unfortunately this is slow, so a faster version would be to install a > filter that does SECCOMP_RET_TRACE, which triggers a PTRACE_EVENT_SECCOMP. > Since ptrace allows only one tracer, if the container runtime is that > tracer, users inside the container (or outside) trying to debug it will not > be able to use ptrace, which is annoying. It also means that older > distributions based on Upstart cannot boot inside containers using ptrace, > since upstart itself uses ptrace to start services. > > The actual implementation of this is fairly small, although getting the > synchronization right was/is slightly complex. 
> > Finally, it's worth noting that the classic seccomp TOCTOU of reading > memory data from the task still applies here, but can be avoided with > careful design of the userspace handler: if the userspace handler reads all > of the task memory that is necessary before applying its security policy, > the tracee's subsequent memory edits will not be read by the tracer. > > v2: * make id a u64; the idea here being that it will never overflow, > because 64 is huge (one syscall every nanosecond => wrap every 584 > years) (Andy) > * prevent nesting of user notifications: if someone is already attached to > the tree in one place, nobody else can attach to the tree (Andy) > * notify the listener of signals the tracee receives as well (Andy) > * implement poll > v3: * lockdep fix (Oleg) > * drop unnecessary WARN()s (Christian) > * rearrange error returns to be more pretty (Christian) > * fix build in !CONFIG_SECCOMP_USER_NOTIFICATION case > > Signed-off-by: Tycho Andersen <tycho@xxxxxxxx> > CC: Kees Cook <keescook@xxxxxxxxxxxx> > CC: Andy Lutomirski <luto@xxxxxxxxxxxxxx> > CC: Oleg Nesterov <oleg@xxxxxxxxxx> > CC: Eric W. Biederman <ebiederm@xxxxxxxxxxxx> > CC: "Serge E. Hallyn" <serge@xxxxxxxxxx> > CC: Christian Brauner <christian.brauner@xxxxxxxxxx> > CC: Tyler Hicks <tyhicks@xxxxxxxxxxxxx> > CC: Akihiro Suda <suda.akihiro@xxxxxxxxxxxxx> > --- > arch/Kconfig | 7 + > include/linux/seccomp.h | 3 +- > include/uapi/linux/seccomp.h | 18 +- > kernel/seccomp.c | 398 +++++++++++++++++- > tools/testing/selftests/seccomp/seccomp_bpf.c | 195 ++++++++- > 5 files changed, 615 insertions(+), 6 deletions(-) > > diff --git a/arch/Kconfig b/arch/Kconfig > index 75dd23acf133..1c1ae8d8c8b9 100644 > --- a/arch/Kconfig > +++ b/arch/Kconfig > @@ -401,6 +401,13 @@ config SECCOMP_FILTER > > See Documentation/prctl/seccomp_filter.txt for details. 
> > +config SECCOMP_USER_NOTIFICATION > + bool "Enable the SECCOMP_RET_USER_NOTIF seccomp action" > + depends on SECCOMP_FILTER > + help > + Enable SECCOMP_RET_USER_NOTIF, a return code which can be used by seccomp > + programs to notify a userspace listener that a particular event happened. > + > config HAVE_GCC_PLUGINS > bool > help > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h > index c723a5c4e3ff..0fd3e0676a1c 100644 > --- a/include/linux/seccomp.h > +++ b/include/linux/seccomp.h > @@ -5,7 +5,8 @@ > #include <uapi/linux/seccomp.h> > > #define SECCOMP_FILTER_FLAG_MASK (SECCOMP_FILTER_FLAG_TSYNC | \ > - SECCOMP_FILTER_FLAG_LOG) > + SECCOMP_FILTER_FLAG_LOG | \ > + SECCOMP_FILTER_FLAG_GET_LISTENER) > > #ifdef CONFIG_SECCOMP > > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h > index 2a0bd9dd104d..8160e6cad528 100644 > --- a/include/uapi/linux/seccomp.h > +++ b/include/uapi/linux/seccomp.h > @@ -17,8 +17,9 @@ > #define SECCOMP_GET_ACTION_AVAIL 2 > > /* Valid flags for SECCOMP_SET_MODE_FILTER */ > -#define SECCOMP_FILTER_FLAG_TSYNC 1 > -#define SECCOMP_FILTER_FLAG_LOG 2 > +#define SECCOMP_FILTER_FLAG_TSYNC 1 > +#define SECCOMP_FILTER_FLAG_LOG 2 > +#define SECCOMP_FILTER_FLAG_GET_LISTENER 4 > > /* > * All BPF programs must return a 32-bit value. 
> @@ -34,6 +35,7 @@ > #define SECCOMP_RET_KILL SECCOMP_RET_KILL_THREAD > #define SECCOMP_RET_TRAP 0x00030000U /* disallow and force a SIGSYS */ > #define SECCOMP_RET_ERRNO 0x00050000U /* returns an errno */ > +#define SECCOMP_RET_USER_NOTIF 0x7fc00000U /* notifies userspace */ > #define SECCOMP_RET_TRACE 0x7ff00000U /* pass to a tracer or disallow */ > #define SECCOMP_RET_LOG 0x7ffc0000U /* allow after logging */ > #define SECCOMP_RET_ALLOW 0x7fff0000U /* allow */ > @@ -59,4 +61,16 @@ struct seccomp_data { > __u64 args[6]; > }; > > +struct seccomp_notif { > + __u64 id; > + pid_t pid; > + struct seccomp_data data; > +}; > + > +struct seccomp_notif_resp { > + __u64 id; > + __s32 error; > + __s64 val; > +}; > + > #endif /* _UAPI_LINUX_SECCOMP_H */ > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index dc77548167ef..f69327d5f7c7 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -31,6 +31,7 @@ > #endif > > #ifdef CONFIG_SECCOMP_FILTER > +#include <linux/file.h> > #include <linux/filter.h> > #include <linux/pid.h> > #include <linux/ptrace.h> > @@ -38,6 +39,52 @@ > #include <linux/tracehook.h> > #include <linux/uaccess.h> > > +#ifdef CONFIG_SECCOMP_USER_NOTIFICATION > +#include <linux/anon_inodes.h> > + > +enum notify_state { > + SECCOMP_NOTIFY_INIT, > + SECCOMP_NOTIFY_SENT, > + SECCOMP_NOTIFY_REPLIED, > +}; > + > +struct seccomp_knotif { > + /* The pid whose filter triggered the notification */ > + pid_t pid; > + > + /* > + * The "cookie" for this request; this is unique for this filter. > + */ > + u32 id; Perhaps /* The "cookie" for this request; this is unique for this filter. */ Epic patch review :) thanks, Tobin. _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers