On Sun, Feb 02, 2025 at 08:29:20AM -0800, Eyal Birger wrote: > When attaching uretprobes to processes running inside docker, the attached > process segfaults when encountering the uretprobe. > > The reason is that now that uretprobe is a system call, the default seccomp > filters in docker block it, as they only allow a specific set of known > syscalls. This is also true for other userspace applications which use seccomp > to control their syscall surface. > > Since uretprobe is a "kernel implementation detail" system call which is > not used by userspace application code directly, it is impractical and > there's very little point in forcing all userspace applications to > explicitly allow it in order to avoid crashing tracked processes. > > Pass this system call through seccomp without depending on configuration. > > Note: uretprobe isn't supported in i386 and __NR_ia32_rt_tgsigqueueinfo > uses the same number as __NR_uretprobe so the syscall isn't forced in the > compat bitmap. > > Fixes: ff474a78cef5 ("uprobe: Add uretprobe syscall to speed up return probe") > Reported-by: Rafael Buchbinder <rafi@xxxxxx> > Link: https://lore.kernel.org/lkml/CAHsH6Gs3Eh8DFU0wq58c_LF8A4_+o6z456J7BidmcVY2AqOnHQ@xxxxxxxxxxxxxx/ > Link: https://lore.kernel.org/lkml/20250121182939.33d05470@xxxxxxxxxxxxxxxxxx/T/#me2676c378eff2d6a33f3054fed4a5f3afa64e65b > Link: https://lore.kernel.org/lkml/20250128145806.1849977-1-eyal.birger@xxxxxxxxx/ > Cc: stable@xxxxxxxxxxxxxxx > Signed-off-by: Eyal Birger <eyal.birger@xxxxxxxxx> > --- > v3: no change - deferring 32-bit compat handling as there aren't plans to > support this syscall in compat mode. 
> v2: use action_cache bitmap and mode1 array to check the syscall > --- > kernel/seccomp.c | 24 +++++++++++++++++++++--- > 1 file changed, 21 insertions(+), 3 deletions(-) > > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index f59381c4a2ff..09b6f8e6db51 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -734,13 +734,13 @@ seccomp_prepare_user_filter(const char __user *user_filter) > > #ifdef SECCOMP_ARCH_NATIVE > /** > - * seccomp_is_const_allow - check if filter is constant allow with given data > + * seccomp_is_filter_const_allow - check if filter is constant allow with given data > * @fprog: The BPF programs > * @sd: The seccomp data to check against, only syscall number and arch > * number are considered constant. > */ > -static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, > - struct seccomp_data *sd) > +static bool seccomp_is_filter_const_allow(struct sock_fprog_kern *fprog, > + struct seccomp_data *sd) > { > unsigned int reg_value = 0; > unsigned int pc; > @@ -812,6 +812,21 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, > return false; > } > > +static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, > + struct seccomp_data *sd) > +{ > +#ifdef __NR_uretprobe > + if (sd->nr == __NR_uretprobe > +#ifdef SECCOMP_ARCH_COMPAT > + && sd->arch != SECCOMP_ARCH_COMPAT > +#endif > + ) > + return true; > +#endif > + > + return seccomp_is_filter_const_allow(fprog, sd); > +} > + > static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, > void *bitmap, const void *bitmap_prev, > size_t bitmap_size, int arch) I minimized the above to: @@ -749,6 +749,15 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, if (WARN_ON_ONCE(!fprog)) return false; + /* Our single exception to filtering. 
*/ +#ifdef __NR_uretprobe +#ifdef SECCOMP_ARCH_COMPAT + if (sd->arch == SECCOMP_ARCH_NATIVE) +#endif + if (sd->nr == __NR_uretprobe) + return true; +#endif + for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; u16 code = insn->code; > @@ -1023,6 +1038,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action, > */ > static const int mode1_syscalls[] = { > __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, > +#ifdef __NR_uretprobe > + __NR_uretprobe, > +#endif > -1, /* negative terminated */ > }; > > -- > 2.43.0 > -Kees -- Kees Cook