The seccomp filter used to be released in free_task() which is called asynchronously via call_rcu() and assorted mechanisms. Since we need to inform tasks waiting on the seccomp notifier when a filter goes empty we will notify them as soon as a task has been marked fully dead in release_task(). To not split seccomp cleanup into two parts, move filter release out of free_task() and into release_task() after we've unhashed struct task from struct pid, exited signals, and unlinked it from the threadgroups' thread list. We'll put the empty filter notification infrastructure into it in a follow up patch. This also renames put_seccomp_filter() to seccomp_filter_release() which is a more descriptive name of what we're doing here especially once we've added the empty filter notification mechanism in there. We're also NULL-ing the task's filter tree entrypoint which seems cleaner than leaving a dangling pointer in there. Note that this shouldn't need any memory barriers since we're calling this when the task is in release_task() which means it's EXIT_DEAD. So it can't modify it's seccomp filters anymore. You can also see this from the point where we're calling seccomp_filter_release(). It's after __exit_signal() and at this point, tsk->sighand will already have been NULLed which is required for thread-sync and filter installation alike. Cc: Tycho Andersen <tycho@xxxxxxxx> Cc: Kees Cook <keescook@xxxxxxxxxxxx> Cc: Matt Denton <mpdenton@xxxxxxxxxx> Cc: Sargun Dhillon <sargun@xxxxxxxxx> Cc: Jann Horn <jannh@xxxxxxxxxx> Cc: Chris Palmer <palmer@xxxxxxxxxx> Cc: Aleksa Sarai <cyphar@xxxxxxxxxx> Cc: Robert Sesek <rsesek@xxxxxxxxxx> Cc: Jeffrey Vander Stoep <jeffv@xxxxxxxxxx> Cc: Linux Containers <containers@xxxxxxxxxxxxxxxxxxxxxxxxxx> Signed-off-by: Christian Brauner <christian.brauner@xxxxxxxxxx> --- /* v2 */ patch not present /* v3 */ patch introduced --- include/linux/seccomp.h | 4 +-- kernel/exit.c | 1 + kernel/fork.c | 1 - kernel/seccomp.c | 60 ++++++++++++++++++++++++----------------- 4 files changed, 38 insertions(+), 28 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 4192369b8418..1644922af19a 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -82,10 +82,10 @@ static inline int seccomp_mode(struct seccomp *s) #endif /* CONFIG_SECCOMP */ #ifdef CONFIG_SECCOMP_FILTER -extern void put_seccomp_filter(struct task_struct *tsk); +extern void seccomp_filter_release(struct task_struct *tsk); extern void get_seccomp_filter(struct task_struct *tsk); #else /* CONFIG_SECCOMP_FILTER */ -static inline void put_seccomp_filter(struct task_struct *tsk) +static inline void seccomp_filter_release(struct task_struct *tsk) { return; } diff --git a/kernel/exit.c b/kernel/exit.c index ce2a75bc0ade..5490cc04f436 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -218,6 +218,7 @@ void release_task(struct task_struct *p) } write_unlock_irq(&tasklist_lock); + seccomp_filter_release(p); proc_flush_pid(thread_pid); put_pid(thread_pid); release_thread(p); diff --git a/kernel/fork.c b/kernel/fork.c index 48ed22774efa..b948a14da94f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -471,7 +471,6 @@ void free_task(struct task_struct *tsk) #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); - put_seccomp_filter(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 0ba2d6d0800f..55251b1fe03f 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -367,6 +367,40 @@ static inline pid_t seccomp_can_sync_threads(void) return 0; } +static inline void seccomp_filter_free(struct seccomp_filter *filter) +{ + if (filter) { + bpf_prog_destroy(filter->prog); + kfree(filter); + } +} + +static void __put_seccomp_filter(struct seccomp_filter *orig) +{ + /* Clean up single-reference branches iteratively. */ + while (orig && refcount_dec_and_test(&orig->refs)) { + struct seccomp_filter *freeme = orig; + orig = orig->prev; + seccomp_filter_free(freeme); + } +} + +/** + * seccomp_filter_release - Detach the task from its filter tree + * and drop its reference count during + * exit. + * + * This function should only be called when the task is exiting as + * it detaches it from its filter tree. + */ +void seccomp_filter_release(struct task_struct *tsk) +{ + struct seccomp_filter *cur = tsk->seccomp.filter; + + tsk->seccomp.filter = NULL; + __put_seccomp_filter(cur); +} + /** * seccomp_sync_threads: sets all threads to use current's filter * @@ -396,7 +430,7 @@ static inline void seccomp_sync_threads(unsigned long flags) * current's path will hold a reference. (This also * allows a put before the assignment.) */ - put_seccomp_filter(thread); + __put_seccomp_filter(thread->seccomp.filter); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); @@ -567,30 +601,6 @@ void get_seccomp_filter(struct task_struct *tsk) __get_seccomp_filter(orig); } -static inline void seccomp_filter_free(struct seccomp_filter *filter) -{ - if (filter) { - bpf_prog_destroy(filter->prog); - kfree(filter); - } -} - -static void __put_seccomp_filter(struct seccomp_filter *orig) -{ - /* Clean up single-reference branches iteratively. */ - while (orig && refcount_dec_and_test(&orig->refs)) { - struct seccomp_filter *freeme = orig; - orig = orig->prev; - seccomp_filter_free(freeme); - } -} - -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) -{ - __put_seccomp_filter(tsk->seccomp.filter); -} - static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason) { clear_siginfo(info); -- 2.26.2 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers