On Thu, May 22, 2014 at 4:05 PM, Kees Cook <keescook@xxxxxxxxxxxx> wrote:
> Normally, task_struct.seccomp.filter is only ever read or modified by
> the task that owns it (current). This property aids in fast access
> during system call filtering as read access is lockless.
>
> Updating the pointer from another task, however, opens up race
> conditions. To allow cross-task filter pointer updates, writes to the
> seccomp fields are now protected by a spinlock. Read access remains
> lockless because pointer updates themselves are atomic. However, writes
> (or cloning) often entail additional checking (like maximum instruction
> counts) which require locking to perform safely.
>
> In the case of cloning threads, the child is invisible to the system
> until it enters the task list. To make sure a child can't be cloned
> from a thread and left in a prior state, seccomp duplication is moved
> under the tasklist_lock. Then parent and child are certain to have the
> same seccomp state when they exit the lock.
>
> Based on patches by Will Drewry and David Drysdale.
>
> Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx>
> ---
>  include/linux/init_task.h |  8 ++++++++
>  include/linux/sched.h     | 20 ++++++++++++++++++++
>  include/linux/seccomp.h   |  6 ++++--
>  kernel/fork.c             | 34 +++++++++++++++++++++++++++++++++-
>  kernel/seccomp.c          | 20 ++++++++++++++------
>  5 files changed, 79 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 6df7f9fe0d01..e2370ec3102b 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -154,6 +154,13 @@ extern struct task_group root_task_group;
>  # define INIT_VTIME(tsk)
>  #endif
>
> +#ifdef CONFIG_SECCOMP
> +# define INIT_SECCOMP(tsk) \
> +	.seccomp.lock = __SPIN_LOCK_UNLOCKED(tsk.seccomp.lock),
> +#else
> +# define INIT_SECCOMP(tsk)
> +#endif
> +
>  #define INIT_TASK_COMM "swapper"
>
>  #ifdef CONFIG_RT_MUTEXES
> @@ -234,6 +241,7 @@ extern struct task_group root_task_group;
>  	INIT_CPUSET_SEQ(tsk) \
>  	INIT_RT_MUTEXES(tsk) \
>  	INIT_VTIME(tsk) \
> +	INIT_SECCOMP(tsk) \
>  }
>
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 25f54c79f757..71a6cb66a3f3 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2480,6 +2480,26 @@ static inline void task_unlock(struct task_struct *p)
>  	spin_unlock(&p->alloc_lock);
>  }
>
> +#ifdef CONFIG_SECCOMP
> +/*
> + * Protects changes to ->seccomp
> + */
> +static inline void seccomp_lock(struct task_struct *p, unsigned long *flags)
> +{
> +	spin_lock_irqsave(&p->seccomp.lock, (*flags));
> +}
> +
> +static inline void seccomp_unlock(struct task_struct *p, unsigned long flags)
> +{
> +	spin_unlock_irqrestore(&p->seccomp.lock, flags);
> +}
> +#else
> +static inline void seccomp_lock(struct task_struct *p, unsigned long *flags)
> +{ }
> +static inline void seccomp_unlock(struct task_struct *p, unsigned long flags)
> +{ }
> +#endif
> +
>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
>  						  unsigned long *flags);
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 4054b0994071..c47be00e8ffb 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -14,14 +14,16 @@ struct seccomp_filter;
>   *
>   * @mode: indicates one of the valid values above for controlled
>   * system calls available to a process.
> - * @filter: The metadata and ruleset for determining what system calls
> - * are allowed for a task.
> + * @lock: held when making changes to avoid thread races.
> + * @filter: must always point to a valid seccomp-filter or NULL as it is
> + * accessed without locking during system call entry.
>   *
>   * @filter must only be accessed from the context of current as there
>   * is no locking.
>   */
>  struct seccomp {
>  	int mode;
> +	spinlock_t lock;
>  	struct seccomp_filter *filter;
>  };
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 54a8d26f612f..e08a1907cd78 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
>  		goto free_ti;
>
>  	tsk->stack = ti;
> +#ifdef CONFIG_SECCOMP
> +	/*
> +	 * We must handle setting up seccomp filters once we're under
> +	 * the tasklist_lock in case orig has changed between now and
> +	 * then. Until then, filter must be NULL to avoid messing up
> +	 * the usage counts on the error path calling free_task.
> +	 */
> +	tsk->seccomp.filter = NULL;
> +#endif
>
>  	setup_thread_stack(tsk, orig);
>  	clear_user_return_notifier(tsk);
> @@ -1081,6 +1090,24 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>  	return 0;
>  }
>
> +static void copy_seccomp(struct task_struct *p)
> +{
> +#ifdef CONFIG_SECCOMP
> +	/* Child lock not needed since it is not yet in tasklist. */
> +	unsigned long irqflags;
> +	seccomp_lock(current, &irqflags);
> +
> +	get_seccomp_filter(current);
> +	p->seccomp = current->seccomp;
> +	spin_lock_init(&p->seccomp.lock);
> +
> +	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
> +		set_tsk_thread_flag(p, TIF_SECCOMP);
> +
> +	seccomp_unlock(current, irqflags);
> +#endif
> +}
> +
>  SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
>  {
>  	current->clear_child_tid = tidptr;
> @@ -1196,7 +1223,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  		goto fork_out;
>
>  	ftrace_graph_init_task(p);
> -	get_seccomp_filter(p);
>
>  	rt_mutex_init_task(p);
>
> @@ -1425,6 +1451,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  	 */
>  	write_lock_irq(&tasklist_lock);
>
> +	/*
> +	 * Copy seccomp details explicitly here, in case they were changed
> +	 * before holding tasklist_lock.
> +	 */
> +	copy_seccomp(p);
> +
>  	/* CLONE_PARENT re-uses the old parent */
>  	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
>  		p->real_parent = current->real_parent;
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 481504100b1d..d200029728ca 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -174,12 +174,12 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>   */
>  static u32 seccomp_run_filters(int syscall)
>  {
> -	struct seccomp_filter *f;
> +	struct seccomp_filter *f = smp_load_acquire(&current->seccomp.filter);
>  	struct seccomp_data sd;
>  	u32 ret = SECCOMP_RET_ALLOW;
>
>  	/* Ensure unexpected behavior doesn't result in failing open. */
> -	if (WARN_ON(current->seccomp.filter == NULL))
> +	if (WARN_ON(f == NULL))
>  		return SECCOMP_RET_KILL;
>
>  	populate_seccomp_data(&sd);
> @@ -188,7 +188,7 @@ static u32 seccomp_run_filters(int syscall)
>  	 * All filters in the list are evaluated and the lowest BPF return
>  	 * value always takes priority (ignoring the DATA).
>  	 */
> -	for (f = current->seccomp.filter; f; f = f->prev) {
> +	for (; f; f = smp_load_acquire(&f->prev)) {
>  		u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi);

Indeed, the conflicts here will be trivial to resolve. Not sure what tree
you're using for these patches. Can we avoid the conflicts altogether?
Could you rebase against linux-next at least?

>  		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
>  			ret = cur_ret;
> @@ -305,6 +305,8 @@ out:
>   * _seccomp_attach_filter: validated and attach filter
>   * @filter: seccomp filter to add to the current process
>   *
> + * Caller must be holding the seccomp lock.
> + *
>   * Returns 0 on success, -ve on error.
>   */
>  static long _seccomp_attach_filter(struct seccomp_filter *filter)
> @@ -312,6 +314,8 @@ static long _seccomp_attach_filter(struct seccomp_filter *filter)
>  	unsigned long total_insns;
>  	struct seccomp_filter *walker;
>
> +	BUG_ON(!spin_is_locked(&current->seccomp.lock));
> +
>  	/* Validate resulting filter length. */
>  	total_insns = filter->len;
>  	for (walker = current->seccomp.filter; walker; walker = filter->prev)
> @@ -324,7 +328,7 @@ static long _seccomp_attach_filter(struct seccomp_filter *filter)
>  	 * task reference.
>  	 */
>  	filter->prev = current->seccomp.filter;
> -	current->seccomp.filter = filter;
> +	smp_store_release(&current->seccomp.filter, filter);
>
>  	return 0;
>  }
> @@ -332,7 +336,7 @@ static long _seccomp_attach_filter(struct seccomp_filter *filter)
>  /* get_seccomp_filter - increments the reference count of the filter on @tsk */
>  void get_seccomp_filter(struct task_struct *tsk)
>  {
> -	struct seccomp_filter *orig = tsk->seccomp.filter;
> +	struct seccomp_filter *orig = smp_load_acquire(&tsk->seccomp.filter);
>  	if (!orig)
>  		return;
>  	/* Reference count is bounded by the number of total processes. */
> @@ -346,7 +350,7 @@ void put_seccomp_filter(struct task_struct *tsk)
>  	/* Clean up single-reference branches iteratively. */
>  	while (orig && atomic_dec_and_test(&orig->usage)) {
>  		struct seccomp_filter *freeme = orig;
> -		orig = orig->prev;
> +		orig = smp_load_acquire(&orig->prev);
>  		kfree(freeme);
>  	}
>  }
> @@ -498,6 +502,7 @@ long prctl_get_seccomp(void)
>  static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
>  {
>  	struct seccomp_filter *prepared = NULL;
> +	unsigned long irqflags;
>  	long ret = -EINVAL;
>
>  #ifdef CONFIG_SECCOMP_FILTER
> @@ -509,6 +514,8 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
>  	}
>  #endif
>
> +	seccomp_lock(current, &irqflags);
> +
>  	if (current->seccomp.mode &&
>  	    current->seccomp.mode != seccomp_mode)
>  		goto out;
> @@ -536,6 +543,7 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
>  	current->seccomp.mode = seccomp_mode;
>  	set_thread_flag(TIF_SECCOMP);
>  out:
> +	seccomp_unlock(current, irqflags);
>  	kfree(prepared);
>  	return ret;
>  }
> --
> 1.7.9.5
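
A note on the ordering for anyone following along: the smp_store_release()
in _seccomp_attach_filter() pairs with the smp_load_acquire() on the syscall
entry path, so a reader that observes the new filter pointer is also
guaranteed to observe the fully initialized filter (including its ->prev
link); the spinlock only has to serialize writers against each other. Below
is a rough userspace analogue of that publish-then-walk pattern using C11
atomics in place of the kernel primitives. The struct, function names, and
main() are made up for illustration and are not part of the patch.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for struct seccomp_filter: each node is fully
 * initialized before it is published at the head of the chain. */
struct filter {
	int id;
	struct filter *prev;	/* older filters in the chain */
};

/* Stand-in for current->seccomp.filter. */
static _Atomic(struct filter *) filter_head;

/* Writer side (attach): set up the node, link it to the old head, then
 * publish it with a release store. In the patch this runs under the
 * seccomp spinlock so concurrent writers cannot race each other. */
static void attach_filter(int id)
{
	struct filter *f = malloc(sizeof(*f));

	if (!f)
		return;
	f->id = id;
	f->prev = atomic_load_explicit(&filter_head, memory_order_relaxed);
	atomic_store_explicit(&filter_head, f, memory_order_release);
}

/* Reader side (syscall entry): acquire-load the head and walk the chain
 * with no lock, as seccomp_run_filters() does. The acquire guarantees the
 * published node's contents, including ->prev, are visible; the kernel
 * version also uses smp_load_acquire() when following ->prev. */
static void run_filters(void)
{
	struct filter *f = atomic_load_explicit(&filter_head,
						memory_order_acquire);

	for (; f; f = f->prev)
		printf("evaluating filter %d\n", f->id);
}

int main(void)
{
	attach_filter(1);
	attach_filter(2);
	run_filters();	/* newest first: prints 2, then 1 */
	return 0;
}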