On Thu, May 22, 2014 at 4:05 PM, Kees Cook <keescook@xxxxxxxxxxxx> wrote:
> Normally, task_struct.seccomp.filter is only ever read or modified by
> the task that owns it (current). This property aids in fast access
> during system call filtering as read access is lockless.
>
> Updating the pointer from another task, however, opens up race
> conditions. To allow cross-task filter pointer updates, writes to the
> seccomp fields are now protected by a spinlock. Read access remains
> lockless because pointer updates themselves are atomic. However, writes
> (or cloning) often entail additional checking (like maximum instruction
> counts) which require locking to perform safely.
>
> In the case of cloning threads, the child is invisible to the system
> until it enters the task list. To make sure a child can't be cloned
> from a thread and left in a prior state, seccomp duplication is moved
> under the tasklist_lock. Then parent and child are certain to have the
> same seccomp state when they exit the lock.
>
> Based on patches by Will Drewry and David Drysdale.
>
> Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx>
> ---
>  include/linux/init_task.h |  8 ++++++++
>  include/linux/sched.h     | 20 ++++++++++++++++++++
>  include/linux/seccomp.h   |  6 ++++--
>  kernel/fork.c             | 34 +++++++++++++++++++++++++++++++++-
>  kernel/seccomp.c          | 20 ++++++++++++++------
>  5 files changed, 79 insertions(+), 9 deletions(-)
>
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 6df7f9fe0d01..e2370ec3102b 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -154,6 +154,13 @@ extern struct task_group root_task_group;
>  # define INIT_VTIME(tsk)
>  #endif
>
> +#ifdef CONFIG_SECCOMP
> +# define INIT_SECCOMP(tsk) \
> +	.seccomp.lock = __SPIN_LOCK_UNLOCKED(tsk.seccomp.lock),
> +#else
> +# define INIT_SECCOMP(tsk)
> +#endif
> +
>  #define INIT_TASK_COMM "swapper"
>
>  #ifdef CONFIG_RT_MUTEXES
> @@ -234,6 +241,7 @@ extern struct task_group root_task_group;
>  	INIT_CPUSET_SEQ(tsk) \
>  	INIT_RT_MUTEXES(tsk) \
>  	INIT_VTIME(tsk) \
> +	INIT_SECCOMP(tsk) \
>  }
>
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 25f54c79f757..71a6cb66a3f3 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2480,6 +2480,26 @@ static inline void task_unlock(struct task_struct *p)
>  	spin_unlock(&p->alloc_lock);
>  }
>
> +#ifdef CONFIG_SECCOMP
> +/*
> + * Protects changes to ->seccomp
> + */
> +static inline void seccomp_lock(struct task_struct *p, unsigned long *flags)
> +{
> +	spin_lock_irqsave(&p->seccomp.lock, (*flags));
> +}
> +
> +static inline void seccomp_unlock(struct task_struct *p, unsigned long flags)
> +{
> +	spin_unlock_irqrestore(&p->seccomp.lock, flags);
> +}
> +#else
> +static inline void seccomp_lock(struct task_struct *p, unsigned long *flags)
> +{ }
> +static inline void seccomp_unlock(struct task_struct *p, unsigned long flags)
> +{ }
> +#endif
> +
>  extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
>  						  unsigned long *flags);
>
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 4054b0994071..c47be00e8ffb 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -14,14 +14,16 @@ struct seccomp_filter;
>   *
>   * @mode: indicates one of the valid values above for controlled
>   * system calls available to a process.
> - * @filter: The metadata and ruleset for determining what system calls
> - * are allowed for a task.
> + * @lock: held when making changes to avoid thread races.
> + * @filter: must always point to a valid seccomp-filter or NULL as it is
> + * accessed without locking during system call entry.
>   *
>   * @filter must only be accessed from the context of current as there
>   * is no locking.
>   */
>  struct seccomp {
>  	int mode;
> +	spinlock_t lock;
>  	struct seccomp_filter *filter;
>  };
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 54a8d26f612f..e08a1907cd78 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
>  		goto free_ti;
>
>  	tsk->stack = ti;
> +#ifdef CONFIG_SECCOMP
> +	/*
> +	 * We must handle setting up seccomp filters once we're under
> +	 * the tasklist_lock in case orig has changed between now and
> +	 * then. Until then, filter must be NULL to avoid messing up
> +	 * the usage counts on the error path calling free_task.
> +	 */
> +	tsk->seccomp.filter = NULL;
> +#endif
>
>  	setup_thread_stack(tsk, orig);
>  	clear_user_return_notifier(tsk);
> @@ -1081,6 +1090,24 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>  	return 0;
>  }
>
> +static void copy_seccomp(struct task_struct *p)
> +{
> +#ifdef CONFIG_SECCOMP
> +	/* Child lock not needed since it is not yet in tasklist. */
> +	unsigned long irqflags;
> +	seccomp_lock(current, &irqflags);
> +
> +	get_seccomp_filter(current);
> +	p->seccomp = current->seccomp;
> +	spin_lock_init(&p->seccomp.lock);
> +
> +	if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
> +		set_tsk_thread_flag(p, TIF_SECCOMP);
> +
> +	seccomp_unlock(current, irqflags);
> +#endif
> +}
> +
>  SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
>  {
>  	current->clear_child_tid = tidptr;
> @@ -1196,7 +1223,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  		goto fork_out;
>
>  	ftrace_graph_init_task(p);
> -	get_seccomp_filter(p);
>
>  	rt_mutex_init_task(p);
>
> @@ -1425,6 +1451,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  	 */
>  	write_lock_irq(&tasklist_lock);
>
> +	/*
> +	 * Copy seccomp details explicitly here, in case they were changed
> +	 * before holding tasklist_lock.
> +	 */
> +	copy_seccomp(p);
> +
>  	/* CLONE_PARENT re-uses the old parent */
>  	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
>  		p->real_parent = current->real_parent;
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 481504100b1d..d200029728ca 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -174,12 +174,12 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
>   */
>  static u32 seccomp_run_filters(int syscall)
>  {
> -	struct seccomp_filter *f;
> +	struct seccomp_filter *f = smp_load_acquire(&current->seccomp.filter);
>  	struct seccomp_data sd;
>  	u32 ret = SECCOMP_RET_ALLOW;
>
>  	/* Ensure unexpected behavior doesn't result in failing open. */
> -	if (WARN_ON(current->seccomp.filter == NULL))
> +	if (WARN_ON(f == NULL))
>  		return SECCOMP_RET_KILL;
>
>  	populate_seccomp_data(&sd);
> @@ -188,7 +188,7 @@ static u32 seccomp_run_filters(int syscall)
>  	 * All filters in the list are evaluated and the lowest BPF return
>  	 * value always takes priority (ignoring the DATA).
>  	 */
> -	for (f = current->seccomp.filter; f; f = f->prev) {
> +	for (; f; f = smp_load_acquire(&f->prev)) {
>  		u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi);

Indeed, the conflicts here will be trivial to resolve. Not sure what tree
you're using for these patches. Can we avoid the conflicts altogether?
Could you rebase against linux-next at least?

>  		if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
>  			ret = cur_ret;
> @@ -305,6 +305,8 @@ out:
>   * _seccomp_attach_filter: validated and attach filter
>   * @filter: seccomp filter to add to the current process
>   *
> + * Caller must be holding the seccomp lock.
> + *
>   * Returns 0 on success, -ve on error.
>   */
>  static long _seccomp_attach_filter(struct seccomp_filter *filter)
> @@ -312,6 +314,8 @@ static long _seccomp_attach_filter(struct seccomp_filter *filter)
>  	unsigned long total_insns;
>  	struct seccomp_filter *walker;
>
> +	BUG_ON(!spin_is_locked(&current->seccomp.lock));
> +
>  	/* Validate resulting filter length. */
>  	total_insns = filter->len;
>  	for (walker = current->seccomp.filter; walker; walker = filter->prev)
> @@ -324,7 +328,7 @@ static long _seccomp_attach_filter(struct seccomp_filter *filter)
>  	 * task reference.
>  	 */
>  	filter->prev = current->seccomp.filter;
> -	current->seccomp.filter = filter;
> +	smp_store_release(&current->seccomp.filter, filter);
>
>  	return 0;
>  }
> @@ -332,7 +336,7 @@ static long _seccomp_attach_filter(struct seccomp_filter *filter)
>  /* get_seccomp_filter - increments the reference count of the filter on @tsk */
>  void get_seccomp_filter(struct task_struct *tsk)
>  {
> -	struct seccomp_filter *orig = tsk->seccomp.filter;
> +	struct seccomp_filter *orig = smp_load_acquire(&tsk->seccomp.filter);
>  	if (!orig)
>  		return;
>  	/* Reference count is bounded by the number of total processes. */
> @@ -346,7 +350,7 @@ void put_seccomp_filter(struct task_struct *tsk)
>  	/* Clean up single-reference branches iteratively. */
>  	while (orig && atomic_dec_and_test(&orig->usage)) {
>  		struct seccomp_filter *freeme = orig;
> -		orig = orig->prev;
> +		orig = smp_load_acquire(&orig->prev);
>  		kfree(freeme);
>  	}
>  }
> @@ -498,6 +502,7 @@ long prctl_get_seccomp(void)
>  static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
>  {
>  	struct seccomp_filter *prepared = NULL;
> +	unsigned long irqflags;
>  	long ret = -EINVAL;
>
>  #ifdef CONFIG_SECCOMP_FILTER
> @@ -509,6 +514,8 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
>  	}
>  #endif
>
> +	seccomp_lock(current, &irqflags);
> +
>  	if (current->seccomp.mode &&
>  	    current->seccomp.mode != seccomp_mode)
>  		goto out;
> @@ -536,6 +543,7 @@ static long seccomp_set_mode(unsigned long seccomp_mode, char __user *filter)
>  	current->seccomp.mode = seccomp_mode;
>  	set_thread_flag(TIF_SECCOMP);
>  out:
> +	seccomp_unlock(current, irqflags);
>  	kfree(prepared);
>  	return ret;
>  }
> --
> 1.7.9.5
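
A note on the ordering for anyone following along: the smp_store_release()
in _seccomp_attach_filter() pairs with the smp_load_acquire() on the syscall
entry path, so a reader that observes the new filter pointer is also
guaranteed to observe the fully initialized filter (including its ->prev
link); the spinlock only has to serialize writers against each other. Below
is a rough userspace analogue of that publish-then-walk pattern using C11
atomics in place of the kernel primitives. The struct, function names, and
main() are made up for illustration and are not part of the patch.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for struct seccomp_filter: each node is fully
 * initialized before it is published at the head of the chain. */
struct filter {
	int id;
	struct filter *prev;	/* older filters in the chain */
};

/* Stand-in for current->seccomp.filter. */
static _Atomic(struct filter *) filter_head;

/* Writer side (attach): set up the node, link it to the old head, then
 * publish it with a release store. In the patch this runs under the
 * seccomp spinlock so concurrent writers cannot race each other. */
static void attach_filter(int id)
{
	struct filter *f = malloc(sizeof(*f));

	if (!f)
		return;
	f->id = id;
	f->prev = atomic_load_explicit(&filter_head, memory_order_relaxed);
	atomic_store_explicit(&filter_head, f, memory_order_release);
}

/* Reader side (syscall entry): acquire-load the head and walk the chain
 * with no lock, as seccomp_run_filters() does. The acquire guarantees the
 * published node's contents, including ->prev, are visible; the kernel
 * version also uses smp_load_acquire() when following ->prev. */
static void run_filters(void)
{
	struct filter *f = atomic_load_explicit(&filter_head,
						memory_order_acquire);

	for (; f; f = f->prev)
		printf("evaluating filter %d\n", f->id);
}

int main(void)
{
	attach_filter(1);
	attach_filter(2);
	run_filters();	/* newest first: prints 2, then 1 */
	return 0;
}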