Re: [RFC 2/2] signal: extend pidfd_send_signal() to allow expedited process killing

Suren Baghdasaryan <surenb@xxxxxxxxxx> · Thu, 11 Apr 2019 08:18:30 -0700

Thanks for the feedback!
Just to be clear, this implementation is used in this RFC as a
reference to explain the intent. To be honest I don't think it will be
adopted as is even if the idea survives scrutiny.

On Thu, Apr 11, 2019 at 3:30 AM Christian Brauner <christian@xxxxxxxxxx> wrote:
>
> On Wed, Apr 10, 2019 at 06:43:53PM -0700, Suren Baghdasaryan wrote:
> > Add new SS_EXPEDITE flag to be used when sending SIGKILL via
> > pidfd_send_signal() syscall to allow expedited memory reclaim of the
> > victim process. The usage of this flag is currently limited to SIGKILL
> > signal and only to privileged users.
> >
> > Signed-off-by: Suren Baghdasaryan <surenb@xxxxxxxxxx>
> > ---
> >  include/linux/sched/signal.h |  3 ++-
> >  include/linux/signal.h       | 11 ++++++++++-
> >  ipc/mqueue.c                 |  2 +-
> >  kernel/signal.c              | 37 ++++++++++++++++++++++++++++--------
> >  kernel/time/itimer.c         |  2 +-
> >  5 files changed, 43 insertions(+), 12 deletions(-)
> >
> > diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> > index e412c092c1e8..8a227633a058 100644
> > --- a/include/linux/sched/signal.h
> > +++ b/include/linux/sched/signal.h
> > @@ -327,7 +327,8 @@ extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> >  extern void force_sigsegv(int sig, struct task_struct *p);
> >  extern int force_sig_info(int, struct kernel_siginfo *, struct task_struct *);
> >  extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp);
> > -extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid);
> > +extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> > +                             bool expedite);
> >  extern int kill_pid_info_as_cred(int, struct kernel_siginfo *, struct pid *,
> >                               const struct cred *);
> >  extern int kill_pgrp(struct pid *pid, int sig, int priv);
> > diff --git a/include/linux/signal.h b/include/linux/signal.h
> > index 9702016734b1..34b7852aa4a0 100644
> > --- a/include/linux/signal.h
> > +++ b/include/linux/signal.h
> > @@ -446,8 +446,17 @@ int __save_altstack(stack_t __user *, unsigned long);
> >  } while (0);
> >
> >  #ifdef CONFIG_PROC_FS
> > +
> > +/*
> > + * SS_FLAGS values used in pidfd_send_signal:
> > + *
> > + * SS_EXPEDITE indicates desire to expedite the operation.
> > + */
> > +#define SS_EXPEDITE  0x00000001
>
> Does this make sense as an SS_* flag?
> How does this relate to the signal stack?

It doesn't, so I agree that the name should be changed.
PIDFD_SIGNAL_EXPEDITE_MM_RECLAIM would seem appropriate.

> Is there any intention to ever use this flag with stack_t?
>
> New flags should be PIDFD_SIGNAL_*. (E.g. the thread flag will be
> PIDFD_SIGNAL_THREAD.)
> And since this is exposed to userspace in contrast to the mm internal
> naming it should be something more easily understandable like
> PIDFD_SIGNAL_MM_RECLAIM{_FASTER} or something.
>
> > +
> >  struct seq_file;
> >  extern void render_sigset_t(struct seq_file *, const char *, sigset_t *);
> > -#endif
> > +
> > +#endif /* CONFIG_PROC_FS */
> >
> >  #endif /* _LINUX_SIGNAL_H */
> > diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> > index aea30530c472..27c66296e08e 100644
> > --- a/ipc/mqueue.c
> > +++ b/ipc/mqueue.c
> > @@ -720,7 +720,7 @@ static void __do_notify(struct mqueue_inode_info *info)
> >                       rcu_read_unlock();
> >
> >                       kill_pid_info(info->notify.sigev_signo,
> > -                                   &sig_i, info->notify_owner);
> > +                                   &sig_i, info->notify_owner, false);
> >                       break;
> >               case SIGEV_THREAD:
> >                       set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
> > diff --git a/kernel/signal.c b/kernel/signal.c
> > index f98448cf2def..02ed4332d17c 100644
> > --- a/kernel/signal.c
> > +++ b/kernel/signal.c
> > @@ -43,6 +43,7 @@
> >  #include <linux/compiler.h>
> >  #include <linux/posix-timers.h>
> >  #include <linux/livepatch.h>
> > +#include <linux/oom.h>
> >
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/signal.h>
> > @@ -1394,7 +1395,8 @@ int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp)
> >       return success ? 0 : retval;
> >  }
> >
> > -int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> > +int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid,
> > +                               bool expedite)
> >  {
> >       int error = -ESRCH;
> >       struct task_struct *p;
> > @@ -1402,8 +1404,17 @@ int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid)
> >       for (;;) {
> >               rcu_read_lock();
> >               p = pid_task(pid, PIDTYPE_PID);
> > -             if (p)
> > +             if (p) {
> >                       error = group_send_sig_info(sig, info, p, PIDTYPE_TGID);
> > +
> > +                     /*
> > +                      * Ignore expedite_reclaim return value, it is best
> > +                      * effort only.
> > +                      */
> > +                     if (!error && expedite)
> > +                             expedite_reclaim(p);
>
> SIGKILL will take the whole thread group down so the reclaim should make
> sense here.
>

This sounds like confirmation. I hope I'm not missing some flaw that
you are trying to point out.

> > +             }
> > +
> >               rcu_read_unlock();
> >               if (likely(!p || error != -ESRCH))
> >                       return error;
> > @@ -1420,7 +1431,7 @@ static int kill_proc_info(int sig, struct kernel_siginfo *info, pid_t pid)
> >  {
> >       int error;
> >       rcu_read_lock();
> > -     error = kill_pid_info(sig, info, find_vpid(pid));
> > +     error = kill_pid_info(sig, info, find_vpid(pid), false);
> >       rcu_read_unlock();
> >       return error;
> >  }
> > @@ -1487,7 +1498,7 @@ static int kill_something_info(int sig, struct kernel_siginfo *info, pid_t pid)
> >
> >       if (pid > 0) {
> >               rcu_read_lock();
> > -             ret = kill_pid_info(sig, info, find_vpid(pid));
> > +             ret = kill_pid_info(sig, info, find_vpid(pid), false);
> >               rcu_read_unlock();
> >               return ret;
> >       }
> > @@ -1704,7 +1715,7 @@ EXPORT_SYMBOL(kill_pgrp);
> >
> >  int kill_pid(struct pid *pid, int sig, int priv)
> >  {
> > -     return kill_pid_info(sig, __si_special(priv), pid);
> > +     return kill_pid_info(sig, __si_special(priv), pid, false);
> >  }
> >  EXPORT_SYMBOL(kill_pid);
> >
> > @@ -3577,10 +3588,20 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> >       struct pid *pid;
> >       kernel_siginfo_t kinfo;
> >
> > -     /* Enforce flags be set to 0 until we add an extension. */
> > -     if (flags)
> > +     /* Enforce no unknown flags. */
> > +     if (flags & ~SS_EXPEDITE)
> >               return -EINVAL;
> >
> > +     if (flags & SS_EXPEDITE) {
> > +             /* Enforce SS_EXPEDITE to be used with SIGKILL only. */
> > +             if (sig != SIGKILL)
> > +                     return -EINVAL;
>
> Not super fond of this being a SIGKILL specific flag but I get why.

Understood. I was thinking that EXPEDITE flag might make sense for
other signals in the future but from internal feedback sounds like if
we go this way the flag name should be more specific.

> > +
> > +             /* Limit expedited killing to privileged users only. */
> > +             if (!capable(CAP_SYS_NICE))
> > +                     return -EPERM;
>
> Do you have a specific (DOS or other) attack vector in mind that renders
> ns_capable unsuitable?
>
> > +     }
> > +
> >       f = fdget_raw(pidfd);
> >       if (!f.file)
> >               return -EBADF;
> > @@ -3614,7 +3635,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
> >               prepare_kill_siginfo(sig, &kinfo);
> >       }
> >
> > -     ret = kill_pid_info(sig, &kinfo, pid);
> > +     ret = kill_pid_info(sig, &kinfo, pid, (flags & SS_EXPEDITE) != 0);
> >
> >  err:
> >       fdput(f);
> > diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
> > index 02068b2d5862..c926483cdb53 100644
> > --- a/kernel/time/itimer.c
> > +++ b/kernel/time/itimer.c
> > @@ -140,7 +140,7 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer)
> >       struct pid *leader_pid = sig->pids[PIDTYPE_TGID];
> >
> >       trace_itimer_expire(ITIMER_REAL, leader_pid, 0);
> > -     kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid);
> > +     kill_pid_info(SIGALRM, SEND_SIG_PRIV, leader_pid, false);
> >
> >       return HRTIMER_NORESTART;
> >  }
> > --
> > 2.21.0.392.gf8f6787159e-goog
> >