Re: [RFC PATCH 1/1] sched/rseq: Consider rseq abort in page fault handler

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, 15 Feb 2024 at 20:14, Mathieu Desnoyers
<mathieu.desnoyers@xxxxxxxxxxxx> wrote:
>
> Consider rseq abort before emitting the SIGSEGV or SIGBUS signals from
> the page fault handler.
>
> This allows using membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ
> to abort rseq critical sections which include memory accesses to
> memory which mapping can be munmap'd or mprotect'd after the
> membarrier "rseq fence" without causing SIGSEGV or SIGBUS when the page
> fault handler triggered by a faulting memory access within a rseq
> critical section is preempted before handling the page fault.
>
> The problematic scenario is:
>
> CPU 0                          CPU 1
> ------------------------------------------------------------------
>                                old_p = P
>                                P = NULL
> - rseq c.s. begins
> - x = P
> - if (x != NULL)
>   - v = *x
>     - page fault
>     - preempted
>                                membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ)
>                                munmap(old_p) (or mprotect(old_p))
>     - handle page fault
>     - force_sig_fault(SIGSEGV)
>     - rseq resume notifier
>       - move IP to abort IP
>   -> SIGSEGV handler runs.
>
> This is solved by postponing the force_sig_fault() to return to
> user-space when the page fault handler detects that rseq events will
> cause the thread to call the rseq resume notifier before going back to
> user-space. This allows the rseq resume notifier to load the userspace
> memory pointed by rseq->rseq_cs to compare the IP with the rseq c.s.
> range before either moving the IP to the abort handler or calling
> force_sig_fault() with the parameters previously saved by the page fault
> handler.
>
> Add a new AT_RSEQ_FEATURE_FLAGS getauxval(3) to allow user-space to
> query whether the kernel implements this behavior (flag:
> RSEQ_FEATURE_PAGE_FAULT_ABORT).
>
> Untested implementation submitted for early feedback.
>
> Only x86 is implemented in this PoC.
>
> Link: https://lore.kernel.org/lkml/CACT4Y+bXfekygoyhO7pCctjnL15=E=Zs31BUGXU0dk8d4rc1Cw@xxxxxxxxxxxxxx/
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
> Cc: Dmitry Vyukov <dvyukov@xxxxxxxxxx>
> Cc: Peter Oskolkov <posk@xxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxx>
> Cc: Boqun Feng <boqun.feng@xxxxxxxxx>
> Cc: Chris Kennelly <ckennelly@xxxxxxxxxx>
> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Cc: Andy Lutomirski <luto@xxxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Borislav Petkov <bp@xxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
> Cc: linux-mm@xxxxxxxxx

Hi Mathieu,

Thanks for the quick fix.
I can try to test this, but I can't apply this.
What's the base commit for the patch?

On top of latest upstream head v6.8-rc5:

$ patch -p1 < /tmp/patch
patching file arch/x86/mm/fault.c
patching file fs/binfmt_elf.c
patching file include/linux/sched.h
Hunk #1 succeeded at 745 (offset 2 lines).
Hunk #2 succeeded at 1329 (offset 3 lines).
Hunk #3 succeeded at 2143 with fuzz 2 (offset -197 lines).
Hunk #4 FAILED at 2402.
Hunk #5 FAILED at 2417.
2 out of 5 hunks FAILED -- saving rejects to file include/linux/sched.h.rej
patching file include/linux/sched/signal.h
Hunk #1 succeeded at 784 (offset 3 lines).
patching file include/uapi/linux/auxvec.h
patching file include/uapi/linux/rseq.h
patching file kernel/rseq.c
Hunk #2 succeeded at 299 with fuzz 1.


> ---
>  arch/x86/mm/fault.c          |  4 ++--
>  fs/binfmt_elf.c              |  1 +
>  include/linux/sched.h        | 16 ++++++++++++++++
>  include/linux/sched/signal.h | 24 ++++++++++++++++++++++++
>  include/uapi/linux/auxvec.h  |  1 +
>  include/uapi/linux/rseq.h    |  7 +++++++
>  kernel/rseq.c                | 36 +++++++++++++++++++++++++++++++-----
>  7 files changed, 82 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 679b09cfe241..42ac39680cb6 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -854,7 +854,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
>         if (si_code == SEGV_PKUERR)
>                 force_sig_pkuerr((void __user *)address, pkey);
>         else
> -               force_sig_fault(SIGSEGV, si_code, (void __user *)address);
> +               rseq_lazy_force_sig_fault(SIGSEGV, si_code, (void __user *)address);
>
>         local_irq_disable();
>  }
> @@ -973,7 +973,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
>                 return;
>         }
>  #endif
> -       force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
> +       rseq_lazy_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
>  }
>
>  static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 5397b552fbeb..8fece0911c7d 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -273,6 +273,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
>  #ifdef CONFIG_RSEQ
>         NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end));
>         NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq));
> +       NEW_AUX_ENT(AT_RSEQ_FEATURE_FLAGS, RSEQ_FEATURE_FLAGS);
>  #endif
>  #undef NEW_AUX_ENT
>         /* AT_NULL is zero; clear the rest too */
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 292c31697248..39aa585ba2a3 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -743,6 +743,15 @@ struct kmap_ctrl {
>  #endif
>  };
>
> +#ifdef CONFIG_RSEQ
> +struct rseq_lazy_sig {
> +       bool pending;
> +       int sig;
> +       int code;
> +       void __user *addr;
> +};
> +#endif
> +
>  struct task_struct {
>  #ifdef CONFIG_THREAD_INFO_IN_TASK
>         /*
> @@ -1317,6 +1326,7 @@ struct task_struct {
>          * with respect to preemption.
>          */
>         unsigned long rseq_event_mask;
> +       struct rseq_lazy_sig rseq_lazy_sig;
>  #endif
>
>  #ifdef CONFIG_SCHED_MM_CID
> @@ -2330,6 +2340,8 @@ unsigned long sched_cpu_util(int cpu);
>
>  #ifdef CONFIG_RSEQ
>
> +#define RSEQ_FEATURE_FLAGS     RSEQ_FEATURE_PAGE_FAULT_ABORT
> +
>  /*
>   * Map the event mask on the user-space ABI enum rseq_cs_flags
>   * for direct mask checks.
> @@ -2390,6 +2402,8 @@ static inline void rseq_migrate(struct task_struct *t)
>   */
>  static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
>  {
> +       WARN_ON_ONCE(current->rseq_lazy_sig.pending);
> +
>         if (clone_flags & CLONE_VM) {
>                 t->rseq = NULL;
>                 t->rseq_len = 0;
> @@ -2405,6 +2419,8 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags)
>
>  static inline void rseq_execve(struct task_struct *t)
>  {
> +       WARN_ON_ONCE(current->rseq_lazy_sig.pending);
> +
>         t->rseq = NULL;
>         t->rseq_len = 0;
>         t->rseq_sig = 0;
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 3499c1a8b929..0d75dfde2f9b 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -781,4 +781,28 @@ static inline unsigned long rlimit_max(unsigned int limit)
>         return task_rlimit_max(current, limit);
>  }
>
> +#ifdef CONFIG_RSEQ
> +
> +static inline int rseq_lazy_force_sig_fault(int sig, int code, void __user *addr)
> +{
> +       struct task_struct *t = current;
> +
> +       if (!t->rseq_event_mask)
> +               return force_sig_fault(sig, code, addr);
> +       t->rseq_lazy_sig.pending = true;
> +       t->rseq_lazy_sig.sig = sig;
> +       t->rseq_lazy_sig.code = code;
> +       t->rseq_lazy_sig.addr = addr;
> +       return 0;
> +}
> +
> +#else
> +
> +static inline int rseq_lazy_force_sig_fault(int sig, int code, void __user *addr)
> +{
> +       return force_sig_fault(sig, code, addr);
> +}
> +
> +#endif
> +
>  #endif /* _LINUX_SCHED_SIGNAL_H */
> diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
> index 6991c4b8ab18..5044f367a219 100644
> --- a/include/uapi/linux/auxvec.h
> +++ b/include/uapi/linux/auxvec.h
> @@ -32,6 +32,7 @@
>  #define AT_HWCAP2 26   /* extension of AT_HWCAP */
>  #define AT_RSEQ_FEATURE_SIZE   27      /* rseq supported feature size */
>  #define AT_RSEQ_ALIGN          28      /* rseq allocation alignment */
> +#define AT_RSEQ_FEATURE_FLAGS  29      /* rseq feature flags */
>
>  #define AT_EXECFN  31  /* filename of program */
>
> diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h
> index c233aae5eac9..0fdb192e3cd3 100644
> --- a/include/uapi/linux/rseq.h
> +++ b/include/uapi/linux/rseq.h
> @@ -37,6 +37,13 @@ enum rseq_cs_flags {
>                 (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT),
>  };
>
> +/*
> + * rseq feature flags. Query with getauxval(AT_RSEQ_FEATURE_FLAGS).
> + */
> +enum rseq_feature_flags {
> +       RSEQ_FEATURE_PAGE_FAULT_ABORT = (1U << 0),
> +};
> +
>  /*
>   * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always
>   * contained within a single cache-line. It is usually declared as
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index 9de6e35fe679..f686a97abb45 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -271,6 +271,25 @@ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
>         return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
>  }
>
> +static void rseq_clear_lazy_sig_fault(struct task_struct *t)
> +{
> +       if (!t->rseq_lazy_sig.pending)
> +               return;
> +       t->rseq_lazy_sig.pending = false;
> +       t->rseq_lazy_sig.sig = 0;
> +       t->rseq_lazy_sig.code = 0;
> +       t->rseq_lazy_sig.addr = NULL;
> +}
> +
> +static void rseq_force_lazy_sig_fault(struct task_struct *t)
> +{
> +       if (!t->rseq_lazy_sig.pending)
> +               return;
> +       force_sig_fault(t->rseq_lazy_sig.sig, t->rseq_lazy_sig.code,
> +                       t->rseq_lazy_sig.addr);
> +       rseq_clear_lazy_sig_fault(t);
> +}
> +
>  static int rseq_ip_fixup(struct pt_regs *regs)
>  {
>         unsigned long ip = instruction_pointer(regs);
> @@ -280,25 +299,32 @@ static int rseq_ip_fixup(struct pt_regs *regs)
>
>         ret = rseq_get_rseq_cs(t, &rseq_cs);
>         if (ret)
> -               return ret;
> +               goto nofixup;
>
>         /*
>          * Handle potentially not being within a critical section.
>          * If not nested over a rseq critical section, restart is useless.
>          * Clear the rseq_cs pointer and return.
>          */
> -       if (!in_rseq_cs(ip, &rseq_cs))
> -               return clear_rseq_cs(t);
> +       if (!in_rseq_cs(ip, &rseq_cs)) {
> +               ret = clear_rseq_cs(t);
> +               goto nofixup;
> +       }
>         ret = rseq_need_restart(t, rseq_cs.flags);
>         if (ret <= 0)
> -               return ret;
> +               goto nofixup;
>         ret = clear_rseq_cs(t);
>         if (ret)
> -               return ret;
> +               goto nofixup;
> +       rseq_clear_lazy_sig_fault(t);
>         trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
>                             rseq_cs.abort_ip);
>         instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
>         return 0;
> +
> +nofixup:
> +       rseq_force_lazy_sig_fault(t);
> +       return ret;
>  }
>
>  /*
> --
> 2.39.2
>




[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux