Considering that some custom libc could possibly choose not to use CLONE_SETTLS, we should allow the libc to override the choice of clone flags meant to unregister rseq. This is a policy decision which should not be made by the kernel. Therefore, introduce a new RSEQ_FLAG_UNREG_CLONE_FLAGS, which makes the rseq system call expect a fifth argument: a mask of clone flags, any one of which causes rseq to be unregistered in the child when it is passed to clone. So even if CLONE_SETTLS is eventually replaced by some other clone flag in the future, the libc will be able to adapt and pass that replacement clone flag in the mask upon rseq registration as well. When RSEQ_FLAG_UNREG_CLONE_FLAGS is unset, the default is to unregister rseq on clone with CLONE_SETTLS. Suggested-by: "H . Peter Anvin" <hpa@xxxxxxxxx> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx> Cc: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxxxxx> Cc: Boqun Feng <boqun.feng@xxxxxxxxx> Cc: "H . Peter Anvin" <hpa@xxxxxxxxx> Cc: Paul Turner <pjt@xxxxxxxxxx> Cc: Dmitry Vyukov <dvyukov@xxxxxxxxxx> Cc: linux-api@xxxxxxxxxxxxxxx --- include/linux/sched.h | 9 +++++++-- include/linux/syscalls.h | 2 +- include/uapi/linux/rseq.h | 1 + kernel/rseq.c | 14 +++++++++++--- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index deb4154dbf11..c8faa6f8493d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1138,6 +1138,7 @@ struct task_struct { * with respect to preemption. */ unsigned long rseq_event_mask; + int rseq_unreg_clone_flags; #endif struct tlbflush_unmap_batch tlb_ubc; @@ -1919,18 +1920,21 @@ static inline void rseq_migrate(struct task_struct *t) /* * If parent process has a registered restartable sequences area, the - * child inherits. Unregister rseq for a clone with CLONE_TLS set. 
+ * child inherits, except if it has been required to be explicitly + * unregistered when any of the rseq_unreg_clone_flags are passed to clone. */ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) { - if (clone_flags & CLONE_TLS) { + if (clone_flags & t->rseq_unreg_clone_flags) { t->rseq = NULL; t->rseq_sig = 0; t->rseq_event_mask = 0; + t->rseq_unreg_clone_flags = 0; } else { t->rseq = current->rseq; t->rseq_sig = current->rseq_sig; t->rseq_event_mask = current->rseq_event_mask; + t->rseq_unreg_clone_flags = current->rseq_unreg_clone_flags; } } @@ -1939,6 +1943,7 @@ static inline void rseq_execve(struct task_struct *t) t->rseq = NULL; t->rseq_sig = 0; t->rseq_event_mask = 0; + t->rseq_unreg_clone_flags = 0; } #else diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 88145da7d140..6a242cfcc360 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -987,7 +987,7 @@ asmlinkage long sys_pkey_free(int pkey); asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, unsigned mask, struct statx __user *buffer); asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, - int flags, uint32_t sig); + int flags, uint32_t sig, int unreg_clone_flags); asmlinkage long sys_open_tree(int dfd, const char __user *path, unsigned flags); asmlinkage long sys_move_mount(int from_dfd, const char __user *from_path, int to_dfd, const char __user *to_path, diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 9a402fdb60e9..d71e3c6b7fdb 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -20,6 +20,7 @@ enum rseq_cpu_id_state { enum rseq_flags { RSEQ_FLAG_UNREGISTER = (1 << 0), + RSEQ_FLAG_UNREG_CLONE_FLAGS = (1 << 1), }; enum rseq_cs_flags_bit { diff --git a/kernel/rseq.c b/kernel/rseq.c index a4f86a9d6937..c59b8d3dc275 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -304,8 +304,8 @@ void rseq_syscall(struct pt_regs *regs) /* * sys_rseq - setup 
restartable sequences for caller thread. */ -SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, - int, flags, u32, sig) +SYSCALL_DEFINE5(rseq, struct rseq __user *, rseq, u32, rseq_len, + int, flags, u32, sig, int, unreg_clone_flags) { int ret; @@ -324,12 +324,16 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return ret; current->rseq = NULL; current->rseq_sig = 0; + current->rseq_unreg_clone_flags = 0; return 0; } - if (unlikely(flags)) + if (unlikely(flags & ~RSEQ_FLAG_UNREG_CLONE_FLAGS)) return -EINVAL; + if (!(flags & RSEQ_FLAG_UNREG_CLONE_FLAGS)) + unreg_clone_flags = CLONE_SETTLS; + if (current->rseq) { /* * If rseq is already registered, check whether @@ -338,6 +342,9 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, */ if (current->rseq != rseq || rseq_len != sizeof(*rseq)) return -EINVAL; + if ((flags & RSEQ_FLAG_UNREG_CLONE_FLAGS) && + current->rseq_unreg_clone_flags != unreg_clone_flags) + return -EINVAL; if (current->rseq_sig != sig) return -EPERM; /* Already registered. */ @@ -355,6 +362,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, return -EFAULT; current->rseq = rseq; current->rseq_sig = sig; + current->rseq_unreg_clone_flags = unreg_clone_flags; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields -- 2.17.1