On Wed, Jul 15, 2020 at 01:38:51PM +0200, Christian Brauner wrote: > On Mon, Jul 13, 2020 at 11:03:46PM -0400, Mathieu Desnoyers wrote: > > Add a __rseq_abi.flags "RSEQ_TLS_FLAG_SIZE", which indicates support for > > extending struct rseq. This adds two new fields to struct rseq: > > user_size and kernel_size. > > > > The user_size field allows the size of the __rseq_abi definition (which > > can be overridden by symbol interposition either by a preloaded library > > or by the application) to be handed over to the kernel at registration. > > This registration can be performed by a library, e.g. glibc, which does > > not know there is interposition taking place. > > > > The kernel_size is populated by the kernel when the "RSEQ_TLS_FLAG_SIZE" > > flag is set in __rseq_abi.flags to the minimum between user_size and > > the offset of the "end" field of struct rseq as known by the kernel. > > This allows user-space to query which fields are effectively populated > > by the kernel. > > > > A rseq_size field is added to the task struct to keep track of the > > "kernel_size" effective for each thread. > > > > Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> > > --- > > include/linux/sched.h | 4 ++++ > > include/uapi/linux/rseq.h | 37 ++++++++++++++++++++++++++++++++-- > > kernel/rseq.c | 42 +++++++++++++++++++++++++++++++++------ > > 3 files changed, 75 insertions(+), 8 deletions(-) > > > > diff --git a/include/linux/sched.h b/include/linux/sched.h > > index 692e327d7455..5d61a3197987 100644 > > --- a/include/linux/sched.h > > +++ b/include/linux/sched.h > > @@ -1147,6 +1147,7 @@ struct task_struct { > > #ifdef CONFIG_RSEQ > > struct rseq __user *rseq; > > u32 rseq_sig; > > + u32 rseq_size; > > /* > > * RmW on rseq_event_mask must be performed atomically > > * with respect to preemption. > > @@ -1976,10 +1977,12 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) > > if (clone_flags & CLONE_VM) { > > t->rseq = NULL; > > t->rseq_sig = 0; > > + t->rseq_size = 0; > > t->rseq_event_mask = 0; > > } else { > > t->rseq = current->rseq; > > t->rseq_sig = current->rseq_sig; > > + t->rseq_size = current->rseq_size; > > t->rseq_event_mask = current->rseq_event_mask; > > } > > } > > @@ -1988,6 +1991,7 @@ static inline void rseq_execve(struct task_struct *t) > > { > > t->rseq = NULL; > > t->rseq_sig = 0; > > + t->rseq_size = 0; > > t->rseq_event_mask = 0; > > } > > > > diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h > > index e11d9df5e564..03c0b5e9a859 100644 > > --- a/include/uapi/linux/rseq.h > > +++ b/include/uapi/linux/rseq.h > > @@ -37,6 +37,15 @@ enum rseq_cs_flags { > > (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), > > }; > > > > +enum rseq_tls_flags_bit { > > + /* enum rseq_cs_flags reserves bits 0-2. */ > > + RSEQ_TLS_FLAG_SIZE_BIT = 3, > > +}; > > + > > +enum rseq_tls_flags { > > + RSEQ_TLS_FLAG_SIZE = (1U << RSEQ_TLS_FLAG_SIZE_BIT), > > +}; > > + > > /* The rseq_len expected by rseq registration is always 32 bytes. */ > > enum rseq_len_expected { > > RSEQ_LEN_EXPECTED = 32, > > @@ -133,8 +142,9 @@ struct rseq { > > * > > * This field should only be updated by the thread which > > * registered this data structure. Read by the kernel. > > - * Mainly used for single-stepping through rseq critical sections > > - * with debuggers. > > + * > > + * The RSEQ_CS flags are mainly used for single-stepping through rseq > > + * critical sections with debuggers. > > * > > * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT > > * Inhibit instruction sequence block restart on preemption > > @@ -145,8 +155,31 @@ struct rseq { > > * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE > > * Inhibit instruction sequence block restart on migration for > > * this thread. > > + * > > + * - RSEQ_TLS_FLAG_SIZE > > + * Extensible struct rseq ABI. This flag should be statically > > + * initialized. > > */ > > __u32 flags; > > + /* > > + * With __rseq_abi.flags RSEQ_TLS_FLAG_SIZE set, user_size should be > > + * statically initialized to offsetof(struct rseq, end). > > + */ > > + __u16 user_size; > > + /* > > + * With __rseq_abi.flags RSEQ_TLS_FLAG_SIZE set, if the kernel supports > > + * extensible struct rseq ABI, the kernel_size field is populated by > > + * the kernel to the minimum between user_size and the offset of the > > + * "end" field within the struct rseq supported by the kernel on > > + * successful registration. Should be initialized to 0. > > + */ > > + __u16 kernel_size; > > (Btw, this what I suggested - minus the user_size part - when I said > "expose the size of struct rseq" the kernel knows about. The approach > here is of course more general.) > > It's pretty uncommon to use __u16 for sizes at least in public facing > structs. I'd suggest to use __u32 user_size and __u32 kernel_size and if > needed, insert padding. Seems you have done this in your union above > already. > > > + > > + /* > > + * Very last field of the structure, to calculate size excluding padding > > + * with offsetof(). > > + */ > > + char end[]; > > Hm, could this mess with alignment or break making the struct > extensible? Feels like you're adding new members always before this > which is also pretty non-standard in terms of how we'd usually extend > structs. > > > } __attribute__((aligned(4 * sizeof(__u64)))); > > > > #endif /* _UAPI_LINUX_RSEQ_H */ > > diff --git a/kernel/rseq.c b/kernel/rseq.c > > index a4f86a9d6937..bbc57fc18573 100644 > > --- a/kernel/rseq.c > > +++ b/kernel/rseq.c > > @@ -96,6 +96,7 @@ static int rseq_update_cpu_id(struct task_struct *t) > > static int rseq_reset_rseq_cpu_id(struct task_struct *t) > > { > > u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; > > + u16 kernel_size = 0; > > > > /* > > * Reset cpu_id_start to its initial state (0). > > @@ -109,6 +110,11 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) > > */ > > if (put_user(cpu_id, &t->rseq->cpu_id)) > > return -EFAULT; > > + /* > > + * Reset kernel_size to its initial state (0). > > + */ > > + if (put_user(kernel_size, &t->rseq->kernel_size)) > > + return -EFAULT; > > return 0; > > } > > > > @@ -266,7 +272,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) > > > > if (unlikely(t->flags & PF_EXITING)) > > return; > > - if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq)))) > > + if (unlikely(!access_ok(t->rseq, t->rseq_size))) > > goto error; > > ret = rseq_ip_fixup(regs); > > if (unlikely(ret < 0)) > > @@ -294,7 +300,7 @@ void rseq_syscall(struct pt_regs *regs) > > > > if (!t->rseq) > > return; > > - if (!access_ok(t->rseq, sizeof(*t->rseq)) || > > + if (!access_ok(t->rseq, t->rseq_size) || > > rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) > > force_sig(SIGSEGV); > > } > > @@ -308,6 +314,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, > > int, flags, u32, sig) > > { > > int ret; > > + u32 tls_flags; > > > > if (flags & RSEQ_FLAG_UNREGISTER) { > > if (flags & ~RSEQ_FLAG_UNREGISTER) > > @@ -315,7 +322,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, > > /* Unregister rseq for current thread. */ > > if (current->rseq != rseq || !current->rseq) > > return -EINVAL; > > - if (rseq_len != sizeof(*rseq)) > > + if (rseq_len != RSEQ_LEN_EXPECTED) > > So I have to say that I think it's not a great to fix the length of the > rseq_len argument basically making it somewhat a nop. If I recall > correctly Florian said something about the rseq_len becoming part of the > glibc abi and that's why it can't be changed? > Is there any way we can avoid that so we can use the rseq_len argument > to have userspace pass down the size of struct rseq they know about? > > It's really unintuitive to pass down an extensible struct but the length > argument associated with it is fixed. > > I also think there should be some compile-time sanity checks here > similar to what we do in other places see e.g. > > BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); > > So here should at least be sm like: > > BUILD_BUG_ON(sizeof(struct rseq) != RSEQ_LEN_EXPECTED); > So here's a very free-wheeling draft of roughly what I had in mind. Not even compile-tested just to illustrate what I'd change and sorry if that code will make you sob in your hands: >From 2879e3c30dbe6ba0fc53884b1c41deaa444924a8 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> Date: Mon, 13 Jul 2020 23:03:46 -0400 Subject: [PATCH] [UNTESTED] rseq: Allow extending struct rseq Add a __rseq_abi.flags "RSEQ_TLS_FLAG_SIZE", which indicates support for extending struct rseq. This adds two new fields to struct rseq: user_size and kernel_size. The user_size field allows the size of the __rseq_abi definition (which can be overridden by symbol interposition either by a preloaded library or by the application) to be handed over to the kernel at registration. This registration can be performed by a library, e.g. glibc, which does not know there is interposition taking place. The kernel_size is populated by the kernel when the "RSEQ_TLS_FLAG_SIZE" flag is set in __rseq_abi.flags to the minimum between user_size and the offset of the "end" field of struct rseq as known by the kernel. This allows user-space to query which fields are effectively populated by the kernel. A rseq_size field is added to the task struct to keep track of the "kernel_size" effective for each thread. Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx> Signed-off-by: Christian Brauner <christian.brauner@xxxxxxxxxx> --- include/linux/sched.h | 4 +++ include/uapi/linux/rseq.h | 36 ++++++++++++++++++-- kernel/rseq.c | 72 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 105 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 692e327d7455..5d61a3197987 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1147,6 +1147,7 @@ struct task_struct { #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_sig; + u32 rseq_size; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. @@ -1976,10 +1977,12 @@ static inline void rseq_fork(struct task_struct *t, unsigned long clone_flags) if (clone_flags & CLONE_VM) { t->rseq = NULL; t->rseq_sig = 0; + t->rseq_size = 0; t->rseq_event_mask = 0; } else { t->rseq = current->rseq; t->rseq_sig = current->rseq_sig; + t->rseq_size = current->rseq_size; t->rseq_event_mask = current->rseq_event_mask; } } @@ -1988,6 +1991,7 @@ static inline void rseq_execve(struct task_struct *t) { t->rseq = NULL; t->rseq_sig = 0; + t->rseq_size = 0; t->rseq_event_mask = 0; } diff --git a/include/uapi/linux/rseq.h b/include/uapi/linux/rseq.h index 9a402fdb60e9..f10ce38d9712 100644 --- a/include/uapi/linux/rseq.h +++ b/include/uapi/linux/rseq.h @@ -37,6 +37,15 @@ enum rseq_cs_flags { (1U << RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT), }; +enum rseq_tls_flags_bit { + /* enum rseq_cs_flags reserves bits 0-2. */ + RSEQ_TLS_FLAG_SIZE_BIT = 3, +}; + +enum rseq_tls_flags { + RSEQ_TLS_FLAG_SIZE = (1U << RSEQ_TLS_FLAG_SIZE_BIT), +}; + /* * struct rseq_cs is aligned on 4 * 8 bytes to ensure it is always * contained within a single cache-line. It is usually declared as @@ -128,8 +137,9 @@ struct rseq { * * This field should only be updated by the thread which * registered this data structure. Read by the kernel. - * Mainly used for single-stepping through rseq critical sections - * with debuggers. + * + * The RSEQ_CS flags are mainly used for single-stepping through rseq + * critical sections with debuggers. * * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT * Inhibit instruction sequence block restart on preemption @@ -140,8 +150,30 @@ struct rseq { * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE * Inhibit instruction sequence block restart on migration for * this thread. + * + * - RSEQ_TLS_FLAG_SIZE + * Extensible struct rseq ABI. This flag should be statically + * initialized. */ __u32 flags; + /* + * With __rseq_abi.flags RSEQ_TLS_FLAG_SIZE set, user_size should be + * statically initialized to offsetof(struct rseq, end). + */ + __u32 user_size; + /* + * With __rseq_abi.flags RSEQ_TLS_FLAG_SIZE set, if the kernel supports + * extensible struct rseq ABI, the kernel_size field is populated by + * the kernel to the minimum between user_size and the offset of the + * "end" field within the struct rseq supported by the kernel on + * successful registration. Should be initialized to 0. + */ + __u32 kernel_size; + __u32 active_size; } __attribute__((aligned(4 * sizeof(__u64)))); +#define RSEQ_SIZE_VER0 24 /* sizeof first published struct */ +#define RSEQ_SIZE_VER1 32 /* sizeof second published struct */ +#define RSEQ_SIZE_LATEST RSEQ_SIZE_VER1 /* sizeof last published struct */ + #endif /* _UAPI_LINUX_RSEQ_H */ diff --git a/kernel/rseq.c b/kernel/rseq.c index a4f86a9d6937..9d28b41a99cd 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -96,6 +96,7 @@ static int rseq_update_cpu_id(struct task_struct *t) static int rseq_reset_rseq_cpu_id(struct task_struct *t) { u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED; + u16 kernel_size = 0; /* * Reset cpu_id_start to its initial state (0). @@ -109,6 +110,11 @@ static int rseq_reset_rseq_cpu_id(struct task_struct *t) */ if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; + /* + * Reset kernel_size to its initial state (0). + */ + if (put_user(kernel_size, &t->rseq->kernel_size)) + return -EFAULT; return 0; } @@ -266,7 +272,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; - if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq)))) + if (unlikely(!access_ok(t->rseq, t->rseq_size))) goto error; ret = rseq_ip_fixup(regs); if (unlikely(ret < 0)) @@ -294,7 +300,7 @@ void rseq_syscall(struct pt_regs *regs) if (!t->rseq) return; - if (!access_ok(t->rseq, sizeof(*t->rseq)) || + if (!access_ok(t->rseq, t->rseq_size) || rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV); } @@ -308,6 +314,11 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; + u32 tls_flags; + + BUILD_BUG_ON(offsetofend(struct rseq, flags) != RSEQ_SIZE_VER0); + BUILD_BUG_ON(offsetofend(struct rseq, active_size) != RSEQ_SIZE_VER1); + BUILD_BUG_ON(sizeof(struct rseq) != RSEQ_SIZE_LATEST); if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) @@ -315,14 +326,17 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; - if (rseq_len != sizeof(*rseq)) + if (rseq_len < RSEQ_SIZE_VER0) return -EINVAL; + if (!access_ok(rseq, rseq_len)) + return -EFAULT; if (current->rseq_sig != sig) return -EPERM; ret = rseq_reset_rseq_cpu_id(current); if (ret) return ret; current->rseq = NULL; + current->rseq_size = 0; current->rseq_sig = 0; return 0; } @@ -336,7 +350,7 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * the provided address differs from the prior * one. */ - if (current->rseq != rseq || rseq_len != sizeof(*rseq)) + if (current->rseq != rseq || rseq_len != RSEQ_LEN_EXPECTED) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; @@ -349,10 +363,58 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, * ensure the provided rseq is properly aligned and valid. */ if (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || - rseq_len != sizeof(*rseq)) + rseq_len < RSEQ_SIZE_VER0) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; + + /* Handle extensible struct rseq ABI. */ + ret = get_user(tls_flags, &rseq->flags); + if (ret) + return ret; + if (tls_flags & RSEQ_TLS_FLAG_SIZE) { + u32 user_size, kernel_size, active_size; + + /* Can probably be made nicer by using check_zeroed_user(). */ + ret = get_user(user_size, &rseq->user_size); + if (ret) + return ret; + if (user_size != 0) + return -EINVAL; + + ret = get_user(active_size, &rseq->active_size); + if (ret) + return ret; + if (active_size != 0) + return -EINVAL; + + ret = get_user(active_size, &rseq->kernel_size); + if (ret) + return ret; + if (kernel_size != 0) + return -EINVAL; + + /* Calculate the useable size. */ + active_size = min_t(u32, rseq_len, RSEQ_SIZE_LATEST); + ret = put_user(active_size, &rseq->active_size); + if (ret) + return ret; + + /* Let other users know what userspace used to register. */ + ret = put_user(rseq_len, &rseq->user_size); + if (ret) + return -EFAULT; + + /* Let other users know what size the kernel supports. */ + ret = put_user(RSEQ_SIZE_LATEST, &rseq->kernel_size); + if (ret) + return -EFAULT; + + current->rseq_size = active_size; + } else { + current->rseq_size = RSEQ_SIZE_VER0; + } + current->rseq = rseq; current->rseq_sig = sig; /* -- 2.27.0