On Thu, Oct 31, 2019 at 10:08:06AM +0000, Lai Jiangshan wrote:
> Convert x86 to use a per-cpu rcu_preempt_depth. The reason for doing so
> is that accessing per-cpu variables is a lot cheaper than accessing
> task_struct variables.
>
> We need to save/restore the actual rcu_preempt_depth at context switch.
> We also place the per-cpu rcu_preempt_depth close to __preempt_count
> and the current_task variable.
>
> Using the idea of per-cpu __preempt_count.
>
> No function call when using rcu_read_[un]lock().
> Single instruction for rcu_read_lock().
> 2 instructions for the fast path of rcu_read_unlock().
>
> CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Signed-off-by: Lai Jiangshan <laijs@xxxxxxxxxxxxxxxxx>
> ---
> arch/x86/Kconfig | 2 +
> arch/x86/include/asm/rcu_preempt_depth.h | 87 ++++++++++++++++++++++++
> arch/x86/kernel/cpu/common.c | 7 ++
> arch/x86/kernel/process_32.c | 2 +
> arch/x86/kernel/process_64.c | 2 +
> include/linux/rcupdate.h | 24 +++++++
> init/init_task.c | 2 +-
> kernel/fork.c | 2 +-
> kernel/rcu/Kconfig | 3 +
> kernel/rcu/tree_exp.h | 2 +
> kernel/rcu/tree_plugin.h | 39 ++++++++---
> 11 files changed, 160 insertions(+), 12 deletions(-)
> create mode 100644 arch/x86/include/asm/rcu_preempt_depth.h
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index d6e1faa28c58..af9fedc0fdc4 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -18,6 +18,7 @@ config X86_32
> select MODULES_USE_ELF_REL
> select OLD_SIGACTION
> select GENERIC_VDSO_32
> + select ARCH_HAVE_RCU_PREEMPT_DEEPTH
>
> config X86_64
> def_bool y
> @@ -31,6 +32,7 @@ config X86_64
> select NEED_DMA_MAP_STATE
> select SWIOTLB
> select ARCH_HAS_SYSCALL_WRAPPER
> + select ARCH_HAVE_RCU_PREEMPT_DEEPTH
>
> config FORCE_DYNAMIC_FTRACE
> def_bool y
> diff --git a/arch/x86/include/asm/rcu_preempt_depth.h b/arch/x86/include/asm/rcu_preempt_depth.h
> new file mode 100644
> index 000000000000..88010ad59c20
> --- /dev/null
> +++ b/arch/x86/include/asm/rcu_preempt_depth.h
> @@ -0,0 +1,87 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __ASM_RCU_PREEMPT_DEPTH_H
> +#define __ASM_RCU_PREEMPT_DEPTH_H
> +
> +#include <asm/rmwcc.h>
> +#include <asm/percpu.h>
> +
> +#ifdef CONFIG_PREEMPT_RCU
> +DECLARE_PER_CPU(int, __rcu_preempt_depth);
> +
> +/*
> + * We use the RCU_NEED_SPECIAL bit as an inverted need_special
> + * such that a decrement hitting 0 means we can and should do
> + * rcu_read_unlock_special().
> + */
> +#define RCU_NEED_SPECIAL 0x80000000
> +
> +#define INIT_RCU_PREEMPT_DEPTH (RCU_NEED_SPECIAL)
> +
> +/* We mask the RCU_NEED_SPECIAL bit so that it returns the real depth */
> +static __always_inline int rcu_preempt_depth(void)
> +{
> + return raw_cpu_read_4(__rcu_preempt_depth) & ~RCU_NEED_SPECIAL;

Why not raw_cpu_generic_read()?

OK, OK, I get that raw_cpu_read_4() translates directly into an "mov" instruction on x86, but given that x86 percpu_from_op() is able to adjust based on operand size, why doesn't something like raw_cpu_read() also have an x86-specific definition that adjusts based on operand size?

> +}
> +
> +static __always_inline void rcu_preempt_depth_set(int pc)
> +{
> + int old, new;
> +
> + do {
> + old = raw_cpu_read_4(__rcu_preempt_depth);
> + new = (old & RCU_NEED_SPECIAL) |
> + (pc & ~RCU_NEED_SPECIAL);
> + } while (raw_cpu_cmpxchg_4(__rcu_preempt_depth, old, new) != old);

Ummm... OK, as you know, I have long wanted _rcu_read_lock() to be inlineable. But are you -sure- that an x86 cmpxchg is faster than a function call and return? I have strong doubts on that score.
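For anyone who wants to poke at that question outside the kernel, here is a rough userspace stand-in (sketch only, made-up names, no real per-CPU machinery behind it) that pits a deliberately non-inlined call doing a plain increment against an inlined compare-and-exchange loop on the same variable. It only approximates what the kernel paths compile to, so treat any numbers as ballpark at best.

/*
 * Sketch: out-of-line call + increment vs. inlined cmpxchg loop.
 * Build with something like "gcc -O2 cmpxchg_vs_call.c".
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static volatile int fake_depth;         /* stands in for the counter */

/* stand-in for an out-of-line call that just does the increment */
__attribute__((noinline)) static void depth_inc_call(void)
{
        fake_depth++;
}

/* stand-in for an inlined cmpxchg-based update */
static inline void depth_set_cmpxchg(int val)
{
        int old = fake_depth;
        int new;

        do {
                new = (old & (int)0x80000000) | (val & 0x7fffffff);
        } while (!__atomic_compare_exchange_n(&fake_depth, &old, new, 0,
                                              __ATOMIC_RELAXED,
                                              __ATOMIC_RELAXED));
}

static uint64_t now_ns(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
        const long n = 100000000L;
        uint64_t t0, t1;
        long i;

        t0 = now_ns();
        for (i = 0; i < n; i++)
                depth_inc_call();
        t1 = now_ns();
        printf("call + ret + increment: %.2f ns/op\n", (double)(t1 - t0) / n);

        t0 = now_ns();
        for (i = 0; i < n; i++)
                depth_set_cmpxchg((int)(i & 0x7fffffff));
        t1 = now_ns();
        printf("inlined cmpxchg loop:   %.2f ns/op\n", (double)(t1 - t0) / n);

        return 0;
}

Only an in-kernel measurement of the real fast paths would settle the question, of course.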
Plus multiplying the x86-specific code by 26 doesn't look good.

And the RCU read-side nesting depth really is a per-task thing. Copying it to and from the task at context-switch time might make sense if we had a serious optimization, but it does not appear that we do. Your original patch some years back, ill-received though it was at the time, is looking rather good by comparison. Plus it did not require architecture-specific code!

							Thanx, Paul

> +}
> +
> +/*
> + * We fold the RCU_NEED_SPECIAL bit into the rcu_preempt_depth such that
> + * rcu_read_unlock() can decrement and test for needing to do special
> + * work with a single instruction.
> + *
> + * We invert the actual bit, so that when the decrement hits 0 we know
> + * both that it just exited the outermost rcu_read_lock() critical section
> + * and that we need to do special work (the bit is cleared) if it doesn't
> + * need to be deferred.
> + */
> +
> +static inline void set_rcu_preempt_need_special(void)
> +{
> + raw_cpu_and_4(__rcu_preempt_depth, ~RCU_NEED_SPECIAL);
> +}
> +
> +/*
> + * irqs need to be disabled for clearing any bits of ->rcu_read_unlock_special
> + * and for calling this function. Otherwise it may clear the work done
> + * by set_rcu_preempt_need_special() in an interrupt.
> + */
> +static inline void clear_rcu_preempt_need_special(void)
> +{
> + raw_cpu_or_4(__rcu_preempt_depth, RCU_NEED_SPECIAL);
> +}
> +
> +static __always_inline void rcu_preempt_depth_inc(void)
> +{
> + raw_cpu_add_4(__rcu_preempt_depth, 1);
> +}
> +
> +static __always_inline bool rcu_preempt_depth_dec_and_test(void)
> +{
> + return GEN_UNARY_RMWcc("decl", __rcu_preempt_depth, e, __percpu_arg([var]));
> +}
> +
> +/* must be macros to avoid header recursion hell */
> +#define save_restore_rcu_preempt_depth(prev_p, next_p) do { \
> + prev_p->rcu_read_lock_nesting = this_cpu_read(__rcu_preempt_depth); \
> + this_cpu_write(__rcu_preempt_depth, next_p->rcu_read_lock_nesting); \
> + } while (0)
> +
> +#define DEFINE_PERCPU_RCU_PREEMP_DEPTH \
> + DEFINE_PER_CPU(int, __rcu_preempt_depth) = INIT_RCU_PREEMPT_DEPTH; \
> + EXPORT_PER_CPU_SYMBOL(__rcu_preempt_depth)
> +#else /* #ifdef CONFIG_PREEMPT_RCU */
> +#define save_restore_rcu_preempt_depth(prev_p, next_p) do {} while (0)
> +#define DEFINE_PERCPU_RCU_PREEMP_DEPTH /* empty */
> +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
> +
> +#endif /* __ASM_RCU_PREEMPT_DEPTH_H */
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 9ae7d1bcd4f4..0151737e196c 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -46,6 +46,7 @@
> #include <asm/asm.h>
> #include <asm/bugs.h>
> #include <asm/cpu.h>
> +#include <asm/rcu_preempt_depth.h>
> #include <asm/mce.h>
> #include <asm/msr.h>
> #include <asm/pat.h>
> @@ -1633,6 +1634,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
> DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
> EXPORT_PER_CPU_SYMBOL(__preempt_count);
>
> +/* close to __preempt_count */
> +DEFINE_PERCPU_RCU_PREEMP_DEPTH;
> +
> /* May not be marked __init: used by software suspend */
> void syscall_init(void)
> {
> @@ -1690,6 +1694,9 @@ EXPORT_PER_CPU_SYMBOL(current_task);
> DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
> EXPORT_PER_CPU_SYMBOL(__preempt_count);
>
> +/* close to __preempt_count */
> +DEFINE_PERCPU_RCU_PREEMP_DEPTH;
> +
> /*
> * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
> * the top of the kernel stack. Use an extra percpu variable to track the
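For comparison with the "per-task thing" point above, the arch-independent direction being alluded to (inlining the fast paths on top of the existing per-task counter, with no per-architecture header at all) would look roughly like the sketch below. This is a rough reconstruction for illustration, not the actual earlier patch, and it assumes the usual kernel context (current, READ_ONCE(), and an out-of-line rcu_read_unlock_special()):

/* Sketch only: per-task fast paths, no arch-specific code. */
static inline void __rcu_read_lock(void)
{
        current->rcu_read_lock_nesting++;
        barrier();  /* critical section after entry code. */
}

static inline void __rcu_read_unlock(void)
{
        barrier();  /* critical section before exit code. */
        if (--current->rcu_read_lock_nesting == 0) {
                barrier();  /* ->nesting decrement before ->special load. */
                if (unlikely(READ_ONCE(current->rcu_read_unlock_special.s)))
                        rcu_read_unlock_special(current);
        }
}

The fast path then costs a dereference of current plus an increment or decrement of a task_struct field, and nothing needs to be saved or restored at context switch.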
> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
> index b8ceec4974fe..ab1f20353663 100644
> --- a/arch/x86/kernel/process_32.c
> +++ b/arch/x86/kernel/process_32.c
> @@ -51,6 +51,7 @@
> #include <asm/cpu.h>
> #include <asm/syscalls.h>
> #include <asm/debugreg.h>
> +#include <asm/rcu_preempt_depth.h>
> #include <asm/switch_to.h>
> #include <asm/vm86.h>
> #include <asm/resctrl_sched.h>
> @@ -290,6 +291,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
> if (prev->gs | next->gs)
> lazy_load_gs(next->gs);
>
> + save_restore_rcu_preempt_depth(prev_p, next_p);
> this_cpu_write(current_task, next_p);
>
> switch_fpu_finish(next_fpu);
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index af64519b2695..2e1c6e829d30 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -50,6 +50,7 @@
> #include <asm/ia32.h>
> #include <asm/syscalls.h>
> #include <asm/debugreg.h>
> +#include <asm/rcu_preempt_depth.h>
> #include <asm/switch_to.h>
> #include <asm/xen/hypervisor.h>
> #include <asm/vdso.h>
> @@ -559,6 +560,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
>
> x86_fsgsbase_load(prev, next);
>
> + save_restore_rcu_preempt_depth(prev_p, next_p);
> /*
> * Switch the PDA and FPU contexts.
> */
> diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
> index a35daab95d14..0d2abf08b694 100644
> --- a/include/linux/rcupdate.h
> +++ b/include/linux/rcupdate.h
> @@ -41,6 +41,29 @@ void synchronize_rcu(void);
>
> #ifdef CONFIG_PREEMPT_RCU
>
> +#ifdef CONFIG_ARCH_HAVE_RCU_PREEMPT_DEEPTH
> +#include <asm/rcu_preempt_depth.h>
> +
> +#ifndef CONFIG_PROVE_LOCKING
> +extern void rcu_read_unlock_special(void);
> +
> +static inline void __rcu_read_lock(void)
> +{
> + rcu_preempt_depth_inc();
> +}
> +
> +static inline void __rcu_read_unlock(void)
> +{
> + if (unlikely(rcu_preempt_depth_dec_and_test()))
> + rcu_read_unlock_special();
> +}
> +#else
> +void __rcu_read_lock(void);
> +void __rcu_read_unlock(void);
> +#endif
> +
> +#else /* #ifdef CONFIG_ARCH_HAVE_RCU_PREEMPT_DEEPTH */
> +#define INIT_RCU_PREEMPT_DEPTH (0)
> void __rcu_read_lock(void);
> void __rcu_read_unlock(void);
>
> @@ -51,6 +74,7 @@ void __rcu_read_unlock(void);
> * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
> */
> #define rcu_preempt_depth() (current->rcu_read_lock_nesting)
> +#endif /* #else #ifdef CONFIG_ARCH_HAVE_RCU_PREEMPT_DEEPTH */
>
> #else /* #ifdef CONFIG_PREEMPT_RCU */
>
> diff --git a/init/init_task.c b/init/init_task.c
> index 9e5cbe5eab7b..0a91e38fba37 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -130,7 +130,7 @@ struct task_struct init_task
> .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list),
> #endif
> #ifdef CONFIG_PREEMPT_RCU
> - .rcu_read_lock_nesting = 0,
> + .rcu_read_lock_nesting = INIT_RCU_PREEMPT_DEPTH,
> .rcu_read_unlock_special.s = 0,
> .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry),
> .rcu_blocked_node = NULL,
> diff --git a/kernel/fork.c b/kernel/fork.c
> index f9572f416126..7368d4ccb857 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1665,7 +1665,7 @@ init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid)
> static inline void rcu_copy_process(struct task_struct *p)
> {
> #ifdef CONFIG_PREEMPT_RCU
> - p->rcu_read_lock_nesting = 0;
> + p->rcu_read_lock_nesting = INIT_RCU_PREEMPT_DEPTH;
> p->rcu_read_unlock_special.s = 0;
> p->rcu_blocked_node = NULL;
> INIT_LIST_HEAD(&p->rcu_node_entry);
> diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
> index 1cc940fef17c..d2ecca49a1a4 100644
> --- a/kernel/rcu/Kconfig
> +++ b/kernel/rcu/Kconfig
> @@ -14,6 +14,9 @@ config TREE_RCU
> thousands of CPUs. It also scales down nicely to
> smaller systems.
>
> +config ARCH_HAVE_RCU_PREEMPT_DEEPTH
> + def_bool n
> +
> config PREEMPT_RCU
> bool
> default y if PREEMPTION
> diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
> index dc1af2073e25..b8922cb19884 100644
> --- a/kernel/rcu/tree_exp.h
> +++ b/kernel/rcu/tree_exp.h
> @@ -588,6 +588,7 @@ static void wait_rcu_exp_gp(struct work_struct *wp)
> }
>
> #ifdef CONFIG_PREEMPT_RCU
> +static inline void set_rcu_preempt_need_special(void);
>
> /*
> * Remote handler for smp_call_function_single(). If there is an
> @@ -644,6 +645,7 @@ static void rcu_exp_handler(void *unused)
> if (rcu_preempt_depth() > 0) {
> rdp->exp_deferred_qs = true;
> WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, true);
> + set_rcu_preempt_need_special();
> return;
> }
> }
> diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> index 21bb04fec0d2..4d958d4b179c 100644
> --- a/kernel/rcu/tree_plugin.h
> +++ b/kernel/rcu/tree_plugin.h
> @@ -82,7 +82,7 @@ static void __init rcu_bootup_announce_oddness(void)
> #ifdef CONFIG_PREEMPT_RCU
>
> static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
> -static void rcu_read_unlock_special(struct task_struct *t);
> +void rcu_read_unlock_special(void);
>
> /*
> * Tell them what RCU they are running.
> @@ -298,6 +298,7 @@ void rcu_note_context_switch(bool preempt)
> t->rcu_read_unlock_special.b.need_qs = false;
> t->rcu_read_unlock_special.b.blocked = true;
> t->rcu_blocked_node = rnp;
> + set_rcu_preempt_need_special();
>
> /*
> * Verify the CPU's sanity, trace the preemption, and
> @@ -345,6 +346,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
> /* Bias and limit values for ->rcu_read_lock_nesting. */
> #define RCU_NEST_PMAX (INT_MAX / 2)
>
> +#ifndef CONFIG_ARCH_HAVE_RCU_PREEMPT_DEEPTH
> static inline void rcu_preempt_depth_inc(void)
> {
> current->rcu_read_lock_nesting++;
> @@ -352,7 +354,12 @@ static inline void rcu_preempt_depth_inc(void)
>
> static inline bool rcu_preempt_depth_dec_and_test(void)
> {
> - return --current->rcu_read_lock_nesting == 0;
> + if (--current->rcu_read_lock_nesting == 0) {
> + /* check special after dec ->rcu_read_lock_nesting */
> + barrier();
> + return READ_ONCE(current->rcu_read_unlock_special.s);
> + }
> + return 0;
> }
>
> static inline void rcu_preempt_depth_set(int val)
> {
> current->rcu_read_lock_nesting = val;
> @@ -360,6 +367,12 @@ static inline void rcu_preempt_depth_set(int val)
> current->rcu_read_lock_nesting = val;
> }
>
> +static inline void clear_rcu_preempt_need_special(void) {}
> +static inline void set_rcu_preempt_need_special(void) {}
> +
> +#endif /* #ifndef CONFIG_ARCH_HAVE_RCU_PREEMPT_DEEPTH */
> +
> +#if !defined(CONFIG_ARCH_HAVE_RCU_PREEMPT_DEEPTH) || defined (CONFIG_PROVE_LOCKING)
> /*
> * Preemptible RCU implementation for rcu_read_lock().
> * Just increment ->rcu_read_lock_nesting, shared state will be updated
> @@ -383,18 +396,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
> */
> void __rcu_read_unlock(void)
> {
> - struct task_struct *t = current;
> -
> if (rcu_preempt_depth_dec_and_test()) {
> - barrier(); /* critical section before exit code. */
> - if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
> - rcu_read_unlock_special(t);
> + rcu_read_unlock_special();
> }
> if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
> - WARN_ON_ONCE(rcu_preempt_depth() < 0);
> + WARN_ON_ONCE(rcu_preempt_depth() < 0 ||
> + rcu_preempt_depth() > RCU_NEST_PMAX);
> }
> }
> EXPORT_SYMBOL_GPL(__rcu_read_unlock);
> +#endif /* #if !defined(CONFIG_ARCH_HAVE_RCU_PREEMPT_DEEPTH) || defined (CONFIG_PROVE_LOCKING) */
>
> /*
> * Advance a ->blkd_tasks-list pointer to the next entry, instead
> @@ -445,6 +456,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
> */
> special = t->rcu_read_unlock_special;
> t->rcu_read_unlock_special.s = 0;
> + clear_rcu_preempt_need_special();
> if (special.b.need_qs) {
> rcu_qs();
> }
> @@ -577,8 +589,9 @@ static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
> * notify RCU core processing or task having blocked during the RCU
> * read-side critical section.
> */
> -static void rcu_read_unlock_special(struct task_struct *t)
> +void rcu_read_unlock_special(void)
> {
> + struct task_struct *t = current;
> unsigned long flags;
> bool preempt_bh_were_disabled =
> !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
> @@ -610,6 +623,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
> // call rcu_read_unlock_special() and then wake_up()
> // recursively and deadlock if deferred_qs is still false.
> // To avoid it, deferred_qs has to be set beforehand.
> + // rcu_preempt_need_special is already set, so it doesn't
> + // need to call set_rcu_preempt_need_special()
> t->rcu_read_unlock_special.b.deferred_qs = true;
> if (irqs_were_disabled && use_softirq &&
> (in_interrupt() || (exp && !deferred_qs))) {
> @@ -636,6 +651,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
> }
> rcu_preempt_deferred_qs_irqrestore(t, flags);
> }
> +EXPORT_SYMBOL_GPL(rcu_read_unlock_special);
>
> /*
> * Check that the list of blocked tasks for the newly completed grace
> @@ -699,8 +715,10 @@ static void rcu_flavor_sched_clock_irq(int user)
> __this_cpu_read(rcu_data.core_needs_qs) &&
> __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
> !t->rcu_read_unlock_special.b.need_qs &&
> - time_after(jiffies, rcu_state.gp_start + HZ))
> + time_after(jiffies, rcu_state.gp_start + HZ)) {
> t->rcu_read_unlock_special.b.need_qs = true;
> + set_rcu_preempt_need_special();
> + }
> }
>
> /*
> @@ -719,6 +737,7 @@ void exit_rcu(void)
> rcu_preempt_depth_set(1);
> barrier();
> WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
> + set_rcu_preempt_need_special();
> } else if (unlikely(rcu_preempt_depth())) {
> rcu_preempt_depth_set(1);
> } else {
> --
> 2.20.1
>
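As a closing aside for readers trying to follow the inverted RCU_NEED_SPECIAL encoding quoted above: the trick can be modeled entirely in userspace. The toy below (made-up names, no per-CPU machinery, no concurrency) only demonstrates how a single decrement-and-test-for-zero distinguishes "outermost unlock with special work pending" from every other case; it is not the patch's code.

#include <stdbool.h>
#include <stdio.h>

#define RCU_NEED_SPECIAL 0x80000000u

/*
 * Toy model of the proposed per-CPU word: the read-side nesting depth in
 * the low bits, plus the *inverted* need-special flag in the top bit
 * (bit set means nothing special is pending).
 */
static unsigned int depth_word = RCU_NEED_SPECIAL;  /* INIT_RCU_PREEMPT_DEPTH */

static void toy_read_lock(void)
{
        depth_word++;                   /* models the single "incl" */
}

/* models "decl; je": true means take the rcu_read_unlock_special() path */
static bool toy_read_unlock(void)
{
        return --depth_word == 0;
}

/* models set_rcu_preempt_need_special(): clears the inverted bit */
static void toy_mark_need_special(void)
{
        depth_word &= ~RCU_NEED_SPECIAL;
}

/* models clear_rcu_preempt_need_special(): restores the inverted bit */
static void toy_clear_need_special(void)
{
        depth_word |= RCU_NEED_SPECIAL;
}

int main(void)
{
        /* Outermost unlock, nothing pending: 0x80000001 -> 0x80000000 != 0. */
        toy_read_lock();
        printf("no special pending: slow path = %d\n", toy_read_unlock());

        /* Something (say, preemption) marked the task: 0x00000001 -> 0. */
        toy_read_lock();
        toy_mark_need_special();
        printf("special pending:    slow path = %d\n", toy_read_unlock());
        toy_clear_need_special();       /* done by the real slow path */

        /* Nested unlock never hits zero, so only the outermost one checks. */
        toy_read_lock();
        toy_read_lock();
        printf("nested unlock:      slow path = %d\n", toy_read_unlock());

        return 0;
}

In the actual patch the increment and decrement are the raw_cpu_*_4() operations on __rcu_preempt_depth, and clearing the pending state is documented as requiring interrupts to be disabled, both of which the toy above deliberately ignores.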