Avoid overhead on the majority of migrate disable/enable sequences by only manipulating scheduler data (and grabbing the relevant locks) when the task actually schedules while migrate-disabled. Very large speedups were seen during a kernel build. Instead of cpuhp_pin_lock, CPU hotplug is handled by keeping a per-CPU count of the number of pinned tasks (including tasks which have not scheduled in the migrate-disabled section); takedown_cpu() will wait until that reaches zero (confirmed by take_cpu_down() in stop machine context to deal with races) before migrating tasks off of the cpu. To simplify synchronization, updating cpus_mask is no longer deferred until migrate_enable(). This lets us not have to worry about migrate_enable() missing the update if it's on the fast path (didn't schedule during the migrate disabled section). It also makes the code a bit simpler and reduces deviation from mainline. While the main motivation for this is the performance benefit, lazy migrate disable also eliminates the restriction on calling migrate_disable() while atomic but leaving the atomic region prior to calling migrate_enable() -- though this won't help with local_bh_disable() (and thus rcutorture) unless something similar is done with the recently added local_lock. Signed-off-by: Scott Wood <swood@xxxxxxxxxx> --- include/linux/cpu.h | 4 -- include/linux/sched.h | 11 +-- init/init_task.c | 4 ++ kernel/cpu.c | 97 +++++++++---------------- kernel/sched/core.c | 192 ++++++++++++++++++++----------------------------- kernel/sched/sched.h | 4 ++ lib/smp_processor_id.c | 3 + 7 files changed, 130 insertions(+), 185 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f4a772c12d14..2df500fdcbc4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -113,8 +113,6 @@ static inline void cpu_maps_update_done(void) extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); -extern void pin_current_cpu(void); -extern void unpin_current_cpu(void); #else /* CONFIG_HOTPLUG_CPU */ @@ -126,8 +124,6 @@ static inline void cpus_read_unlock(void) { } static inline void lockdep_assert_cpus_held(void) { } static inline void cpu_hotplug_disable(void) { } static inline void cpu_hotplug_enable(void) { } -static inline void pin_current_cpu(void) { } -static inline void unpin_current_cpu(void) { } #endif /* !CONFIG_HOTPLUG_CPU */ diff --git a/include/linux/sched.h b/include/linux/sched.h index ad23ab939b35..069c46dde15b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -229,6 +229,8 @@ extern long io_schedule_timeout(long timeout); extern void io_schedule(void); +int cpu_nr_pinned(int cpu); + /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode @@ -661,16 +663,13 @@ struct task_struct { cpumask_t cpus_mask; #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) int migrate_disable; - int migrate_disable_update; - int pinned_on_cpu; + bool migrate_disable_scheduled; # ifdef CONFIG_SCHED_DEBUG - int migrate_disable_atomic; + int pinned_on_cpu; # endif - #elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) # ifdef CONFIG_SCHED_DEBUG int migrate_disable; - int migrate_disable_atomic; # endif #endif #ifdef CONFIG_PREEMPT_RT_BASE @@ -2066,4 +2065,6 @@ static inline void rseq_syscall(struct pt_regs *regs) #endif +extern struct task_struct *takedown_cpu_task; + #endif diff --git a/init/init_task.c b/init/init_task.c index e402413dc47d..c0c7618fd2fb 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -81,6 +81,10 @@ struct task_struct init_task .cpus_ptr = &init_task.cpus_mask, .cpus_mask = CPU_MASK_ALL, .nr_cpus_allowed= NR_CPUS, +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) && \ + defined(CONFIG_SCHED_DEBUG) + .pinned_on_cpu = -1, +#endif .mm = NULL, .active_mm = &init_mm, .restart_block = { diff --git a/kernel/cpu.c b/kernel/cpu.c index 885a195dfbe0..0096acf1a692 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -76,11 +76,6 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { .fail = CPUHP_INVALID, }; -#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL) -static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \ - __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock); -#endif - #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map); @@ -287,55 +282,6 @@ void cpu_maps_update_done(void) #ifdef CONFIG_HOTPLUG_CPU -/** - * pin_current_cpu - Prevent the current cpu from being unplugged - */ -void pin_current_cpu(void) -{ -#ifdef CONFIG_PREEMPT_RT_FULL - struct rt_rw_lock *cpuhp_pin; - unsigned int cpu; - int ret; - -again: - cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock); - ret = __read_rt_trylock(cpuhp_pin); - if (ret) { - current->pinned_on_cpu = smp_processor_id(); - return; - } - cpu = smp_processor_id(); - preempt_lazy_enable(); - preempt_enable(); - - __read_rt_lock(cpuhp_pin); - - preempt_disable(); - preempt_lazy_disable(); - if (cpu != smp_processor_id()) { - __read_rt_unlock(cpuhp_pin); - goto again; - } - current->pinned_on_cpu = cpu; -#endif -} - -/** - * unpin_current_cpu - Allow unplug of current cpu - */ -void unpin_current_cpu(void) -{ -#ifdef CONFIG_PREEMPT_RT_FULL - struct rt_rw_lock *cpuhp_pin = this_cpu_ptr(&cpuhp_pin_lock); - - if (WARN_ON(current->pinned_on_cpu != smp_processor_id())) - cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, current->pinned_on_cpu); - - current->pinned_on_cpu = -1; - __read_rt_unlock(cpuhp_pin); -#endif -} - DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock); void cpus_read_lock(void) @@ -893,6 +839,15 @@ static int take_cpu_down(void *_param) int err, cpu = smp_processor_id(); int ret; +#ifdef CONFIG_PREEMPT_RT_BASE + /* + * If any tasks disabled migration before we got here, + * go back and sleep again. + */ + if (cpu_nr_pinned(cpu)) + return -EAGAIN; +#endif + /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) @@ -922,11 +877,10 @@ static int take_cpu_down(void *_param) return 0; } +struct task_struct *takedown_cpu_task; + static int takedown_cpu(unsigned int cpu) { -#ifdef CONFIG_PREEMPT_RT_FULL - struct rt_rw_lock *cpuhp_pin = per_cpu_ptr(&cpuhp_pin_lock, cpu); -#endif struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; @@ -939,17 +893,34 @@ static int takedown_cpu(unsigned int cpu) */ irq_lock_sparse(); -#ifdef CONFIG_PREEMPT_RT_FULL - __write_rt_lock(cpuhp_pin); +#ifdef CONFIG_PREEMPT_RT_BASE + WARN_ON_ONCE(takedown_cpu_task); + takedown_cpu_task = current; + +again: + for (;;) { + int nr_pinned; + + set_current_state(TASK_UNINTERRUPTIBLE); + nr_pinned = cpu_nr_pinned(cpu); + if (nr_pinned == 0) + break; + schedule(); + } + set_current_state(TASK_RUNNING); #endif /* * So now all preempt/rcu users must observe !cpu_active(). */ err = stop_machine_cpuslocked(take_cpu_down, NULL, cpumask_of(cpu)); +#ifdef CONFIG_PREEMPT_RT_BASE + if (err == -EAGAIN) + goto again; +#endif if (err) { -#ifdef CONFIG_PREEMPT_RT_FULL - __write_rt_unlock(cpuhp_pin); +#ifdef CONFIG_PREEMPT_RT_BASE + takedown_cpu_task = NULL; #endif /* CPU refused to die */ irq_unlock_sparse(); @@ -969,8 +940,8 @@ static int takedown_cpu(unsigned int cpu) wait_for_ap_thread(st, false); BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); -#ifdef CONFIG_PREEMPT_RT_FULL - __write_rt_unlock(cpuhp_pin); +#ifdef CONFIG_PREEMPT_RT_BASE + takedown_cpu_task = NULL; #endif /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a2d8251a30c..a11bfa7939da 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1126,7 +1126,8 @@ static int migration_cpu_stop(void *data) void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) { cpumask_copy(&p->cpus_mask, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); + if (p->cpus_ptr == &p->cpus_mask) + p->nr_cpus_allowed = cpumask_weight(new_mask); } #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) @@ -1137,8 +1138,7 @@ int __migrate_disabled(struct task_struct *p) EXPORT_SYMBOL_GPL(__migrate_disabled); #endif -static void __do_set_cpus_allowed_tail(struct task_struct *p, - const struct cpumask *new_mask) +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { struct rq *rq = task_rq(p); bool queued, running; @@ -1167,20 +1167,6 @@ static void __do_set_cpus_allowed_tail(struct task_struct *p, set_curr_task(rq, p); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -{ -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) - if (__migrate_disabled(p)) { - lockdep_assert_held(&p->pi_lock); - - cpumask_copy(&p->cpus_mask, new_mask); - p->migrate_disable_update = 1; - return; - } -#endif - __do_set_cpus_allowed_tail(p, new_mask); -} - /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -1239,7 +1225,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, } /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) + if (cpumask_test_cpu(task_cpu(p), new_mask) || + p->cpus_ptr != &p->cpus_mask) goto out; dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); @@ -3461,6 +3448,8 @@ static inline void schedule_debug(struct task_struct *prev) BUG(); } +static void migrate_disabled_sched(struct task_struct *p); + /* * __schedule() is the main scheduler function. * @@ -3531,6 +3520,9 @@ static void __sched notrace __schedule(bool preempt) rq_lock(rq, &rf); smp_mb__after_spinlock(); + if (__migrate_disabled(prev)) + migrate_disabled_sched(prev); + /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; update_rq_clock(rq); @@ -5781,6 +5773,8 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) BUG_ON(!next); put_prev_task(rq, next); + WARN_ON_ONCE(__migrate_disabled(next)); + /* * Rules for changing task_struct::cpus_mask are holding * both pi_lock and rq->lock, such that holding either @@ -7280,14 +7274,9 @@ void dump_cpu_task(int cpu) static inline void migrate_disable_update_cpus_allowed(struct task_struct *p) { - struct rq *rq; - struct rq_flags rf; - - rq = task_rq_lock(p, &rf); p->cpus_ptr = cpumask_of(smp_processor_id()); update_nr_migratory(p, -1); p->nr_cpus_allowed = 1; - task_rq_unlock(rq, p, &rf); } static inline void @@ -7305,54 +7294,35 @@ void dump_cpu_task(int cpu) void migrate_disable(void) { - struct task_struct *p = current; + preempt_disable(); - if (in_atomic() || irqs_disabled()) { + if (++current->migrate_disable == 1) { + this_rq()->nr_pinned++; + preempt_lazy_disable(); #ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic++; + WARN_ON_ONCE(current->pinned_on_cpu >= 0); + current->pinned_on_cpu = smp_processor_id(); #endif - return; - } -#ifdef CONFIG_SCHED_DEBUG - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); } -#endif - if (p->migrate_disable) { - p->migrate_disable++; - return; - } + preempt_enable(); +} +EXPORT_SYMBOL(migrate_disable); - preempt_disable(); - preempt_lazy_disable(); - pin_current_cpu(); +static void migrate_disabled_sched(struct task_struct *p) +{ + if (p->migrate_disable_scheduled) + return; migrate_disable_update_cpus_allowed(p); - p->migrate_disable = 1; - - preempt_enable(); + p->migrate_disable_scheduled = 1; } -EXPORT_SYMBOL(migrate_disable); void migrate_enable(void) { struct task_struct *p = current; - - if (in_atomic() || irqs_disabled()) { -#ifdef CONFIG_SCHED_DEBUG - p->migrate_disable_atomic--; -#endif - return; - } - -#ifdef CONFIG_SCHED_DEBUG - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } -#endif + struct rq *rq = this_rq(); + int cpu = task_cpu(p); WARN_ON_ONCE(p->migrate_disable <= 0); if (p->migrate_disable > 1) { @@ -7362,72 +7332,74 @@ void migrate_enable(void) preempt_disable(); +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(current->pinned_on_cpu != cpu); + current->pinned_on_cpu = -1; +#endif + + WARN_ON_ONCE(rq->nr_pinned < 1); + p->migrate_disable = 0; + rq->nr_pinned--; + if (rq->nr_pinned == 0 && unlikely(!cpu_active(cpu)) && + takedown_cpu_task) + wake_up_process(takedown_cpu_task); + + if (!p->migrate_disable_scheduled) + goto out; + + p->migrate_disable_scheduled = 0; + migrate_enable_update_cpus_allowed(p); - if (p->migrate_disable_update) { - struct rq *rq; + WARN_ON(smp_processor_id() != cpu); + if (!is_cpu_allowed(p, cpu)) { + struct migration_arg arg = { p }; struct rq_flags rf; - int cpu = task_cpu(p); rq = task_rq_lock(p, &rf); update_rq_clock(rq); - - __do_set_cpus_allowed_tail(p, &p->cpus_mask); + arg.dest_cpu = select_fallback_rq(cpu, p); task_rq_unlock(rq, p, &rf); - p->migrate_disable_update = 0; - - WARN_ON(smp_processor_id() != cpu); - if (!cpumask_test_cpu(cpu, &p->cpus_mask)) { - struct migration_arg arg = { p }; - struct rq_flags rf; - - rq = task_rq_lock(p, &rf); - update_rq_clock(rq); - arg.dest_cpu = select_fallback_rq(cpu, p); - task_rq_unlock(rq, p, &rf); + preempt_lazy_enable(); + preempt_enable(); - unpin_current_cpu(); - preempt_lazy_enable(); - preempt_enable(); - - /* - * Avoid sleeping with an existing non-running - * state. This will result in a spurious wakeup - * for the calling context. - */ - __set_current_state(TASK_RUNNING); + /* + * Avoid sleeping with an existing non-running + * state. This will result in a spurious wakeup + * for the calling context. + */ + __set_current_state(TASK_RUNNING); - sleeping_lock_inc(); - stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); - sleeping_lock_dec(); - return; - } + sleeping_lock_inc(); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); + sleeping_lock_dec(); + return; } - unpin_current_cpu(); + +out: preempt_lazy_enable(); preempt_enable(); } EXPORT_SYMBOL(migrate_enable); -#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) -void migrate_disable(void) +int cpu_nr_pinned(int cpu) { -#ifdef CONFIG_SCHED_DEBUG - struct task_struct *p = current; + struct rq *rq = cpu_rq(cpu); - if (in_atomic() || irqs_disabled()) { - p->migrate_disable_atomic++; - return; - } + return rq->nr_pinned; +} - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } +#elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) +static void migrate_disabled_sched(struct task_struct *p) +{ +} - p->migrate_disable++; +void migrate_disable(void) +{ +#ifdef CONFIG_SCHED_DEBUG + current->migrate_disable++; #endif barrier(); } @@ -7438,20 +7410,14 @@ void migrate_enable(void) #ifdef CONFIG_SCHED_DEBUG struct task_struct *p = current; - if (in_atomic() || irqs_disabled()) { - p->migrate_disable_atomic--; - return; - } - - if (unlikely(p->migrate_disable_atomic)) { - tracing_off(); - WARN_ON_ONCE(1); - } - WARN_ON_ONCE(p->migrate_disable <= 0); p->migrate_disable--; #endif barrier(); } EXPORT_SYMBOL(migrate_enable); +#else +static void migrate_disabled_sched(struct task_struct *p) +{ +} #endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9c1bbd67e971..2e4db417049d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -952,6 +952,10 @@ struct rq { /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; #endif + +#if defined(CONFIG_PREEMPT_RT_BASE) && defined(CONFIG_SMP) + int nr_pinned; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 60ba93fc42ce..ded40a11c867 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -23,6 +23,9 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) * Kernel threads bound to a single CPU can safely use * smp_processor_id(): */ + if (current->migrate_disable) + goto out; + if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu))) goto out; -- 1.8.3.1