Adjust the affinity of watchdog_cpumask and hrtimers according to
changes of housekeeping.cpumasks[HK_TYPE_TIMER].

Function migrate_hrtimer_list_except() is prototyped from
migrate_hrtimer_list() and is more generic. Potentially it can be
used instead of migrate_hrtimer_list().

Function hrtimers_resettle_from_cpu() is blindly prototyped from
hrtimers_cpu_dying(). local_irq_disable() is used because
cpuhp_thread_fun() uses it before cpuhp_invoke_callback().

Core test snippets without infrastructure:

1. Create an hrtimer on a specific CPU (a self-contained module
   sketch follows this list):

	set_cpus_allowed_ptr(current, cpumask_of(test_cpu));
	hrtimer_init(&test_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	test_hrtimer.function = test_hrtimer_cb;
	hrtimer_start(&test_hrtimer, -1, HRTIMER_MODE_REL);

2. Call housekeeping_update()

3. Ensure that only tick_nohz_handler remains on the specified CPU
   in /proc/timer_list, either manually or with a script:

	grep -E 'cpu| #[0-9]' /proc/timer_list | \
		awk "/cpu:/{y=0};/cpu: $test_cpu\$/{y=1};y"

An alternative solution for migrating the hrtimers:

1. Use cpuhp to take sched_timer offline
2. Resettle all hrtimers in the manner of migrate_hrtimer_list()
3. Use cpuhp to bring sched_timer back online
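A minimal self-contained sketch of the test module for step 1. The
following details are assumptions for illustration, not part of this
patch: test_cpu is taken as a module parameter, and a far-future
expiry replaces the -1 above so the timer is guaranteed to stay
queued while step 2 runs. housekeeping_update() still has to be
triggered separately.

	#include <linux/module.h>
	#include <linux/hrtimer.h>
	#include <linux/ktime.h>
	#include <linux/sched.h>
	#include <linux/cpumask.h>

	static int test_cpu;
	module_param(test_cpu, int, 0444);

	static struct hrtimer test_hrtimer;

	static enum hrtimer_restart test_hrtimer_cb(struct hrtimer *timer)
	{
		/* Not expected to fire during the test: expiry is far away. */
		return HRTIMER_NORESTART;
	}

	static int __init test_init(void)
	{
		/* Pin the current task so the timer is enqueued on test_cpu. */
		set_cpus_allowed_ptr(current, cpumask_of(test_cpu));

		hrtimer_init(&test_hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		test_hrtimer.function = test_hrtimer_cb;
		/* One-hour expiry keeps the timer queued for the whole test. */
		hrtimer_start(&test_hrtimer, ktime_set(3600, 0), HRTIMER_MODE_REL);
		return 0;
	}

	static void __exit test_exit(void)
	{
		hrtimer_cancel(&test_hrtimer);
	}

	module_init(test_init);
	module_exit(test_exit);
	MODULE_LICENSE("GPL");

After loading the module and performing step 2, the queued
test_hrtimer should show up under a housekeeping CPU instead of
test_cpu in /proc/timer_list (step 3).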
Signed-off-by: Costa Shulyupin <costa.shul@xxxxxxxxxx>
---
 include/linux/hrtimer.h  |  2 +
 kernel/sched/isolation.c |  2 +
 kernel/time/hrtimer.c    | 81 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+)

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index aa1e65ccb6158..004632fc7d643 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -381,8 +381,10 @@ extern void sysrq_timer_list_show(void);
 int hrtimers_prepare_cpu(unsigned int cpu);
 #ifdef CONFIG_HOTPLUG_CPU
 int hrtimers_cpu_dying(unsigned int cpu);
+void hrtimers_resettle_from_cpu(unsigned int cpu);
 #else
 #define hrtimers_cpu_dying NULL
+static inline void hrtimers_resettle_from_cpu(unsigned int cpu) { }
 #endif
 
 #endif

diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 3b63f0212887e..85a17d39d8bb0 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -126,10 +126,12 @@ static void resettle_all_timers(cpumask_var_t enable_mask, cpumask_var_t disable
 
 	for_each_cpu(cpu, enable_mask) {
 		timers_prepare_cpu(cpu);
+		hrtimers_prepare_cpu(cpu);
 	}
 
 	for_each_cpu(cpu, disable_mask) {
 		timers_resettle_from_cpu(cpu);
+		hrtimers_resettle_from_cpu(cpu);
 	}
 }
 

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 492c14aac642b..7e71ebbb72348 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2201,6 +2201,87 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	}
 }
 
+/*
+ * migrate_hrtimer_list_except - migrate hrtimers from one clock base
+ * to another, except for the specified timer.
+ */
+static void migrate_hrtimer_list_except(struct hrtimer_clock_base *old_base,
+		struct hrtimer_clock_base *new_base, struct hrtimer *except)
+{
+	struct hrtimer *timer;
+	struct timerqueue_node *node;
+
+	node = timerqueue_getnext(&old_base->active);
+	while (node) {
+		timer = container_of(node, struct hrtimer, node);
+		node = timerqueue_iterate_next(node);
+		if (timer == except)
+			continue;
+
+		BUG_ON(hrtimer_callback_running(timer));
+		debug_deactivate(timer);
+
+		/*
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
+		 * timer could be seen as !active and just vanish away
+		 * under us on another CPU
+		 */
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
+		timer->base = new_base;
+		/*
+		 * Enqueue the timers on the new cpu. This does not
+		 * reprogram the event device in case the timer
+		 * expires before the earliest on this CPU, but we run
+		 * hrtimer_interrupt after we migrated everything to
+		 * sort out already expired timers and reprogram the
+		 * event device.
+		 */
+		enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS);
+	}
+}
+
+/**
+ * hrtimers_resettle_from_cpu - resettle hrtimers from the specified CPU
+ * @isol_cpu: isolated CPU whose hrtimers are moved to housekeeping CPUs
+ */
+void hrtimers_resettle_from_cpu(unsigned int isol_cpu)
+{
+	int ncpu, i;
+	struct tick_sched *ts = tick_get_tick_sched(isol_cpu);
+	struct hrtimer_cpu_base *old_base, *new_base;
+
+	local_irq_disable();
+	ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+	old_base = &per_cpu(hrtimer_bases, isol_cpu);
+	new_base = &per_cpu(hrtimer_bases, ncpu);
+
+	/*
+	 * The caller is globally serialized and nobody else
+	 * takes two locks at once, deadlock is not possible.
+	 */
+	raw_spin_lock(&old_base->lock);
+	raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
+	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
+		migrate_hrtimer_list_except(&old_base->clock_base[i],
+				&new_base->clock_base[i],
+				&ts->sched_timer);
+	}
+
+	/*
+	 * The migration might have changed the first expiring softirq
+	 * timer on this CPU. Update it.
+	 */
+	__hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+
+	raw_spin_unlock(&new_base->lock);
+	raw_spin_unlock(&old_base->lock);
+	local_irq_enable();
+
+	/* Tell the other CPU to retrigger the next event */
+	smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
+}
+
 int hrtimers_cpu_dying(unsigned int dying_cpu)
 {
 	int i, ncpu = cpumask_any_and(cpu_active_mask, housekeeping_cpumask(HK_TYPE_TIMER));
-- 
2.45.0