On Tue, Dec 31 2024 at 18:07, Frederic Weisbecker wrote:
> hrtimers are migrated away from the dying CPU to any online target at
> the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers
> handling tasks involved in the CPU hotplug forward progress.
>
> However wake-ups can still be performed by the outgoing CPU after
> CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers
> being armed. Depending on several considerations (crystal ball
> power management based election, earliest timer already enqueued, timer
> migration enabled or not), the target may eventually be the current
> CPU even if offline. If that happens, the timer is eventually ignored.
>
> The most notable example is RCU which had to deal with each and every
> one of those wake-ups by deferring them to an online CPU, along with
> related workarounds:
>
> _ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying)
> _ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU)
> _ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq)
>
> The problem isn't confined to RCU though as the stop machine kthread
> (which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end
> and performs a wake up that eventually arms the deadline server timer:

Where does it report the completion?

> WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0
> CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted
> Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0
> RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0
> Call Trace:
>  ? __warn+0xcf/0x1b0
>  ? hrtimer_start_range_ns+0x289/0x2d0
>  ? hrtimer_start_range_ns+0x186/0x2d0
>  start_dl_timer+0xfc/0x150
>  enqueue_dl_entity+0x367/0x640
>  dl_server_start+0x53/0xa0
>  enqueue_task_fair+0x363/0x460
>  enqueue_task+0x3c/0x200
>  ttwu_do_activate+0x94/0x240
>  try_to_wake_up+0x315/0x600
>  complete+0x4b/0x80

You trimmed the backtrace at the wrong end. You left all the useless
register gunk around, but removed the call chain leading up to the
completion. :)

I assume it's the complete() in stomp_machine() ....
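That would presumably be the stopper thread reporting back after it has
executed the queued work (multi_cpu_stop() here). A condensed sketch of
the relevant bit of kernel/stop_machine.c, not the verbatim source:

	/*
	 * cpu_stopper_thread() runs work->fn(work->arg) and then reports
	 * back via cpu_stop_signal_done(work->done):
	 */
	static void cpu_stop_signal_done(struct cpu_stop_done *done)
	{
		/* The complete() at the bottom of the reported call trace */
		if (atomic_dec_and_test(&done->nr_todo))
			complete(&done->completion);
	}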
> Instead of providing yet another bandaid to work around the situation,
> fix it from hrtimers infrastructure instead: always migrate away a
> timer to an online target whenever it is enqueued from an offline CPU.

> +/*
> + * If the current CPU is offline and timers have been already
> + * migrated away, make sure not to enqueue locally and perform
> + * a remote retrigger as a last resort.
> + */
> +static void enqueue_hrtimer_offline(struct hrtimer *timer,
> +				    struct hrtimer_clock_base *base,
> +				    const enum hrtimer_mode mode)
> +{
> +#ifdef CONFIG_HOTPLUG_CPU
> +	struct hrtimer_cpu_base *new_cpu_base, *old_cpu_base, *this_cpu_base;
> +	struct hrtimer_clock_base *new_base;
> +	int cpu;
> +
> +	old_cpu_base = base->cpu_base;
> +	this_cpu_base = this_cpu_ptr(&hrtimer_bases);
> +
> +	if (old_cpu_base == this_cpu_base || !old_cpu_base->online) {
> +		WARN_ON_ONCE(hrtimer_callback_running(timer));
> +		cpu = cpumask_any_and(cpu_online_mask,
> +				      housekeeping_cpumask(HK_TYPE_TIMER));
> +		new_cpu_base = &per_cpu(hrtimer_bases, cpu);
> +		new_base = &new_cpu_base->clock_base[base->index];
> +		WRITE_ONCE(timer->base, &migration_base);
> +		raw_spin_unlock(&old_cpu_base->lock);
> +		raw_spin_lock(&new_cpu_base->lock);
> +		WRITE_ONCE(timer->base, new_base);
> +	} else {
> +		new_base = base;
> +		new_cpu_base = new_base->cpu_base;
> +		cpu = new_cpu_base->cpu;
> +	}
> +
> +	if (enqueue_hrtimer(timer, new_base, mode))
> +		smp_call_function_single_async(cpu, &new_cpu_base->csd);

Duh. This reimplementation of switch_hrtimer_base() is really awful. We
can be smarter than that.
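For context, switch_hrtimer_base() already implements that whole
migration dance, including parking timer->base on migration_base so
that lock_hrtimer_base() spins while the timer moves between per-CPU
bases. A condensed sketch of the existing code in kernel/time/hrtimer.c
(comments trimmed, not the verbatim source):

	static struct hrtimer_clock_base *
	switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
			    int pinned)
	{
		struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
		struct hrtimer_clock_base *new_base;
		int basenum = base->index;

		this_cpu_base = this_cpu_ptr(&hrtimer_bases);
		new_cpu_base = get_target_base(this_cpu_base, pinned);
	again:
		new_base = &new_cpu_base->clock_base[basenum];

		if (base != new_base) {
			/* A running callback must not change its base */
			if (unlikely(hrtimer_callback_running(timer)))
				return base;

			/* Park the timer on migration_base while switching locks */
			WRITE_ONCE(timer->base, &migration_base);
			raw_spin_unlock(&base->cpu_base->lock);
			raw_spin_lock(&new_base->cpu_base->lock);

			/* Fall back to local queueing if the remote target
			 * would need an expiry reprogram we cannot do. */
			if (new_cpu_base != this_cpu_base &&
			    hrtimer_check_target(timer, new_base)) {
				raw_spin_unlock(&new_base->cpu_base->lock);
				raw_spin_lock(&base->cpu_base->lock);
				new_cpu_base = this_cpu_base;
				WRITE_ONCE(timer->base, base);
				goto again;
			}
			WRITE_ONCE(timer->base, new_base);
		} else if (new_cpu_base != this_cpu_base &&
			   hrtimer_check_target(timer, new_base)) {
			new_cpu_base = this_cpu_base;
			goto again;
		}
		return new_base;
	}

The patch below hooks the offline case into get_target_base() instead,
and keeps the "fall back to local" path from bouncing the timer back to
a CPU whose base is already offline.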
Untested patch below.

Thanks,

	tglx
---
--- a/include/linux/hrtimer_defs.h
+++ b/include/linux/hrtimer_defs.h
@@ -125,6 +125,7 @@ struct hrtimer_cpu_base {
 	ktime_t				softirq_expires_next;
 	struct hrtimer			*softirq_next_timer;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
+	call_single_data_t		csd;
 } ____cacheline_aligned;
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
 #define HRTIMER_ACTIVE_SOFT	(HRTIMER_ACTIVE_HARD << MASK_SHIFT)
 #define HRTIMER_ACTIVE_ALL	(HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
 
+static void retrigger_next_event(void *arg);
+
 /*
  * The timer bases:
  *
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base,
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
 		},
-	}
+	},
+	.csd = CSD_INIT(retrigger_next_event, NULL)
 };
 
 static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -208,6 +211,13 @@ struct hrtimer_cpu_base *get_target_base
 	if (static_branch_likely(&timers_migration_enabled) && !pinned)
 		return &per_cpu(hrtimer_bases, get_nohz_timer_target());
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+	if (unlikely(!base->online)) {
+		int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+		return &per_cpu(hrtimer_bases, cpu);
+	}
+#endif
 	return base;
 }
 
@@ -254,7 +264,7 @@ switch_hrtimer_base(struct hrtimer *time
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
-		if (new_cpu_base != this_cpu_base &&
+		if (new_cpu_base != this_cpu_base && this_cpu_base->online &&
 		    hrtimer_check_target(timer, new_base)) {
 			raw_spin_unlock(&new_base->cpu_base->lock);
 			raw_spin_lock(&base->cpu_base->lock);
@@ -716,8 +726,6 @@ static inline int hrtimer_is_hres_enable
 	return hrtimer_hres_enabled;
 }
 
-static void retrigger_next_event(void *arg);
-
 /*
  * Switch to high resolution mode
  */
@@ -1206,6 +1214,7 @@ static int __hrtimer_start_range_ns(stru
 				    u64 delta_ns, const enum hrtimer_mode mode,
 				    struct hrtimer_clock_base *base)
 {
+	struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
 	struct hrtimer_clock_base *new_base;
 	bool force_local, first;
 
@@ -1217,10 +1226,16 @@ static int __hrtimer_start_range_ns(stru
 	 * and enforce reprogramming after it is queued no matter whether
 	 * it is the new first expiring timer again or not.
 	 */
-	force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+	force_local = base->cpu_base == this_cpu_base;
 	force_local &= base->cpu_base->next_timer == timer;
 
 	/*
+	 * Don't force local queuing if this enqueue happens on an unplugged
+	 * CPU after hrtimer_cpu_dying() has been invoked.
+	 */
+	force_local &= this_cpu_base->online;
+
+	/*
 	 * Remove an active timer from the queue. In case it is not queued
 	 * on the current CPU, make sure that remove_hrtimer() updates the
 	 * remote data correctly.
@@ -1249,8 +1264,17 @@ static int __hrtimer_start_range_ns(stru
 	}
 
 	first = enqueue_hrtimer(timer, new_base, mode);
-	if (!force_local)
-		return first;
+	if (!force_local) {
+		if (likely(this_cpu_base->online))
+			return first;
+
+		if (first) {
+			struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+			smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+		}
+		return 0;
+	}
 
 	/*
 	 * Timer was forced to stay on the current CPU to avoid
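For reference, the csd added to struct hrtimer_cpu_base uses the
generic asynchronous cross-call machinery from <linux/smp.h>: a
call_single_data_t bound to a handler via CSD_INIT() and fired with
smp_call_function_single_async(), which sends an IPI and runs the
handler on the target CPU. Stripped of the hrtimer specifics, the
pattern looks like this (illustrative sketch; kick_handler() and
kick_remote_cpu() are made-up names, only the smp.h APIs are real):

	#include <linux/smp.h>

	/*
	 * Runs in IPI context on the target CPU. In the patch above the
	 * handler is retrigger_next_event(), which re-evaluates the next
	 * expiring timer and reprograms the clock event device.
	 */
	static void kick_handler(void *info)
	{
	}

	static call_single_data_t kick_csd = CSD_INIT(kick_handler, NULL);

	static void kick_remote_cpu(int cpu)
	{
		/*
		 * Asynchronous: returns immediately, the handler runs on
		 * @cpu. A csd must not be fired again while a previous
		 * call on it is still pending.
		 */
		smp_call_function_single_async(cpu, &kick_csd);
	}

That is also why the new code in __hrtimer_start_range_ns() only sends
the IPI when the enqueued timer became the first expiring one on the
remote base: only then does that CPU need to reprogram its hardware.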