On Mon, 2023-07-31 at 18:14 +0200, Peter Zijlstra wrote:
> Ha!, I was poking around the same thing. My hack below seems to (so far,
> <20 boots) help things.
>
>
> diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
> index 56c470a489c8..b083b5a30025 100644
> --- a/kernel/rcu/tasks.h
> +++ b/kernel/rcu/tasks.h
> @@ -652,7 +658,11 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
>  	t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
>  	if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
>  		return;
> -	smp_mb(); /* Ensure others see full kthread. */
> +	for (;;) {
> +		cond_resched();
> +		if (smp_load_acquire(&rtp->kthread_ptr))
> +			break;
> +	}
>  }
>
>  #ifndef CONFIG_TINY_RCU

FWIW, here's my hack which seems to fix it.

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 9b9ce09f8f35..2e76fbfff9c6 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -52,6 +52,7 @@ struct rcu_tasks_percpu {
  * @cbs_gbl_lock: Lock protecting callback list.
  * @tasks_gp_mutex: Mutex protecting grace period, needed during mid-boot dead zone.
  * @kthread_ptr: This flavor's grace-period/callback-invocation kthread.
+ * @kthread_started: Flag that indicates whether kthread has been launched.
  * @gp_func: This flavor's grace-period-wait function.
  * @gp_state: Grace period's most recent state transition (debugging).
  * @gp_sleep: Per-grace-period sleep to prevent CPU-bound looping.
@@ -92,6 +93,7 @@ struct rcu_tasks {
 	unsigned long n_ipis;
 	unsigned long n_ipis_fails;
 	struct task_struct *kthread_ptr;
+	int kthread_started;
 	rcu_tasks_gp_func_t gp_func;
 	pregp_func_t pregp_func;
 	pertask_func_t pertask_func;
@@ -582,7 +584,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
 		return;
 
 	// If the grace-period kthread is running, use it.
-	if (READ_ONCE(rtp->kthread_ptr)) {
+	if (READ_ONCE(rtp->kthread_started)) {
 		wait_rcu_gp(rtp->call_func);
 		return;
 	}
@@ -595,6 +597,7 @@ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp)
 	struct task_struct *t;
 
 	t = kthread_run(rcu_tasks_kthread, rtp, "%s_kthread", rtp->kname);
+	rtp->kthread_started = 1;
 	if (WARN_ONCE(IS_ERR(t), "%s: Could not start %s grace-period kthread, OOM is now expected behavior\n", __func__, rtp->name))
 		return;
 	smp_mb(); /* Ensure others see full kthread. */