It was noticed that enabling CONFIG_RCU_BOOST did not help RCU
performance because the workqueues that run the expedited GP work are
still subject to scheduling delays. This patch moves the expedited GP
work items to an RT kthread_worker.

The results were evaluated on arm64 Android devices (6GB RAM) running
a 5.10 kernel, capturing trace data during critical user journeys. The
table below compares the time synchronize_rcu_expedited() is blocked:

----------------------------------------------------------------------
|                    | Using WQ   | Using kthread_worker |   Diff    |
----------------------------------------------------------------------
| Max duration (ns)  | 372766967  | 2329671              | -99.38%   |
----------------------------------------------------------------------
| Avg duration (ns)  | 2746353.16 | 151242.311           | -94.49%   |
----------------------------------------------------------------------
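Background for reviewers: the conversion relies on the generic
kthread_worker API from <linux/kthread.h>, where each worker is backed
by exactly one dedicated kthread, so raising that kthread to SCHED_FIFO
makes every queued work item run at RT priority. The sketch below is
for illustration only and is not part of this patch; all demo_* names
are hypothetical, and the priority of 1 is an arbitrary stand-in for
the kthread_prio value the patch actually uses.

/*
 * Illustration only (not part of this patch): minimal use of the
 * kthread_worker API at RT priority.  All demo_* names are hypothetical.
 */
#include <linux/kthread.h>
#include <linux/printk.h>
#include <linux/sched.h>
#include <uapi/linux/sched/types.h>

static struct kthread_worker *demo_kworker;
static struct kthread_work demo_work;

/* Work handler: runs in the worker's single dedicated kthread. */
static void demo_work_fn(struct kthread_work *work)
{
        pr_info("ran on an RT kthread_worker\n");
}

static int demo_start_kworker(void)
{
        /* Priority 1 is an arbitrary choice for this sketch. */
        struct sched_param param = { .sched_priority = 1 };

        /* Create the worker; one kthread named "demo_kworker" backs it. */
        demo_kworker = kthread_create_worker(0, "demo_kworker");
        if (IS_ERR_OR_NULL(demo_kworker))
                return -ENOMEM;

        /* Promote the backing kthread to SCHED_FIFO, as the patch does. */
        sched_setscheduler_nocheck(demo_kworker->task, SCHED_FIFO, &param);
        return 0;
}

static void demo_run_and_wait(void)
{
        /* Queue one work item and block until it has executed. */
        kthread_init_work(&demo_work, demo_work_fn);
        kthread_queue_work(demo_kworker, &demo_work);
        kthread_flush_work(&demo_work);
}

static void demo_stop_kworker(void)
{
        /* Flushes remaining work and stops the backing kthread. */
        kthread_destroy_worker(demo_kworker);
}

Queueing with kthread_queue_work() and waiting with kthread_flush_work()
is the same flow that sync_rcu_exp_select_cpus() and
synchronize_rcu_expedited() follow after this change, replacing the
previous queue_work()/flush_work() calls on the workqueues.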
Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxx>
Cc: Tejun Heo <tj@xxxxxxxxxx>
Reported-by: Tim Murray <timmurray@xxxxxxxxxx>
Reported-by: Wei Wang <wvw@xxxxxxxxxx>
Tested-by: Kyle Lin <kylelin@xxxxxxxxxx>
Tested-by: Chunwei Lu <chunweilu@xxxxxxxxxx>
Tested-by: Lulu Wang <luluw@xxxxxxxxxx>
Signed-off-by: Kalesh Singh <kaleshsingh@xxxxxxxxxx>
---
 kernel/rcu/rcu.h      |  3 ++-
 kernel/rcu/tree.c     | 41 +++++++++++++++++++++++++++++++++++++----
 kernel/rcu/tree.h     |  3 ++-
 kernel/rcu/tree_exp.h | 35 +++++++++--------------------
 4 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 24b5f2c2de87..13d2b74bf19f 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -534,7 +534,8 @@ int rcu_get_gp_kthreads_prio(void);
 void rcu_fwd_progress_check(unsigned long j);
 void rcu_force_quiescent_state(void);
 extern struct workqueue_struct *rcu_gp_wq;
-extern struct workqueue_struct *rcu_par_gp_wq;
+extern struct kthread_worker *rcu_exp_gp_kworker;
+extern struct kthread_worker *rcu_exp_par_gp_kworker;
 #endif /* #else #ifdef CONFIG_TINY_RCU */
 
 #ifdef CONFIG_RCU_NOCB_CPU
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a4b8189455d5..bd5e672ffa5a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -4471,6 +4471,33 @@ static int rcu_pm_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+struct kthread_worker *rcu_exp_gp_kworker;
+struct kthread_worker *rcu_exp_par_gp_kworker;
+
+static void __init rcu_start_exp_gp_kworkers(void)
+{
+	const char *par_gp_kworker_name = "rcu_exp_par_gp_kthread_worker";
+	const char *gp_kworker_name = "rcu_exp_gp_kthread_worker";
+	struct sched_param param = { .sched_priority = kthread_prio };
+
+	rcu_exp_gp_kworker = kthread_create_worker(0, gp_kworker_name);
+	if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
+		pr_err("Failed to create %s!\n", gp_kworker_name);
+		return;
+	}
+
+	rcu_exp_par_gp_kworker = kthread_create_worker(0, par_gp_kworker_name);
+	if (IS_ERR_OR_NULL(rcu_exp_par_gp_kworker)) {
+		pr_err("Failed to create %s!\n", par_gp_kworker_name);
+		kthread_destroy_worker(rcu_exp_gp_kworker);
+		return;
+	}
+
+	sched_setscheduler_nocheck(rcu_exp_gp_kworker->task, SCHED_FIFO, &param);
+	sched_setscheduler_nocheck(rcu_exp_par_gp_kworker->task, SCHED_FIFO,
+				   &param);
+}
+
 /*
  * Spawn the kthreads that handle RCU's grace periods.
  */
@@ -4500,6 +4527,10 @@ static int __init rcu_spawn_gp_kthread(void)
 	rcu_spawn_nocb_kthreads();
 	rcu_spawn_boost_kthreads();
 	rcu_spawn_core_kthreads();
+
+	/* Create kthread worker for expedited GPs */
+	rcu_start_exp_gp_kworkers();
+
 	return 0;
 }
 early_initcall(rcu_spawn_gp_kthread);
@@ -4745,7 +4776,6 @@ static void __init rcu_dump_rcu_node_tree(void)
 }
 
 struct workqueue_struct *rcu_gp_wq;
-struct workqueue_struct *rcu_par_gp_wq;
 
 static void __init kfree_rcu_batch_init(void)
 {
@@ -4808,11 +4838,14 @@ void __init rcu_init(void)
 		rcutree_online_cpu(cpu);
 	}
 
-	/* Create workqueue for Tree SRCU and for expedited GPs. */
+	/*
+	 * Create workqueue for Tree SRCU.
+	 *
+	 * Expedited GPs use RT kthread_worker.
+	 * See: rcu_start_exp_gp_kworkers()
+	 */
 	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_gp_wq);
-	rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
-	WARN_ON(!rcu_par_gp_wq);
 
 	/* Fill in default value for rcutree.qovld boot parameter. */
 	/* -After- the rcu_node ->lock fields are initialized! */
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 926673ebe355..0193d67a706a 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -10,6 +10,7 @@
  */
 
 #include <linux/cache.h>
+#include <linux/kthread.h>
 #include <linux/spinlock.h>
 #include <linux/rtmutex.h>
 #include <linux/threads.h>
@@ -23,7 +24,7 @@
 /* Communicate arguments to a workqueue handler. */
 struct rcu_exp_work {
 	unsigned long rew_s;
-	struct work_struct rew_work;
+	struct kthread_work rew_work;
 };
 
 /* RCU's kthread states for tracing. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 60197ea24ceb..f5f3722c0a74 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -334,7 +334,7 @@ static bool exp_funnel_lock(unsigned long s)
  * Select the CPUs within the specified rcu_node that the upcoming
  * expedited grace period needs to wait for.
  */
-static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
+static void sync_rcu_exp_select_node_cpus(struct kthread_work *wp)
 {
 	int cpu;
 	unsigned long flags;
@@ -423,7 +423,6 @@ static void sync_rcu_exp_select_node_cpus(struct work_struct *wp)
  */
 static void sync_rcu_exp_select_cpus(void)
 {
-	int cpu;
 	struct rcu_node *rnp;
 
 	trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("reset"));
@@ -435,28 +434,27 @@ static void sync_rcu_exp_select_cpus(void)
 		rnp->exp_need_flush = false;
 		if (!READ_ONCE(rnp->expmask))
 			continue; /* Avoid early boot non-existent wq. */
-		if (!READ_ONCE(rcu_par_gp_wq) ||
+		if (!READ_ONCE(rcu_exp_par_gp_kworker) ||
 		    rcu_scheduler_active != RCU_SCHEDULER_RUNNING ||
 		    rcu_is_last_leaf_node(rnp)) {
-			/* No workqueues yet or last leaf, do direct call. */
+			/* kthread worker not started yet or last leaf, do direct call. */
 			sync_rcu_exp_select_node_cpus(&rnp->rew.rew_work);
 			continue;
 		}
-		INIT_WORK(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
-		cpu = find_next_bit(&rnp->ffmask, BITS_PER_LONG, -1);
-		/* If all offline, queue the work on an unbound CPU. */
-		if (unlikely(cpu > rnp->grphi - rnp->grplo))
-			cpu = WORK_CPU_UNBOUND;
-		else
-			cpu += rnp->grplo;
-		queue_work_on(cpu, rcu_par_gp_wq, &rnp->rew.rew_work);
+		kthread_init_work(&rnp->rew.rew_work, sync_rcu_exp_select_node_cpus);
+		/*
+		 * Use rcu_exp_par_gp_kworker, because flushing a work item from
+		 * another work item on the same kthread worker can result in
+		 * deadlock.
+		 */
+		kthread_queue_work(rcu_exp_par_gp_kworker, &rnp->rew.rew_work);
 		rnp->exp_need_flush = true;
 	}
 
 	/* Wait for workqueue jobs (if any) to complete. */
 	rcu_for_each_leaf_node(rnp)
 		if (rnp->exp_need_flush)
-			flush_work(&rnp->rew.rew_work);
+			kthread_flush_work(&rnp->rew.rew_work);
 }
 
 /*
@@ -625,7 +623,7 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
 /*
  * Work-queue handler to drive an expedited grace period forward.
  */
-static void wait_rcu_exp_gp(struct work_struct *wp)
+static void wait_rcu_exp_gp(struct kthread_work *wp)
 {
 	struct rcu_exp_work *rewp;
 
@@ -848,20 +846,17 @@ void synchronize_rcu_expedited(void)
 	} else {
 		/* Marshall arguments & schedule the expedited grace period. */
 		rew.rew_s = s;
-		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
-		queue_work(rcu_gp_wq, &rew.rew_work);
+		kthread_init_work(&rew.rew_work, wait_rcu_exp_gp);
+		kthread_queue_work(rcu_exp_gp_kworker, &rew.rew_work);
 	}
 
 	/* Wait for expedited grace period to complete. */
 	rnp = rcu_get_root();
 	wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
 		   sync_exp_work_done(s));
-	smp_mb(); /* Workqueue actions happen before return. */
+	smp_mb(); /* kthread actions happen before return. */
 
 	/* Let the next expedited grace period start. */
 	mutex_unlock(&rcu_state.exp_mutex);
-
-	if (likely(!boottime))
-		destroy_work_on_stack(&rew.rew_work);
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);

base-commit: 7a3ecddc571cc3294e5d6bb5948ff2b0cfa12735
-- 
2.35.1.1094.g7c7d902a7c-goog