On Mon, Jul 06, 2020 at 03:28:38PM +0100, Qais Yousef wrote: > +static void __uclamp_sync_util_min_rt_default(struct task_struct *p) > +{ > + unsigned int default_util_min; > + struct uclamp_se *uc_se; > + > + WARN_ON_ONCE(!rcu_read_lock_held()); > + > + if (!rt_task(p)) > + return; > + > + uc_se = &p->uclamp_req[UCLAMP_MIN]; > + > + /* Only sync if user didn't override the default */ > + if (uc_se->user_defined) > + return; > + > + /* Sync with smp_wmb() in uclamp_sync_util_min_rt_default() */ > + smp_rmb(); > + default_util_min = sysctl_sched_uclamp_util_min_rt_default; > + uclamp_se_set(uc_se, default_util_min, false); > +} > + > +static void uclamp_sync_util_min_rt_default(void) > +{ > + struct task_struct *g, *p; > + > + /* > + * Make sure the updated sysctl_sched_uclamp_util_min_rt_default which > + * was just written is synchronized against any future read on another > + * cpu. > + */ > + smp_wmb(); > + > + /* > + * Wait for all updaters to observe the new change. > + * > + * There are 2 races to deal with here: > + * > + * 1. fork()->copy_process() > + * > + * If a task was concurrently forking, for_each_process_thread() > + * will not see it, hence it could have copied the old value and > + * we missed the opportunity to update it. > + * > + * This should be handled by sched_post_fork() where it'll ensure > + * it performs the sync after the fork. > + * > + * 2. fork()->sched_post_fork() > + * __setscheduler_uclamp() > + * > + * Both of these functions could read the old value but then get > + * preempted, during which a user might write new value to > + * sysctl_sched_uclamp_util_min_rt_default. > + * > + * // read sysctl_sched_uclamp_util_min_rt_default; > + * // PREEMPT-OUT > + * . > + * . <-- sync happens here > + * . > + * // PREEMPT-IN > + * // write p->uclamp_req[UCLAMP_MIN] > + * > + * That section is protected with rcu_read_lock(), so > + * synchronize_rcu() will guarantee it has finished before we > + * perform the update. 
Hence ensure that this sync happens after > + * any concurrent sync which should guarantee correctness. > + */ > + synchronize_rcu(); > + > + rcu_read_lock(); > + for_each_process_thread(g, p) > + __uclamp_sync_util_min_rt_default(p); > + rcu_read_unlock(); > +} It's Monday, and I cannot get my brain working.. I cannot decipher the comments you have with the smp_[rw]mb(), what actual ordering do they enforce? Also, your synchronize_rcu() relies on write_lock() being non-preemptible, which isn't true on PREEMPT_RT. The below seems simpler... --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1013,8 +1013,6 @@ static void __uclamp_sync_util_min_rt_de unsigned int default_util_min; struct uclamp_se *uc_se; - WARN_ON_ONCE(!rcu_read_lock_held()); - if (!rt_task(p)) return; @@ -1024,8 +1022,6 @@ static void __uclamp_sync_util_min_rt_de if (uc_se->user_defined) return; - /* Sync with smp_wmb() in uclamp_sync_util_min_rt_default() */ - smp_rmb(); default_util_min = sysctl_sched_uclamp_util_min_rt_default; uclamp_se_set(uc_se, default_util_min, false); } @@ -1035,47 +1031,21 @@ static void uclamp_sync_util_min_rt_defa struct task_struct *g, *p; /* - * Make sure the updated sysctl_sched_uclamp_util_min_rt_default which - * was just written is synchronized against any future read on another - * cpu. - */ - smp_wmb(); - - /* - * Wait for all updaters to observe the new change. - * - * There are 2 races to deal with here: - * - * 1. fork()->copy_process() - * - * If a task was concurrently forking, for_each_process_thread() - * will not see it, hence it could have copied the old value and - * we missed the opportunity to update it. - * - * This should be handled by sched_post_fork() where it'll ensure - * it performs the sync after the fork. - * - * 2. 
fork()->sched_post_fork() - * __setscheduler_uclamp() - * - * Both of these functions could read the old value but then get - * preempted, during which a user might write new value to - * sysctl_sched_uclamp_util_min_rt_default. - * - * // read sysctl_sched_uclamp_util_min_rt_default; - * // PREEMPT-OUT - * . - * . <-- sync happens here - * . - * // PREEMPT-IN - * // write p->uclamp_req[UCLAMP_MIN] - * - * That section is protected with rcu_read_lock(), so - * synchronize_rcu() will guarantee it has finished before we - * perform the update. Hence ensure that this sync happens after - * any concurrent sync which should guarantee correctness. - */ - synchronize_rcu(); + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); rcu_read_lock(); for_each_process_thread(g, p) @@ -1408,6 +1378,9 @@ int sysctl_sched_uclamp_handler(struct c uclamp_update_root_tg(); } + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) + uclamp_sync_util_min_rt_default(); + /* * We update all RUNNABLE tasks only when task groups are in use. * Otherwise, keep it simple and do just a lazy update at each next @@ -1466,9 +1439,7 @@ static void __setscheduler_uclamp(struct * at runtime. 
*/ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) { - rcu_read_lock(); __uclamp_sync_util_min_rt_default(p); - rcu_read_unlock(); } else { uclamp_se_set(uc_se, uclamp_none(clamp_id), false); } @@ -1521,6 +1492,11 @@ static void __init init_uclamp_rq(struct rq->uclamp_flags = 0; } +static void uclamp_post_fork(struct task_struct *p) +{ + __uclamp_sync_util_min_rt_default(p); +} + static void __init init_uclamp(void) { struct uclamp_se uc_max = {};