For virtualization use cases, util_est and util_avg as currently tracked on
the host aren't sufficient to accurately represent the workload running on
vCPU threads, which results in poor frequency selection and performance.
For example, when a large workload migrates from a busy vCPU thread to an
idle vCPU thread, it incurs additional DVFS ramp-up latency while util
re-accumulates on the destination CPU.

Introduce a new "util_guest" member as an additional PELT signal that's
independently updated by the guest. When set, it is max-aggregated with the
existing host-side signals to boost both task_util and task_util_est.

Updating task_util and task_util_est will ensure:
- Better task placement decisions for vCPU threads on the host
- Correct updates to util_est.ewma during dequeue
- Additive util with other threads on the same runqueue for more accurate
  frequency responses

Co-developed-by: Saravana Kannan <saravanak@xxxxxxxxxx>
Signed-off-by: Saravana Kannan <saravanak@xxxxxxxxxx>
Signed-off-by: David Dai <davidai@xxxxxxxxxx>
---
 include/linux/sched.h | 11 +++++++++++
 kernel/sched/core.c   | 18 +++++++++++++++++-
 kernel/sched/fair.c   | 15 +++++++++++++--
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..d8c346fcdf52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -445,6 +445,16 @@ struct util_est {
 #define UTIL_AVG_UNCHANGED		0x80000000
 } __attribute__((__aligned__(sizeof(u64))));
 
+/*
+ * For sched_setattr_nocheck() (kernel) only
+ *
+ * Allow vCPU threads to use UTIL_GUEST as a way to hint the scheduler with more
+ * accurate utilization info. This is useful when guest kernels have some way of
+ * tracking their own runqueues' utilization.
+ *
+ */
+#define SCHED_FLAG_UTIL_GUEST	0x20000000
+
 /*
  * The load/runnable/util_avg accumulates an infinite geometric series
  * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
@@ -499,6 +509,7 @@ struct sched_avg {
 	unsigned long			load_avg;
 	unsigned long			runnable_avg;
 	unsigned long			util_avg;
+	unsigned long			util_guest;
 	struct util_est			util_est;
 } ____cacheline_aligned;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d18c3969f90..7700ef5610c1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2024,6 +2024,16 @@ static inline void uclamp_post_fork(struct task_struct *p) { }
 static inline void init_uclamp(void) { }
 #endif /* CONFIG_UCLAMP_TASK */
 
+static void __setscheduler_task_util(struct task_struct *p,
+				     const struct sched_attr *attr)
+{
+
+	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_GUEST)))
+		return;
+
+	p->se.avg.util_guest = attr->sched_util_min;
+}
+
 bool sched_task_on_rq(struct task_struct *p)
 {
 	return task_on_rq_queued(p);
@@ -7561,7 +7571,7 @@ static int __sched_setscheduler(struct task_struct *p,
 			return -EINVAL;
 	}
 
-	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
+	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV | SCHED_FLAG_UTIL_GUEST))
 		return -EINVAL;
 
 	/*
@@ -7583,6 +7593,9 @@ static int __sched_setscheduler(struct task_struct *p,
 		if (attr->sched_flags & SCHED_FLAG_SUGOV)
 			return -EINVAL;
 
+		if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+			return -EINVAL;
+
 		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
@@ -7629,6 +7642,8 @@ static int __sched_setscheduler(struct task_struct *p,
 			goto change;
 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
 			goto change;
+		if (attr->sched_flags & SCHED_FLAG_UTIL_GUEST)
+			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
 		retval = 0;
@@ -7718,6 +7733,7 @@ static int __sched_setscheduler(struct task_struct *p,
 		__setscheduler_prio(p, newprio);
 	}
 	__setscheduler_uclamp(p, attr);
+	__setscheduler_task_util(p, attr);
 
 	if (queued) {
 		/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6986ea31c984..998649554344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4276,14 +4276,16 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
 
 static inline unsigned long task_util(struct task_struct *p)
 {
-	return READ_ONCE(p->se.avg.util_avg);
+	return max(READ_ONCE(p->se.avg.util_avg),
+		   READ_ONCE(p->se.avg.util_guest));
 }
 
 static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 
-	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
+	return max_t(unsigned long, READ_ONCE(p->se.avg.util_guest),
+		     max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)));
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -6242,6 +6244,15 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 */
 	util_est_enqueue(&rq->cfs, p);
 
+	/*
+	 * The normal code path for host thread enqueue doesn't take into
+	 * account guest task migrations when updating cpufreq util.
+	 * So, always update the cpufreq when a vCPU thread has a
+	 * non-zero util_guest value.
+	 */
+	if (READ_ONCE(p->se.avg.util_guest))
+		cpufreq_update_util(rq, 0);
+
 	/*
 	 * If in_iowait is set, the code below may not trigger any cpufreq
 	 * utilization updates, so do it here explicitly with the IOWAIT flag
-- 
2.40.0.348.gf938b09366-goog
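
For reference, a rough sketch (not part of the patch) of how a host-side
component might feed a guest-reported utilization value to a vCPU thread
through this interface. The helper name and the mechanism that delivers
guest_util (e.g. a hypercall or shared-memory channel) are hypothetical;
only sched_setattr_nocheck(), struct sched_attr and the SCHED_FLAG_* bits
are existing kernel interfaces, with SCHED_FLAG_UTIL_GUEST introduced above.

#include <linux/sched.h>
#include <uapi/linux/sched/types.h>

/* Hypothetical helper: apply a guest-reported util value to a vCPU thread. */
static int set_vcpu_util_guest(struct task_struct *vcpu_task,
			       unsigned long guest_util)
{
	struct sched_attr attr = {
		/* Assumes the vCPU thread runs as a plain CFS task. */
		.sched_policy	= SCHED_NORMAL,
		/* KEEP_PARAMS leaves the thread's nice/priority untouched. */
		.sched_flags	= SCHED_FLAG_UTIL_GUEST | SCHED_FLAG_KEEP_PARAMS,
		/* __setscheduler_task_util() reads the hint from sched_util_min. */
		.sched_util_min	= guest_util,
	};

	/* Kernel-only path: the patch rejects this flag for userspace callers. */
	return sched_setattr_nocheck(vcpu_task, &attr);
}

The value lands in p->se.avg.util_guest, where task_util() and
_task_util_est() max-aggregate it with util_avg/util_est, and
enqueue_task_fair() forces a cpufreq update whenever it is non-zero.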