On Tue, 18 Feb 2025 at 21:27, Fernand Sieber <sieberf@xxxxxxxxxx> wrote:
>
> With guest hlt/mwait/pause pass-through, the scheduler has no visibility into
> real vCPU activity as it sees them all 100% active. As such, load balancing
> cannot make informed decisions on where it is preferable to collocate
> tasks when necessary. I.e. as far as the load balancer is concerned, a
> halted vCPU and an idle polling vCPU look exactly the same, so it may decide
> that either should be preempted when in reality it would be preferable to
> preempt the idle one.
>
> This commit enlightens the scheduler to real guest activity in this
> situation. Leveraging gtime unhalted, it adds a hook for KVM to communicate
> to the scheduler the duration that a vCPU spends halted. This is then used in
> PELT accounting to discount it from real activity. This results in better
> placement and an overall reduction in steal time.

NAK. PELT accounts for the time an se spends on the CPU. If your
thread/vCPU doesn't do anything but burn cycles, find another way to
report that to the host.

Furthermore, this breaks all the hierarchy dependencies.
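To illustrate the first point with a toy userspace model (a
simplification with PELT's constants hardcoded, not the kernel's
segmented implementation): PELT decays its sums by a factor y per
~1ms period, with y^32 = 0.5, so a task that occupies the CPU in
every period converges to full utilization no matter how much of that
time the guest spent in hlt. The 50%-halted workload below is a
made-up example of the signal the patch is after:

/* pelt_model.c - toy PELT model; build with: gcc pelt_model.c -lm */
#include <stdio.h>
#include <math.h>

/* One ~1ms PELT period: decay the signal, accrue if running. */
static double step(double util, int running, double y)
{
        return util * y + (running ? 1024.0 * (1.0 - y) : 0.0);
}

int main(void)
{
        const double y = pow(0.5, 1.0 / 32.0); /* y^32 = 0.5, 32ms half-life */
        double passthrough = 0.0, discounted = 0.0;
        int t;

        for (t = 0; t < 1000; t++) {
                /* Pass-through: a halted vCPU still occupies the CPU,
                 * so PELT sees it running in every period. */
                passthrough = step(passthrough, 1, y);
                /* The patch's intent: periods the guest spent halted
                 * are accounted as idle (here, every other period). */
                discounted = step(discounted, t % 2, y);
        }
        printf("pass-through util: %4.0f / 1024\n", passthrough); /* ~1024 */
        printf("discounted util:   %4.0f / 1024\n", discounted);  /* ~512 */
        return 0;
}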
> This initial implementation assumes that non-idle CPUs are ticking, as it
> hooks the unhalted time into the PELT decaying load accounting. As such it
> doesn't work well if PELT is updated infrequently with large chunks of
> halted time. This is not a fundamental limitation, but more complex
> accounting is needed to generalize the use case to nohz_full.
> ---
>  arch/x86/kvm/x86.c    |  8 ++++++--
>  include/linux/sched.h |  4 ++++
>  kernel/sched/core.c   |  1 +
>  kernel/sched/fair.c   | 25 +++++++++++++++++++++++++
>  kernel/sched/pelt.c   | 42 +++++++++++++++++++++++++++++++++++-------
>  kernel/sched/sched.h  |  2 ++
>  6 files changed, 73 insertions(+), 9 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 46975b0a63a5..156cf05b325f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -10712,6 +10712,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>         int r;
>         unsigned long long cycles, cycles_start = 0;
>         unsigned long long unhalted_cycles, unhalted_cycles_start = 0;
> +       unsigned long long halted_cycles_ns = 0;
>         bool req_int_win =
>                 dm_request_for_irq_injection(vcpu) &&
>                 kvm_cpu_accept_dm_intr(vcpu);
> @@ -11083,8 +11084,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
>                 cycles = get_cycles() - cycles_start;
>                 unhalted_cycles = get_unhalted_cycles() -
>                         unhalted_cycles_start;
> -               if (likely(cycles > unhalted_cycles))
> -                       current->gtime_halted += cycles2ns(cycles - unhalted_cycles);
> +               if (likely(cycles > unhalted_cycles)) {
> +                       halted_cycles_ns = cycles2ns(cycles - unhalted_cycles);
> +                       current->gtime_halted += halted_cycles_ns;
> +                       sched_account_gtime_halted(current, halted_cycles_ns);
> +               }
>         }
>
>         local_irq_enable();
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 5f6745357e20..5409fac152c9 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -367,6 +367,8 @@ struct vtime {
>         u64                     gtime;
>  };
>
> +extern void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted);
> +
>  /*
>   * Utilization clamp constraints.
>   * @UCLAMP_MIN: Minimum utilization
> @@ -588,6 +590,8 @@ struct sched_entity {
>          */
>         struct sched_avg        avg;
>  #endif
> +
> +       u64                     gtime_halted;
>  };
>
>  struct sched_rt_entity {
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9aecd914ac69..1f3ced2b2636 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4487,6 +4487,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>         p->se.nr_migrations             = 0;
>         p->se.vruntime                  = 0;
>         p->se.vlag                      = 0;
> +       p->se.gtime_halted              = 0;
>         INIT_LIST_HEAD(&p->se.group_node);
>
>         /* A delayed task cannot be in clone(). */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1c0ef435a7aa..5ff52711d459 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -13705,4 +13705,29 @@ __init void init_sched_fair_class(void)
>  #endif
>  #endif /* SMP */
>
> +
> +}
> +
> +#ifdef CONFIG_NO_HZ_FULL
> +void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted)
> +{
>  }
> +#else
> +/*
> + * The implementation hooking into PELT requires regular updates of
> + * gtime_halted. This is guaranteed unless we run on CONFIG_NO_HZ_FULL.
> + */
> +void sched_account_gtime_halted(struct task_struct *p, u64 gtime_halted)
> +{
> +       struct sched_entity *se = &p->se;
> +
> +       if (unlikely(!gtime_halted))
> +               return;
> +
> +       for_each_sched_entity(se) {
> +               se->gtime_halted += gtime_halted;
> +               se->cfs_rq->gtime_halted += gtime_halted;
> +       }
> +}
> +#endif
> +EXPORT_SYMBOL(sched_account_gtime_halted);
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index 7a8534a2deff..9f96b7c46c00 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -305,10 +305,23 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
>
>  int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
>  {
> -       if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
> -                               cfs_rq->curr == se)) {
>
> -               ___update_load_avg(&se->avg, se_weight(se));
> +       int ret = 0;
> +       u64 delta = now - se->avg.last_update_time;
> +       u64 gtime_halted = min(delta, se->gtime_halted);
> +
> +       ret = ___update_load_sum(now - gtime_halted, &se->avg, !!se->on_rq, se_runnable(se),
> +                                cfs_rq->curr == se);
> +
> +       if (gtime_halted) {
> +               ret |= ___update_load_sum(now, &se->avg, 0, 0, 0);
> +               se->gtime_halted -= gtime_halted;
> +
> +               /* decay residual halted time */
> +               if (ret && se->gtime_halted)
> +                       se->gtime_halted = decay_load(se->gtime_halted, delta / 1024);
> +       }
> +
> +       if (ret) {
>                 cfs_se_util_change(&se->avg);
>                 trace_pelt_se_tp(se);
>                 return 1;
> @@ -319,10 +332,25 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
>
>  int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
>  {
> -       if (___update_load_sum(now, &cfs_rq->avg,
> -                       scale_load_down(cfs_rq->load.weight),
> -                       cfs_rq->h_nr_runnable,
> -                       cfs_rq->curr != NULL)) {
> +       int ret = 0;
> +       u64 delta = now - cfs_rq->avg.last_update_time;
> +       u64 gtime_halted = min(delta, cfs_rq->gtime_halted);
> +
> +       ret = ___update_load_sum(now - gtime_halted, &cfs_rq->avg,
> +                       scale_load_down(cfs_rq->load.weight),
> +                       cfs_rq->h_nr_runnable,
> +                       cfs_rq->curr != NULL);
> +
> +       if (gtime_halted) {
> +               ret |= ___update_load_sum(now, &cfs_rq->avg, 0, 0, 0);
> +               cfs_rq->gtime_halted -= gtime_halted;
> +
> +               /* decay any residual halted time */
> +               if (ret && cfs_rq->gtime_halted)
> +                       cfs_rq->gtime_halted = decay_load(cfs_rq->gtime_halted, delta / 1024);
> +       }
> +
> +       if (ret) {
>
>                 ___update_load_avg(&cfs_rq->avg, 1);
>                 trace_pelt_cfs_tp(cfs_rq);
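For reference, here is roughly what the two ___update_load_sum() calls
amount to, expressed against the toy model above (step() as defined
earlier; the whole-period granularity and all names here are
assumptions of the sketch, the kernel works on ns and sub-period
segments):

/* Two-phase update per the patch: the active part of the elapsed
 * window accrues as running time, the halted part only decays. */
static double update_with_halted(double util, unsigned int delta_periods,
                                 unsigned int *halted_periods, double y)
{
        /* Mirrors "gtime_halted = min(delta, se->gtime_halted)". */
        unsigned int halted = *halted_periods < delta_periods ?
                              *halted_periods : delta_periods;
        unsigned int i;

        /* First call, ___update_load_sum(now - gtime_halted, ...):
         * advance with the real running state. */
        for (i = 0; i < delta_periods - halted; i++)
                util = step(util, 1, y);

        /* Second call, ___update_load_sum(now, ..., 0, 0, 0): the
         * halted span merely ages the signal, as if the task were idle. */
        for (i = 0; i < halted; i++)
                util = step(util, 0, y);

        *halted_periods -= halted;
        return util;
}

Note that this shifts all halted time to the end of the window
regardless of when it actually occurred, which is why the patch
depends on frequent tick-driven updates and the decay_load() fix-up
for any residual, and why the commit message excludes nohz_full.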
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index b93c8c3dc05a..79b1166265bf 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -744,6 +744,8 @@ struct cfs_rq {
>         struct list_head        throttled_csd_list;
>  #endif /* CONFIG_CFS_BANDWIDTH */
>  #endif /* CONFIG_FAIR_GROUP_SCHED */
> +
> +       u64                     gtime_halted;
>  };
>
>  #ifdef CONFIG_SCHED_CLASS_EXT
> --
> 2.43.0
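To make the hierarchy remark concrete with one example (made-up
numbers; an illustration, not an exhaustive analysis): the clamp
"gtime_halted = min(delta, ...)" is applied against each entity's own
window since its last_update_time, so a sched entity and its cfs_rq
can discount different amounts of the same halted period and defer
different residuals:

#include <stdio.h>

static unsigned long long min_ull(unsigned long long a,
                                  unsigned long long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* 8ms of halted time charged to both levels by
         * sched_account_gtime_halted(). */
        unsigned long long halted_ns = 8000000;
        unsigned long long se_delta_ns = 4000000;   /* se updated 4ms ago */
        unsigned long long rq_delta_ns = 16000000;  /* cfs_rq, 16ms ago */

        unsigned long long se_now = min_ull(se_delta_ns, halted_ns);
        unsigned long long rq_now = min_ull(rq_delta_ns, halted_ns);

        printf("se:     discounts %llu ns, defers %llu ns\n",
               se_now, halted_ns - se_now);
        printf("cfs_rq: discounts %llu ns, defers %llu ns\n",
               rq_now, halted_ns - rq_now);
        return 0;
}

The deferred residuals are then aged by decay_load() with each
entity's own delta, so the child and parent signals stop being
consistent with each other.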