Hi Vincent,

On Fri, 29 Jun 2018 at 03:07, Vincent Guittot <vincent.guittot@xxxxxxxxxx> wrote:
>
> interrupt and steal time are the only remaining activities tracked by
> rt_avg. Like for the sched classes, we can use PELT to track their average
> utilization of the CPU. But unlike the sched classes, we don't track when
> entering/leaving interrupt; instead, we take into account the time spent
> under interrupt context when we update the rq's clock (rq_clock_task).
> This also means that we have to decay the normal context time and account
> for interrupt time during the update.
>
> It's also important to note that because
>     rq_clock == rq_clock_task + interrupt time
> and rq_clock_task is used by a sched class to compute its utilization, the
> util_avg of a sched class only reflects the utilization of the time spent
> in normal context and not of the whole time of the CPU. The utilization of
> interrupt gives a more accurate level of utilization of the CPU.
> The CPU utilization is:
>     avg_irq + (1 - avg_irq / max capacity) * /Sum avg_rq
>
> Most of the time, avg_irq is small and negligible, so the use of the
> approximation CPU utilization = /Sum avg_rq was enough.
>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Signed-off-by: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
> ---
>  kernel/sched/core.c  |  4 +++-
>  kernel/sched/fair.c  | 13 ++++++++++---
>  kernel/sched/pelt.c  | 40 ++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/pelt.h  | 16 ++++++++++++++++
>  kernel/sched/sched.h |  3 +++
>  5 files changed, 72 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 78d8fac..e5263a4 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -18,6 +18,8 @@
>  #include "../workqueue_internal.h"
>  #include "../smpboot.h"
>
> +#include "pelt.h"
> +
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/sched.h>
>
> @@ -186,7 +188,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
>
>  #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
>         if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
> -               sched_rt_avg_update(rq, irq_delta + steal);
> +               update_irq_load_avg(rq, irq_delta + steal);

I think we should not add steal time into irq load tracking. Steal time is
always 0 on a native kernel, so it doesn't matter there, but what happens
when a guest disables IRQ_TIME_ACCOUNTING and enables
PARAVIRT_TIME_ACCOUNTING? Steal time is not real irq utilization, so it
does not belong in avg_irq. In addition, we haven't exposed power
management for performance, which means that e.g. the schedutil governor
cannot cooperate with the passive-mode intel_pstate driver to tune the OPP.
Decaying the old steal time average and accruing the new one just wastes
CPU cycles.
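Just to illustrate what I mean, an untested sketch against this hunk (it
assumes irq_delta is still computed under CONFIG_IRQ_TIME_ACCOUNTING exactly
as update_rq_clock_task() does today; the #ifdef split is my assumption, not
something from this patch):

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
        /*
         * Sketch only: feed just the hard/soft irq time into the irq
         * PELT signal. Steal time is still subtracted from clock_task
         * above, but it no longer decays/accrues rq->avg_irq.
         */
        if (irq_delta && sched_feat(NONTASK_CAPACITY))
                update_irq_load_avg(rq, irq_delta);
#endif

That way a guest running with PARAVIRT_TIME_ACCOUNTING only would not pay
for the extra decay/accrue on every clock update.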
Regards,
Wanpeng Li

>  #endif
>  }
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ffce4b2..d2758e3 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -7289,7 +7289,7 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
>         return false;
>  }
>
> -static inline bool others_rqs_have_blocked(struct rq *rq)
> +static inline bool others_have_blocked(struct rq *rq)
>  {
>         if (READ_ONCE(rq->avg_rt.util_avg))
>                 return true;
> @@ -7297,6 +7297,11 @@ static inline bool others_rqs_have_blocked(struct rq *rq)
>         if (READ_ONCE(rq->avg_dl.util_avg))
>                 return true;
>
> +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
> +       if (READ_ONCE(rq->avg_irq.util_avg))
> +               return true;
> +#endif
> +
>         return false;
>  }
>
> @@ -7361,8 +7366,9 @@ static void update_blocked_averages(int cpu)
>         }
>         update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
>         update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
> +       update_irq_load_avg(rq, 0);
>         /* Don't need periodic decay once load/util_avg are null */
> -       if (others_rqs_have_blocked(rq))
> +       if (others_have_blocked(rq))
>                 done = false;
>
>  #ifdef CONFIG_NO_HZ_COMMON
> @@ -7431,9 +7437,10 @@ static inline void update_blocked_averages(int cpu)
>         update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
>         update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
>         update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
> +       update_irq_load_avg(rq, 0);
>  #ifdef CONFIG_NO_HZ_COMMON
>         rq->last_blocked_load_update_tick = jiffies;
> -       if (!cfs_rq_has_blocked(cfs_rq) && !others_rqs_have_blocked(rq))
> +       if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
>                 rq->has_blocked_load = 0;
>  #endif
>         rq_unlock_irqrestore(rq, &rf);
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index 8b78b63..ead6d8b 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -357,3 +357,43 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
>
>         return 0;
>  }
> +
> +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
> +/*
> + * irq:
> + *
> + *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
> + *   util_sum = cpu_scale * load_sum
> + *   runnable_load_sum = load_sum
> + *
> + */
> +
> +int update_irq_load_avg(struct rq *rq, u64 running)
> +{
> +       int ret = 0;
> +       /*
> +        * We know the time that has been used by interrupts since the last
> +        * update, but we don't know when. Let's be pessimistic and assume
> +        * that the interrupt has happened just before the update. This is
> +        * not so far from reality because the interrupt will most probably
> +        * wake up a task and trigger an update of the rq clock, during
> +        * which the metric is updated.
> +        * We start to decay with the normal context time and then we add
> +        * the interrupt context time.
> +        * We can safely remove running from rq->clock because
> +        * rq->clock += delta with delta >= running
> +        */
> +       ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
> +                               0,
> +                               0,
> +                               0);
> +       ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
> +                               1,
> +                               1,
> +                               1);
> +
> +       if (ret)
> +               ___update_load_avg(&rq->avg_irq, 1, 1);
> +
> +       return ret;
> +}
> +#endif
> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> index 0e4f912..d2894db 100644
> --- a/kernel/sched/pelt.h
> +++ b/kernel/sched/pelt.h
> @@ -6,6 +6,16 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
>  int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
>  int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
>
> +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
> +int update_irq_load_avg(struct rq *rq, u64 running);
> +#else
> +static inline int
> +update_irq_load_avg(struct rq *rq, u64 running)
> +{
> +       return 0;
> +}
> +#endif
> +
>  /*
>   * When a task is dequeued, its estimated utilization should not be update if
>   * its util_avg has not been updated at least once.
> @@ -51,6 +61,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
>  {
>         return 0;
>  }
> +
> +static inline int
> +update_irq_load_avg(struct rq *rq, u64 running)
> +{
> +       return 0;
> +}
>  #endif
>
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index ef5d6aa..377be2b 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -850,6 +850,9 @@ struct rq {
>         u64 age_stamp;
>         struct sched_avg avg_rt;
>         struct sched_avg avg_dl;
> +#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
> +       struct sched_avg avg_irq;
> +#endif
>         u64 idle_stamp;
>         u64 avg_idle;
>
> --
> 2.7.4
>