The following commit has been merged into the sched/core branch of tip:

Commit-ID:     74722bb223d0f236303b60c9509ff924a9713780
Gitweb:        https://git.kernel.org/tip/74722bb223d0f236303b60c9509ff924a9713780
Author:        Frederic Weisbecker <frederic@xxxxxxxxxx>
AuthorDate:    Thu, 21 Nov 2019 03:44:26 +01:00
Committer:     Ingo Molnar <mingo@xxxxxxxxxx>
CommitterDate: Thu, 21 Nov 2019 07:33:24 +01:00

sched/vtime: Bring up complete kcpustat accessor

Many callsites want to fetch the values of the system, user, user_nice,
guest or guest_nice kcpustat fields altogether, or at least a pair of
them.

In that case, calling kcpustat_field() for each requested field brings
unnecessary overhead when we could fetch all of them in one go.

So provide kcpustat_cpu_fetch(), which fetches the whole kcpustat array
in a vtime-safe way, under the same RCU and seqcount block.

Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Wanpeng Li <wanpengli@xxxxxxxxxxx>
Cc: Yauheni Kaliuta <yauheni.kaliuta@xxxxxxxxxx>
Link: https://lkml.kernel.org/r/20191121024430.19938-3-frederic@xxxxxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
 include/linux/kernel_stat.h |   7 ++-
 kernel/sched/cputime.c      | 136 +++++++++++++++++++++++++++++------
 2 files changed, 123 insertions(+), 20 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 7978119..89f0745 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -81,12 +81,19 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern u64 kcpustat_field(struct kernel_cpustat *kcpustat,
 			  enum cpu_usage_stat usage, int cpu);
+extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu);
 #else
 static inline u64 kcpustat_field(struct kernel_cpustat *kcpustat,
 				 enum cpu_usage_stat usage, int cpu)
 {
 	return kcpustat->cpustat[usage];
 }
+
+static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
+{
+	*dst = kcpustat_cpu(cpu);
+}
+
 #endif
 
 extern void account_user_time(struct task_struct *, u64);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 27b5406..d43318a 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -912,6 +912,30 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
 	} while (read_seqcount_retry(&vtime->seqcount, seq));
 }
 
+static int vtime_state_check(struct vtime *vtime, int cpu)
+{
+	/*
+	 * We raced against a context switch, fetch the
+	 * kcpustat task again.
+	 */
+	if (vtime->cpu != cpu && vtime->cpu != -1)
+		return -EAGAIN;
+
+	/*
+	 * Two possible things here:
+	 * 1) We are seeing the scheduling out task (prev) or any past one.
+	 * 2) We are seeing the scheduling in task (next) but it hasn't
+	 *    passed though vtime_task_switch() yet so the pending
+	 *    cputime of the prev task may not be flushed yet.
+	 *
+	 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
+	 */
+	if (vtime->state == VTIME_INACTIVE)
+		return -EAGAIN;
+
+	return 0;
+}
+
 static u64 kcpustat_user_vtime(struct vtime *vtime)
 {
 	if (vtime->state == VTIME_USER)
@@ -933,26 +957,9 @@ static int kcpustat_field_vtime(u64 *cpustat,
 	do {
 		seq = read_seqcount_begin(&vtime->seqcount);
 
-		/*
-		 * We raced against context switch, fetch the
-		 * kcpustat task again.
-		 */
-		if (vtime->cpu != cpu && vtime->cpu != -1)
-			return -EAGAIN;
-
-		/*
-		 * Two possible things here:
-		 * 1) We are seeing the scheduling out task (prev) or any past one.
-		 * 2) We are seeing the scheduling in task (next) but it hasn't
-		 *    passed though vtime_task_switch() yet so the pending
-		 *    cputime of the prev task may not be flushed yet.
-		 *
-		 * Case 1) is ok but 2) is not. So wait for a safe VTIME state.
-		 */
-		if (vtime->state == VTIME_INACTIVE)
-			return -EAGAIN;
-
-		err = 0;
+		err = vtime_state_check(vtime, cpu);
+		if (err < 0)
+			return err;
 
 		*val = cpustat[usage];
 
@@ -1025,4 +1032,93 @@ u64 kcpustat_field(struct kernel_cpustat *kcpustat,
 	}
 }
 EXPORT_SYMBOL_GPL(kcpustat_field);
+
+static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
+				    const struct kernel_cpustat *src,
+				    struct task_struct *tsk, int cpu)
+{
+	struct vtime *vtime = &tsk->vtime;
+	unsigned int seq;
+	int err;
+
+	do {
+		u64 *cpustat;
+		u64 delta;
+
+		seq = read_seqcount_begin(&vtime->seqcount);
+
+		err = vtime_state_check(vtime, cpu);
+		if (err < 0)
+			return err;
+
+		*dst = *src;
+		cpustat = dst->cpustat;
+
+		/* Task is sleeping, dead or idle, nothing to add */
+		if (vtime->state < VTIME_SYS)
+			continue;
+
+		delta = vtime_delta(vtime);
+
+		/*
+		 * Task runs either in user (including guest) or kernel space,
+		 * add pending nohz time to the right place.
+		 */
+		if (vtime->state == VTIME_SYS) {
+			cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
+		} else if (vtime->state == VTIME_USER) {
+			if (task_nice(tsk) > 0)
+				cpustat[CPUTIME_NICE] += vtime->utime + delta;
+			else
+				cpustat[CPUTIME_USER] += vtime->utime + delta;
+		} else {
+			WARN_ON_ONCE(vtime->state != VTIME_GUEST);
+			if (task_nice(tsk) > 0) {
+				cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
+				cpustat[CPUTIME_NICE] += vtime->gtime + delta;
+			} else {
+				cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
+				cpustat[CPUTIME_USER] += vtime->gtime + delta;
+			}
+		}
+	} while (read_seqcount_retry(&vtime->seqcount, seq));
+
+	return err;
+}
+
+void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
+{
+	const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
+	struct rq *rq;
+	int err;
+
+	if (!vtime_accounting_enabled_cpu(cpu)) {
+		*dst = *src;
+		return;
+	}
+
+	rq = cpu_rq(cpu);
+
+	for (;;) {
+		struct task_struct *curr;
+
+		rcu_read_lock();
+		curr = rcu_dereference(rq->curr);
+		if (WARN_ON_ONCE(!curr)) {
+			rcu_read_unlock();
+			*dst = *src;
+			return;
+		}
+
+		err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
+		rcu_read_unlock();
+
+		if (!err)
+			return;
+
+		cpu_relax();
+	}
+}
+EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
+
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
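
For illustration, a minimal caller sketch, not part of the patch above
(the helper name get_cpu_user_nice() is hypothetical): a caller needing
several fields at once can take one coherent snapshot instead of paying
the vtime-safe fetch cost, and risking a racing context switch, once
per kcpustat_field() call.

#include <linux/kernel_stat.h>

/*
 * Hypothetical example: sample the user and nice time of one CPU
 * from a single coherent snapshot. With two separate kcpustat_field()
 * calls, a context switch could land between the reads, so the two
 * values could come from different vtime states.
 */
static u64 get_cpu_user_nice(int cpu)
{
	struct kernel_cpustat kcpustat;
	u64 *cpustat = kcpustat.cpustat;

	/* One fetch of the whole array under the same RCU + seqcount block. */
	kcpustat_cpu_fetch(&kcpustat, cpu);

	return cpustat[CPUTIME_USER] + cpustat[CPUTIME_NICE];
}

When CONFIG_VIRT_CPU_ACCOUNTING_GEN is off, the static inline fallback
reduces the fetch to a plain copy of kcpustat_cpu(cpu), so such callers
pay no extra cost.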