On Wed, Jan 9, 2013 at 7:45 PM, Glauber Costa <glommer@xxxxxxxxxxxxx> wrote:
> From: Tejun Heo <tj@xxxxxxxxxx>
>
> cpuacct being on a separate hierarchy is one of the main cgroup
> related complaints from the scheduler side, and the consensus seems to be
>
> * Allowing cpuacct to be a separate controller was a mistake.  In
>   general, multiple controllers on the same type of resource should be
>   avoided, especially accounting-only ones.
>
> * Statistics provided by cpuacct are useful and should instead be
>   served by cpu.
>
> This patch makes cpu maintain and serve all cpuacct.* files and makes
> cgroup core ignore cpuacct if it's co-mounted with cpu.  This is a
> step in deprecating cpuacct.  The next patch will allow disabling or
> dropping cpuacct without affecting userland too much.
>
> Note that this creates some discrepancies in /proc/cgroups and
> /proc/PID/cgroup.  The co-mounted cpuacct won't be reflected correctly
> there.  cpuacct will eventually be removed completely, probably except
> for the statistics filenames, and I'd like to keep the amount of
> compatibility hackery to a minimum.
>
> The cpu statistics implementation isn't optimized in any way.  It's
> mostly a verbatim copy from cpuacct.  The goal is allowing quick
> disabling and removal of CONFIG_CGROUP_CPUACCT and creating a base on
> top of which cpu can implement proper optimization.
>
> [ glommer: don't call *_charge in stop_task.c ]
>
> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
> Signed-off-by: Glauber Costa <glommer@xxxxxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Cc: Michal Hocko <mhocko@xxxxxxx>
> Cc: Kay Sievers <kay.sievers@xxxxxxxx>
> Cc: Lennart Poettering <mzxreary@xxxxxxxxxxx>
> Cc: Dave Jones <davej@xxxxxxxxxx>
> Cc: Ben Hutchings <ben@xxxxxxxxxxxxxxx>
> Cc: Paul Turner <pjt@xxxxxxxxxx>
> ---
>  kernel/cgroup.c        |  13 ++++
>  kernel/sched/core.c    | 173 +++++++++++++++++++++++++++++++++++++++++++++++++
>  kernel/sched/cputime.c |  18 ++++-
>  kernel/sched/fair.c    |   1 +
>  kernel/sched/rt.c      |   1 +
>  kernel/sched/sched.h   |   7 ++
>  6 files changed, 212 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 2f98398..0750669d 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -1248,6 +1248,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
>  	/* Consistency checks */
>
>  	/*
> +	 * cpuacct is deprecated and cpu will serve the same stat files.
> +	 * If co-mount with cpu is requested, ignore cpuacct.  Note that
> +	 * this creates some discrepancies in /proc/cgroups and
> +	 * /proc/PID/cgroup.
> +	 *
> +	 * https://lkml.org/lkml/2012/9/13/542
> +	 */
> +#if IS_ENABLED(CONFIG_CGROUP_SCHED) && IS_ENABLED(CONFIG_CGROUP_CPUACCT)
> +	if ((opts->subsys_bits & (1 << cpu_cgroup_subsys_id)) &&
> +	    (opts->subsys_bits & (1 << cpuacct_subsys_id)))
> +		opts->subsys_bits &= ~(1 << cpuacct_subsys_id);
> +#endif
> +	/*
>  	 * Option noprefix was introduced just for backward compatibility
>  	 * with the old cpuset, so we allow noprefix only if mounting just
>  	 * the cpuset subsystem.
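
Just to make sure I read the masking right: with both controllers
requested, cpuacct is silently dropped and the mount succeeds with cpu
only, while cpuacct requested alone keeps working.  A standalone sketch
of that semantic, compilable in userspace (the names are stand-ins, not
the kernel symbols):

	#include <assert.h>

	#define CPU_BIT		(1UL << 0)	/* stand-in for cpu_cgroup_subsys_id */
	#define CPUACCT_BIT	(1UL << 1)	/* stand-in for cpuacct_subsys_id */

	static unsigned long mask_comounted(unsigned long bits)
	{
		/* mirrors the parse_cgroupfs_options() hunk above */
		if ((bits & CPU_BIT) && (bits & CPUACCT_BIT))
			bits &= ~CPUACCT_BIT;
		return bits;
	}

	int main(void)
	{
		/* co-mount: cpuacct is dropped, cpu is kept */
		assert(mask_comounted(CPU_BIT | CPUACCT_BIT) == CPU_BIT);
		/* cpuacct alone is left untouched */
		assert(mask_comounted(CPUACCT_BIT) == CPUACCT_BIT);
		return 0;
	}
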
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 257002c..6516694 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6811,6 +6811,7 @@ int in_sched_functions(unsigned long addr)
>  #ifdef CONFIG_CGROUP_SCHED
>  struct task_group root_task_group;
>  LIST_HEAD(task_groups);
> +static DEFINE_PER_CPU(u64, root_tg_cpuusage);
>  #endif
>
>  DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
> @@ -6869,6 +6870,8 @@ void __init sched_init(void)
>  #endif /* CONFIG_RT_GROUP_SCHED */
>
>  #ifdef CONFIG_CGROUP_SCHED
> +	root_task_group.cpustat = &kernel_cpustat;
> +	root_task_group.cpuusage = &root_tg_cpuusage;
>  	list_add(&root_task_group.list, &task_groups);
>  	INIT_LIST_HEAD(&root_task_group.children);
>  	INIT_LIST_HEAD(&root_task_group.siblings);
> @@ -7152,6 +7155,8 @@ static void free_sched_group(struct task_group *tg)
>  	free_fair_sched_group(tg);
>  	free_rt_sched_group(tg);
>  	autogroup_free(tg);
> +	free_percpu(tg->cpuusage);
> +	free_percpu(tg->cpustat);
>  	kfree(tg);
>  }
>
> @@ -7165,6 +7170,11 @@ struct task_group *sched_create_group(struct task_group *parent)
>  	if (!tg)
>  		return ERR_PTR(-ENOMEM);
>
> +	tg->cpuusage = alloc_percpu(u64);
> +	tg->cpustat = alloc_percpu(struct kernel_cpustat);
> +	if (!tg->cpuusage || !tg->cpustat)
> +		goto err;
> +
>  	if (!alloc_fair_sched_group(tg, parent))
>  		goto err;
>
> @@ -7256,6 +7266,24 @@ void sched_move_task(struct task_struct *tsk)
>
>  	task_rq_unlock(rq, tsk, &flags);
>  }
> +
> +void task_group_charge(struct task_struct *tsk, u64 cputime)
> +{
> +	struct task_group *tg;
> +	int cpu = task_cpu(tsk);
> +
> +	rcu_read_lock();
> +
> +	tg = container_of(task_subsys_state(tsk, cpu_cgroup_subsys_id),
> +			  struct task_group, css);
> +
> +	for (; tg; tg = tg->parent) {
> +		u64 *cpuusage = per_cpu_ptr(tg->cpuusage, cpu);
> +		*cpuusage += cputime;
> +	}
> +
> +	rcu_read_unlock();
> +}
>  #endif /* CONFIG_CGROUP_SCHED */
>
>  #if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
> @@ -7612,6 +7640,134 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
>  	sched_move_task(task);
>  }
>
> +static u64 task_group_cpuusage_read(struct task_group *tg, int cpu)
> +{
> +	u64 *cpuusage = per_cpu_ptr(tg->cpuusage, cpu);
> +	u64 data;
> +
> +#ifndef CONFIG_64BIT
> +	/*
> +	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
> +	 */
> +	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
> +	data = *cpuusage;
> +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
> +#else
> +	data = *cpuusage;
> +#endif
> +
> +	return data;
> +}
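
The rq->lock dance here (and in the write path just below) is only
about torn 64-bit accesses on 32-bit; the counter itself is updated
locklessly on the charge path.  Out of curiosity, would a relaxed
64-bit atomic be cheaper than bouncing rq->lock on the read side?
A userspace illustration of the idea with C11 atomics (just a sketch
of the alternative, not what the patch does):

	#include <stdatomic.h>
	#include <stdint.h>

	static _Atomic uint64_t usage;	/* stand-in for one per-cpu slot */

	static void charge(uint64_t delta)
	{
		/* relaxed suffices: we only need tear-free, not ordering */
		atomic_fetch_add_explicit(&usage, delta, memory_order_relaxed);
	}

	static uint64_t read_usage(void)
	{
		/* tear-free even on 32-bit targets, no lock required */
		return atomic_load_explicit(&usage, memory_order_relaxed);
	}
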
> +static void task_group_cpuusage_write(struct task_group *tg, int cpu, u64 val)
> +{
> +	u64 *cpuusage = per_cpu_ptr(tg->cpuusage, cpu);
> +
> +#ifndef CONFIG_64BIT
> +	/*
> +	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
> +	 */
> +	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
> +	*cpuusage = val;
> +	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
> +#else
> +	*cpuusage = val;
> +#endif
> +}
> +
> +/* return total cpu usage (in nanoseconds) of a group */
> +static u64 cpucg_cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
> +{
> +	struct task_group *tg;
> +	u64 totalcpuusage = 0;
> +	int i;
> +
> +	tg = container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
> +			  struct task_group, css);
> +
> +	for_each_present_cpu(i)
> +		totalcpuusage += task_group_cpuusage_read(tg, i);
> +
> +	return totalcpuusage;
> +}
> +
> +static int cpucg_cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
> +				u64 reset)
> +{
> +	struct task_group *tg;
> +	int err = 0;
> +	int i;
> +
> +	tg = container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
> +			  struct task_group, css);
> +
> +	if (reset) {
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	for_each_present_cpu(i)
> +		task_group_cpuusage_write(tg, i, 0);
> +
> +out:
> +	return err;
> +}
> +
> +static int cpucg_percpu_seq_read(struct cgroup *cgrp, struct cftype *cft,
> +				 struct seq_file *m)
> +{
> +	struct task_group *tg;
> +	u64 percpu;
> +	int i;
> +
> +	tg = container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
> +			  struct task_group, css);
> +
> +	for_each_present_cpu(i) {
> +		percpu = task_group_cpuusage_read(tg, i);
> +		seq_printf(m, "%llu ", (unsigned long long) percpu);
> +	}
> +	seq_printf(m, "\n");
> +	return 0;
> +}
> +
> +static const char *cpucg_stat_desc[] = {
> +	[CPUACCT_STAT_USER] = "user",
> +	[CPUACCT_STAT_SYSTEM] = "system",
> +};
> +
> +static int cpucg_stats_show(struct cgroup *cgrp, struct cftype *cft,
> +			    struct cgroup_map_cb *cb)
> +{
> +	struct task_group *tg;
> +	int cpu;
> +	s64 val = 0;
> +
> +	tg = container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
> +			  struct task_group, css);
> +
> +	for_each_online_cpu(cpu) {
> +		struct kernel_cpustat *kcpustat = per_cpu_ptr(tg->cpustat, cpu);
> +		val += kcpustat->cpustat[CPUTIME_USER];
> +		val += kcpustat->cpustat[CPUTIME_NICE];
> +	}
> +	val = cputime64_to_clock_t(val);
> +	cb->fill(cb, cpucg_stat_desc[CPUACCT_STAT_USER], val);
> +
> +	val = 0;
> +	for_each_online_cpu(cpu) {
> +		struct kernel_cpustat *kcpustat = per_cpu_ptr(tg->cpustat, cpu);
> +		val += kcpustat->cpustat[CPUTIME_SYSTEM];
> +		val += kcpustat->cpustat[CPUTIME_IRQ];
> +		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
> +	}
> +
> +	val = cputime64_to_clock_t(val);
> +	cb->fill(cb, cpucg_stat_desc[CPUACCT_STAT_SYSTEM], val);
> +
> +	return 0;
> +}
> +
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
>  				u64 shareval)
> @@ -7918,6 +8074,23 @@ static struct cftype cpu_files[] = {
>  		.write_u64 = cpu_rt_period_write_uint,
>  	},
>  #endif
> +	/* cpuacct.* which used to be served by a separate cpuacct controller */
> +	{
> +		.name = "cpuacct.usage",
> +		.flags = CFTYPE_NO_PREFIX,
> +		.read_u64 = cpucg_cpuusage_read,
> +		.write_u64 = cpucg_cpuusage_write,
> +	},
> +	{
> +		.name = "cpuacct.usage_percpu",
> +		.flags = CFTYPE_NO_PREFIX,
> +		.read_seq_string = cpucg_percpu_seq_read,
> +	},
> +	{
> +		.name = "cpuacct.stat",
> +		.flags = CFTYPE_NO_PREFIX,
> +		.read_map = cpucg_stats_show,
> +	},
>  	{ }	/* terminate */
>  };
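
One thing I noticed while reading this hunk: the usage files iterate
with for_each_present_cpu() while cpucg_stats_show() uses
for_each_online_cpu().  Presumably that's carried over verbatim from
cpuacct, but the inconsistency stands out.

For anyone following along, cpuacct.stat keeps the old "key value" map
format (user = USER + NICE, system = SYSTEM + IRQ + SOFTIRQ, both in
USER_HZ ticks).  A throwaway userspace reader for eyeballing it
(untested; the mount point is an assumption, it's wherever you mounted
cpu):

	#include <stdio.h>

	int main(void)
	{
		/* path assumes cpu is mounted at /sys/fs/cgroup/cpu */
		FILE *f = fopen("/sys/fs/cgroup/cpu/cpuacct.stat", "r");
		char key[32];
		unsigned long long ticks;

		if (!f) {
			perror("cpuacct.stat");
			return 1;
		}
		/* each line is "user <ticks>" or "system <ticks>" */
		while (fscanf(f, "%31s %llu", key, &ticks) == 2)
			printf("%s: %llu ticks (USER_HZ)\n", key, ticks);
		fclose(f);
		return 0;
	}
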
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index 293b202..a4fe53e 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -114,8 +114,10 @@ static int irqtime_account_si_update(void)
>  static inline void
>  task_group_account_field(struct task_struct *p, int index, u64 tmp)
>  {
> +#ifdef CONFIG_CGROUP_SCHED
> +	struct task_group *tg;
> +#endif
>  #ifdef CONFIG_CGROUP_CPUACCT
> -	struct kernel_cpustat *kcpustat;
>  	struct cpuacct *ca;
>  #endif
>  	/*
> @@ -125,7 +127,19 @@ static inline void task_group_account_field(struct task_struct *p, int index,
>  	 *
>  	 */
>  	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
> +#ifdef CONFIG_CGROUP_SCHED
> +	rcu_read_lock();
> +	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
> +			  struct task_group, css);
>
> +	while (tg && (tg != &root_task_group)) {
> +		struct kernel_cpustat *kcpustat = this_cpu_ptr(tg->cpustat);
> +
> +		kcpustat->cpustat[index] += tmp;
> +		tg = tg->parent;
> +	}
> +	rcu_read_unlock();
> +#endif
>  #ifdef CONFIG_CGROUP_CPUACCT
>  	if (unlikely(!cpuacct_subsys.active))
>  		return;
> @@ -133,6 +147,8 @@ static inline void task_group_account_field(struct task_struct *p, int index,
>  	rcu_read_lock();
>  	ca = task_ca(p);
>  	while (ca && (ca != &root_cpuacct)) {
> +		struct kernel_cpustat *kcpustat = this_cpu_ptr(ca->cpustat);
> +
>  		kcpustat = this_cpu_ptr(ca->cpustat);

Is this reassignment unnecessary?  The declaration added just above
already initializes kcpustat, so the old assignment looks like a
leftover; a possible fixup is sketched at the end of this mail.

>  		kcpustat->cpustat[index] += tmp;
>  		ca = parent_ca(ca);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5eea870..c15bc92 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -707,6 +707,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
>  		struct task_struct *curtask = task_of(curr);
>
>  		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
> +		task_group_charge(curtask, delta_exec);
>  		cpuacct_charge(curtask, delta_exec);
>  		account_group_exec_runtime(curtask, delta_exec);
>  	}
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index 418feb0..f7e05d87 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -935,6 +935,7 @@ static void update_curr_rt(struct rq *rq)
>  	account_group_exec_runtime(curr, delta_exec);
>
>  	curr->se.exec_start = rq->clock_task;
> +	task_group_charge(curr, delta_exec);
>  	cpuacct_charge(curr, delta_exec);
>
>  	sched_rt_avg_update(rq, delta_exec);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index fc88644..84a339d 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -104,6 +104,10 @@ struct cfs_bandwidth {
>  struct task_group {
>  	struct cgroup_subsys_state css;
>
> +	/* statistics */
> +	u64 __percpu *cpuusage;
> +	struct kernel_cpustat __percpu *cpustat;
> +
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	/* schedulable entities of this group on each cpu */
>  	struct sched_entity **se;
> @@ -590,6 +594,8 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
>  #endif
>  }
>
> +extern void task_group_charge(struct task_struct *tsk, u64 cputime);
> +
>  #else /* CONFIG_CGROUP_SCHED */
>
>  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
> @@ -597,6 +603,7 @@ static inline struct task_group *task_group(struct task_struct *p)
>  {
>  	return NULL;
>  }
> +static inline void task_group_charge(struct task_struct *tsk, u64 cputime) { }
>
>  #endif /* CONFIG_CGROUP_SCHED */
>
> --
> 1.7.11.7

--
Thanks,
Sha
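
P.S. The fixup I had in mind for the cputime.c hunk above, if the
extra assignment is indeed a leftover (untested; it only drops the
redundant line, everything else is as in the patch):

	rcu_read_lock();
	ca = task_ca(p);
	while (ca && (ca != &root_cpuacct)) {
		struct kernel_cpustat *kcpustat = this_cpu_ptr(ca->cpustat);

		kcpustat->cpustat[index] += tmp;
		ca = parent_ca(ca);
	}
	rcu_read_unlock();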