Re: [PATCH v8 04/16] sched/core: uclamp: Add system default clamps

Suren Baghdasaryan <surenb@xxxxxxxxxx> · Wed, 17 Apr 2019 17:51:02 -0700

On Tue, Apr 2, 2019 at 3:42 AM Patrick Bellasi <patrick.bellasi@xxxxxxx> wrote:
>
> Tasks without a user-defined clamp value are considered not clamped
> and by default their utilization can have any value in the
> [0..SCHED_CAPACITY_SCALE] range.
>
> Tasks with a user-defined clamp value are allowed to request any value
> in that range, and the required clamp is unconditionally enforced.
> However, a "System Management Software" could be interested in limiting
> the range of clamp values allowed for all tasks.
>
> Add a privileged interface to define a system default configuration via:
>
>   /proc/sys/kernel/sched_uclamp_util_{min,max}
>
> which works as an unconditional clamp range restriction for all tasks.
>
> With the default configuration, the full SCHED_CAPACITY_SCALE range of
> values is allowed for each clamp index. Otherwise, the task-specific
> clamp is capped by the corresponding system default value.
>
> Do that by tracking, for each task, the "effective" clamp value and
> bucket the task has been refcounted in at enqueue time. This
> allows lazy aggregation of the "requested" and "system default" values
> at enqueue time and simplifies refcounting updates at dequeue time.
>
> The cached bucket ids are used to avoid (relatively) more expensive
> integer divisions every time a task is enqueued.
>
> An active flag is used to report when the "effective" value is valid and
> thus the task is actually refcounted in the corresponding rq's bucket.
>
> Signed-off-by: Patrick Bellasi <patrick.bellasi@xxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
>
> ---
> Changes in v8:
>  Message-ID: <20190313201010.GU2482@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>
>  - add "requested" values uclamp_se instance beside the existing
>    "effective" values instance
>  - rename uclamp_effective_{get,assign}() into uclamp_eff_{get,set}()
>  - make uclamp_eff_get() return the new "effective" values by copy
> Message-ID: <20190318125844.ajhjpaqlcgxn7qkq@e110439-lin>
>  - run uclamp_fork() code independently from the class being supported.
>    Resetting active flag is not harmful and following patches will add
>    other code which still needs to be executed independently from class
>    support.
> Message-ID: <20190313201342.GV2482@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx>
>  - add sysctl_sched_uclamp_handler()'s internal mutex to serialize
>    concurrent usages
> ---
>  include/linux/sched.h        |  10 +++
>  include/linux/sched/sysctl.h |  11 +++
>  kernel/sched/core.c          | 131 ++++++++++++++++++++++++++++++++++-
>  kernel/sysctl.c              |  16 +++++
>  4 files changed, 167 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 0c0dd7aac8e9..d8491954e2e1 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -584,14 +584,21 @@ struct sched_dl_entity {
>   * Utilization clamp for a scheduling entity
>   * @value:             clamp value "assigned" to a se
>   * @bucket_id:         bucket index corresponding to the "assigned" value
> + * @active:            the se is currently refcounted in a rq's bucket
>   *
>   * The bucket_id is the index of the clamp bucket matching the clamp value
>   * which is pre-computed and stored to avoid expensive integer divisions from
>   * the fast path.
> + *
> + * The active bit is set whenever a task has got an "effective" value assigned,
> + * which can be different from the clamp value "requested" from user-space.
> + * This allows to know a task is refcounted in the rq's bucket corresponding
> + * to the "effective" bucket_id.
>   */
>  struct uclamp_se {
>         unsigned int value              : bits_per(SCHED_CAPACITY_SCALE);
>         unsigned int bucket_id          : bits_per(UCLAMP_BUCKETS);
> +       unsigned int active             : 1;
>  };
>  #endif /* CONFIG_UCLAMP_TASK */
>
> @@ -676,6 +683,9 @@ struct task_struct {
>         struct sched_dl_entity          dl;
>
>  #ifdef CONFIG_UCLAMP_TASK
> +       /* Clamp values requested for a scheduling entity */
> +       struct uclamp_se                uclamp_req[UCLAMP_CNT];
> +       /* Effective clamp values used for a scheduling entity */
>         struct uclamp_se                uclamp[UCLAMP_CNT];
>  #endif
>
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index 99ce6d728df7..d4f6215ee03f 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
>  extern unsigned int sysctl_sched_rt_period;
>  extern int sysctl_sched_rt_runtime;
>
> +#ifdef CONFIG_UCLAMP_TASK
> +extern unsigned int sysctl_sched_uclamp_util_min;
> +extern unsigned int sysctl_sched_uclamp_util_max;
> +#endif
> +
>  #ifdef CONFIG_CFS_BANDWIDTH
>  extern unsigned int sysctl_sched_cfs_bandwidth_slice;
>  #endif
> @@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write,
>                 void __user *buffer, size_t *lenp,
>                 loff_t *ppos);
>
> +#ifdef CONFIG_UCLAMP_TASK
> +extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
> +                                      void __user *buffer, size_t *lenp,
> +                                      loff_t *ppos);
> +#endif
> +
>  extern int sysctl_numa_balancing(struct ctl_table *table, int write,
>                                  void __user *buffer, size_t *lenp,
>                                  loff_t *ppos);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 046f61d33f00..d368ac26b8aa 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -733,6 +733,14 @@ static void set_load_weight(struct task_struct *p, bool update_load)
>  }
>
>  #ifdef CONFIG_UCLAMP_TASK
> +/* Max allowed minimum utilization */
> +unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
> +
> +/* Max allowed maximum utilization */
> +unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
> +
> +/* All clamps are required to be less or equal than these values */
> +static struct uclamp_se uclamp_default[UCLAMP_CNT];
>
>  /* Integer rounded range for each bucket */
>  #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
> @@ -801,6 +809,52 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
>         return uclamp_idle_value(rq, clamp_id, clamp_value);
>  }
>
> +/*
> + * The effective clamp bucket index of a task depends on, by increasing
> + * priority:
> + * - the task specific clamp value, when explicitly requested from userspace
> + * - the system default clamp value, defined by the sysadmin
> + */
> +static inline struct uclamp_se
> +uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
> +{
> +       struct uclamp_se uc_req = p->uclamp_req[clamp_id];
> +       struct uclamp_se uc_max = uclamp_default[clamp_id];
> +
> +       /* System default restrictions always apply */
> +       if (unlikely(uc_req.value > uc_max.value))
> +               return uc_max;
> +
> +       return uc_req;
> +}
> +
> +static inline unsigned int
> +uclamp_eff_bucket_id(struct task_struct *p, unsigned int clamp_id)

This function is not used anywhere, AFAICT. uclamp_eff_bucket_id() and
uclamp_eff_value() look very similar; maybe they could be combined into
one function returning a struct uclamp_se?

> +{
> +       struct uclamp_se uc_eff;
> +
> +       /* Task currently refcounted: use back-annotated (effective) bucket */
> +       if (p->uclamp[clamp_id].active)
> +               return p->uclamp[clamp_id].bucket_id;
> +
> +       uc_eff = uclamp_eff_get(p, clamp_id);
> +
> +       return uc_eff.bucket_id;
> +}
> +
> +unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
> +{
> +       struct uclamp_se uc_eff;
> +
> +       /* Task currently refcounted: use back-annotated (effective) value */
> +       if (p->uclamp[clamp_id].active)
> +               return p->uclamp[clamp_id].value;
> +
> +       uc_eff = uclamp_eff_get(p, clamp_id);
> +
> +       return uc_eff.value;
> +}
> +
>  /*
>   * When a task is enqueued on a rq, the clamp bucket currently defined by the
>   * task's uclamp::bucket_id is refcounted on that rq. This also immediately
> @@ -818,8 +872,12 @@ static inline void uclamp_rq_inc_id(struct task_struct *p, struct rq *rq,
>         struct uclamp_se *uc_se = &p->uclamp[clamp_id];
>         struct uclamp_bucket *bucket;
>
> +       /* Update task effective clamp */
> +       p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
> +
>         bucket = &uc_rq->bucket[uc_se->bucket_id];
>         bucket->tasks++;
> +       uc_se->active = true;
>
>         uclamp_idle_reset(rq, clamp_id, uc_se->value);
>
> @@ -856,6 +914,7 @@ static inline void uclamp_rq_dec_id(struct task_struct *p, struct rq *rq,
>         SCHED_WARN_ON(!bucket->tasks);
>         if (likely(bucket->tasks))
>                 bucket->tasks--;
> +       uc_se->active = false;
>
>         /*
>          * Keep "local max aggregation" simple and accept to (possibly)
> @@ -909,8 +968,69 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
>                 uclamp_rq_dec_id(p, rq, clamp_id);
>  }
>
> +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
> +                               void __user *buffer, size_t *lenp,
> +                               loff_t *ppos)
> +{
> +       int old_min, old_max;
> +       static DEFINE_MUTEX(mutex);
> +       int result;
> +
> +       mutex_lock(&mutex);
> +       old_min = sysctl_sched_uclamp_util_min;
> +       old_max = sysctl_sched_uclamp_util_max;
> +
> +       result = proc_dointvec(table, write, buffer, lenp, ppos);
> +       if (result)
> +               goto undo;
> +       if (!write)
> +               goto done;
> +
> +       if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
> +           sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
> +               result = -EINVAL;
> +               goto undo;
> +       }
> +
> +       if (old_min != sysctl_sched_uclamp_util_min) {
> +               uclamp_default[UCLAMP_MIN].value =
> +                       sysctl_sched_uclamp_util_min;
> +               uclamp_default[UCLAMP_MIN].bucket_id =
> +                       uclamp_bucket_id(sysctl_sched_uclamp_util_min);
> +       }
> +       if (old_max != sysctl_sched_uclamp_util_max) {
> +               uclamp_default[UCLAMP_MAX].value =
> +                       sysctl_sched_uclamp_util_max;
> +               uclamp_default[UCLAMP_MAX].bucket_id =
> +                       uclamp_bucket_id(sysctl_sched_uclamp_util_max);
> +       }
> +
> +       /*
> +        * Updating all the RUNNABLE task is expensive, keep it simple and do
> +        * just a lazy update at each next enqueue time.
> +        */
> +       goto done;
> +
> +undo:
> +       sysctl_sched_uclamp_util_min = old_min;
> +       sysctl_sched_uclamp_util_max = old_max;
> +done:
> +       mutex_unlock(&mutex);
> +
> +       return result;
> +}
> +
> +static void uclamp_fork(struct task_struct *p)
> +{
> +       unsigned int clamp_id;
> +
> +       for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
> +               p->uclamp[clamp_id].active = false;
> +}
> +
>  static void __init init_uclamp(void)
>  {
> +       struct uclamp_se uc_max = {};
>         unsigned int clamp_id;
>         int cpu;
>
> @@ -920,16 +1040,23 @@ static void __init init_uclamp(void)
>         }
>
>         for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id) {
> -               struct uclamp_se *uc_se = &init_task.uclamp[clamp_id];
> +               struct uclamp_se *uc_se = &init_task.uclamp_req[clamp_id];
>
>                 uc_se->value = uclamp_none(clamp_id);
>                 uc_se->bucket_id = uclamp_bucket_id(uc_se->value);
>         }
> +
> +       /* System defaults allow max clamp values for both indexes */
> +       uc_max.value = uclamp_none(UCLAMP_MAX);
> +       uc_max.bucket_id = uclamp_bucket_id(uc_max.value);
> +       for (clamp_id = 0; clamp_id < UCLAMP_CNT; ++clamp_id)
> +               uclamp_default[clamp_id] = uc_max;
>  }
>
>  #else /* CONFIG_UCLAMP_TASK */
>  static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
>  static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
> +static inline void uclamp_fork(struct task_struct *p) { }
>  static inline void init_uclamp(void) { }
>  #endif /* CONFIG_UCLAMP_TASK */
>
> @@ -2530,6 +2657,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
>          */
>         p->prio = current->normal_prio;
>
> +       uclamp_fork(p);
> +
>         /*
>          * Revert to default priority/policy on fork if requested.
>          */
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 987ae08147bf..72277f09887d 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -446,6 +446,22 @@ static struct ctl_table kern_table[] = {
>                 .mode           = 0644,
>                 .proc_handler   = sched_rr_handler,
>         },
> +#ifdef CONFIG_UCLAMP_TASK
> +       {
> +               .procname       = "sched_uclamp_util_min",
> +               .data           = &sysctl_sched_uclamp_util_min,
> +               .maxlen         = sizeof(unsigned int),
> +               .mode           = 0644,
> +               .proc_handler   = sysctl_sched_uclamp_handler,
> +       },
> +       {
> +               .procname       = "sched_uclamp_util_max",
> +               .data           = &sysctl_sched_uclamp_util_max,
> +               .maxlen         = sizeof(unsigned int),
> +               .mode           = 0644,
> +               .proc_handler   = sysctl_sched_uclamp_handler,
> +       },
> +#endif
>  #ifdef CONFIG_SCHED_AUTOGROUP
>         {
>                 .procname       = "sched_autogroup_enabled",
> --
> 2.20.1
>