Micro quanta is a lightweight scheduling class designed for microsecond level
scheduling intervals. It provides a more flexible way to share cores between
highly latency sensitive tasks and other tasks: limiting the cpu share of the
latency sensitive tasks while maintaining low scheduling latency at the same
time. The scheduling policy is similar to the period and runtime model of
real-time group scheduling. There is no group support, but more frequent
context switching is possible. Tasks running with a 2us scheduling period and
1us runtime have been demonstrated. The practical range of the microq
scheduling period is expected to be 10us ~ 1000us.

Cc: Paul Turner <pjt@xxxxxxxxxx>
Cc: Ben Segall <bsegall@xxxxxxxxxx>
Signed-off-by: Xi Wang <xii@xxxxxxxxxx>
---
 Documentation/scheduler/sched-microq.txt |  72 +++
 include/linux/sched.h                    |  12 +
 include/linux/sched/sysctl.h             |  12 +
 include/uapi/linux/sched.h               |   1 +
 init/Kconfig                             |  13 +
 kernel/sched/Makefile                    |   1 +
 kernel/sched/core.c                      | 138 ++++-
 kernel/sched/debug.c                     |  28 +
 kernel/sched/fair.c                      |   4 +-
 kernel/sched/microq.c                    | 750 +++++++++++++++++++++++
 kernel/sched/pelt.c                      |  30 +-
 kernel/sched/pelt.h                      |   6 +
 kernel/sched/rt.c                        |   6 +-
 kernel/sched/sched.h                     |  63 +-
 kernel/sysctl.c                          |  23 +
 15 files changed, 1135 insertions(+), 24 deletions(-)
 create mode 100644 Documentation/scheduler/sched-microq.txt
 create mode 100644 kernel/sched/microq.c

diff --git a/Documentation/scheduler/sched-microq.txt b/Documentation/scheduler/sched-microq.txt
new file mode 100644
index 000000000000..b3deefdc265f
--- /dev/null
+++ b/Documentation/scheduler/sched-microq.txt
@@ -0,0 +1,72 @@
+Micro Quanta Scheduling
+
+Micro quanta (microq) is a lightweight scheduling class for microsecond level
+scheduling intervals. It can simultaneously provide low scheduling latency for
+real time tasks while causing no starvation or excessive latency for normal
+tasks. It is a safe (avoiding many of the priority inversion problems, etc.)
+and high performance way to share a cpu.
+
+Main characteristics:
+
+ - A sample configuration is one microq task configured to run with 16us
+runtime and 20us period. Most of the time the microq task won't wait for more
+than 20 - 16 = 4us. kworker and ksoftirqd tasks under cfs can still get 4us
+every 20us.
+
+ - The scheduling policy between the microq class and the cfs class is weighted
+fair queuing with latency guarantees for microq.
+
+ - No priority among microq tasks. Multiple microq tasks on the same cpu will
+round-robin at the tick interval.
+
+ - The microq class is work conserving. If no other task is running on a cpu, a
+microq task can take all cpu cycles regardless of the runtime/period bandwidth
+allocation.
+
+ - Simple push load balancing only.
+
+ - No cgroup support.
+
+ - Driven by per cpu hrtimers.
+
+There are similarities between microq and rt group scheduling, and between
+microq and SCHED_DEADLINE. Below is a quick comparison:
+
+ - rt group scheduling uses both the tick and a global hrtimer for bandwidth
+control, which doesn't work well below the tick interval. If rt tasks are
+throttled on many cpus the global hrtimer becomes a bottleneck. SCHED_DEADLINE
+appears to have similar behavior. microq uses per cpu hrtimers.
+
+ - microq is based on fair queuing (somewhat equivalent to SCHED_DEADLINE
+with period == deadline), not fixed scheduling intervals. A blocked task can
+accumulate credit, similar to cfs or SCHED_DEADLINE but different from the
+fixed intervals of rt group scheduling.
+
+ - No priority among microq threads.
+
+ - microq is work conserving. rt group scheduling has fixed bandwidth
+allocation.
+
+ - Compared to SCHED_DEADLINE, microq is a lightweight scheduling class with a
+very limited feature set, but also with fewer restrictions, e.g. no admission
+control or cpu affinity requirements.
+
+
+Usage and expected behaviors:
+
+microq bandwidth can be controlled with either global or per task bandwidth
+parameters. If none of the microq tasks on a cpu specifies the bandwidth
+parameters, the global parameters take effect.
+
+When there are multiple microq tasks on a run queue, the bandwidth parameters
+of the task with the shortest period take effect. (Note the runtime in effect
+is not the sum over the microq tasks; it is exactly the runtime requested by
+one microq task.)
+
+When there are multiple microq tasks on the same cpu, they will round-robin at
+the tick interval. The microq scheduling period is likely shorter than the tick
+interval, thus the scheduler can switch between one microq task and a cfs task
+several times before switching to another microq task. The scheduling latency
+of a microq task is well shielded from cfs tasks but not from another microq
+task. Cooperative multitasking might be an option for some applications if
+multiple microq tasks on the same cpu are needed.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8dc1811487f5..3c741225d057 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -578,6 +578,15 @@ struct sched_dl_entity {
         struct hrtimer inactive_timer;
 };
 
+#ifdef CONFIG_SCHED_CLASS_MICROQ
+struct sched_microq_entity {
+        struct list_head run_list;
+        int sched_period;
+        int sched_runtime;
+        unsigned int time_slice;
+};
+#endif
+
 #ifdef CONFIG_UCLAMP_TASK
 /* Number of utilization clamp buckets (shorter alias) */
 #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
@@ -688,6 +697,9 @@ struct task_struct {
         const struct sched_class *sched_class;
         struct sched_entity se;
         struct sched_rt_entity rt;
+#ifdef CONFIG_SCHED_CLASS_MICROQ
+        struct sched_microq_entity microq;
+#endif
 #ifdef CONFIG_CGROUP_SCHED
         struct task_group *sched_task_group;
 #endif
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d4f6215ee03f..f1b8a0a23504 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -56,6 +56,13 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;
+#ifdef CONFIG_SCHED_CLASS_MICROQ
+/* micro quanta period and runtime */
+extern int sysctl_sched_microq_period;
+extern int sysctl_sched_microq_runtime;
+extern int sched_microq_timeslice;
+#endif
+
 #ifdef CONFIG_UCLAMP_TASK
 extern unsigned int sysctl_sched_uclamp_util_min;
 extern unsigned int sysctl_sched_uclamp_util_max;
@@ -101,4 +108,9 @@ extern int sched_energy_aware_handler(struct ctl_table *table, int write,
                                  loff_t *ppos);
 #endif
 
+#ifdef CONFIG_SCHED_CLASS_MICROQ
+extern int sched_microq_proc_handler(struct ctl_table *table, int write,
+                void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
 #endif /* _LINUX_SCHED_SYSCTL_H */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index b3105ac1381a..71b4af023f17 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -57,6 +57,7 @@ struct clone_args {
 /* SCHED_ISO: reserved but not implemented yet */
 #define SCHED_IDLE      5
 #define SCHED_DEADLINE  6
+#define SCHED_MICROQ    7
 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
 #define SCHED_RESET_ON_FORK     0x40000000
 
diff --git
a/init/Kconfig b/init/Kconfig index d3ad48272924..f544c2d6ded9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2126,6 +2126,19 @@ config PADATA depends on SMP bool +config SCHED_CLASS_MICROQ + bool "Micro Quanta scheduling class" + default y + help + Micro quanta is a lightweight scheduling class designed for + microsecond level scheduling intervals. It provides a more flexible + way to share cores between highly latency sensitive tasks with other + tasks - limiting the cpu share of latency sensitive tasks and + maintaining low scheduling latency at the same time. The scheduling + policy is similar to real-time group scheduling period and runtime. + There is no group support but more frequent context switching + is possible. + config ASN1 tristate help diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 21fb5a5662b5..76255ef8636c 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -30,3 +30,4 @@ obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o obj-$(CONFIG_PSI) += psi.o +obj-$(CONFIG_SCHED_CLASS_MICROQ) += microq.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa43ce3962e7..c8556169043c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1236,7 +1236,7 @@ static inline int normal_prio(struct task_struct *p) if (task_has_dl_policy(p)) prio = MAX_DL_PRIO-1; - else if (task_has_rt_policy(p)) + else if (task_has_rt_fiforr_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -4475,6 +4475,36 @@ static struct task_struct *find_process_by_pid(pid_t pid) */ #define SETPARAM_POLICY -1 +#ifdef CONFIG_SCHED_CLASS_MICROQ + +void microq_adjust_bandwidth(struct task_struct *p); + +static inline int microq_period_from_attr(const struct sched_attr *attr) +{ + int period = (unsigned int)attr->sched_priority >> 16; + /* ffff means undefined, otherwise convert from us to ns */ + return period == 0xffff ? MICROQ_BANDWIDTH_UNDEFINED : period * 1000; +} + +static inline int microq_runtime_from_attr(const struct sched_attr *attr) +{ + int runtime = (unsigned int)attr->sched_priority & 0xffff; + /* ffff means undefined, otherwise convert from us to ns */ + return runtime == 0xffff ? MICROQ_BANDWIDTH_UNDEFINED : runtime * 1000; +} + +static inline int microq_pack_bandwidth(const struct task_struct *p) +{ + int period = p->microq.sched_period; + int runtime = p->microq.sched_runtime; + + period = (period == -1) ? 0xffff : period / 1000; + runtime = (runtime == -1) ? 
0xffff : runtime / 1000; + return (period << 16) | runtime; +} + +#endif + static void __setscheduler_params(struct task_struct *p, const struct sched_attr *attr) { @@ -4485,6 +4515,17 @@ static void __setscheduler_params(struct task_struct *p, p->policy = policy; +#ifdef CONFIG_SCHED_CLASS_MICROQ + if (microq_policy(policy)) { + p->microq.sched_period = microq_period_from_attr(attr); + p->microq.sched_runtime = microq_runtime_from_attr(attr); + p->rt_priority = DEFAULT_MICROQ_RT_PRIORITY; + p->normal_prio = normal_prio(p); + set_load_weight(p, true); + return; + } +#endif + if (dl_policy(policy)) __setparam_dl(p, attr); else if (fair_policy(policy)) @@ -4521,6 +4562,13 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, if (keep_boost) p->prio = rt_effective_prio(p, p->prio); +#ifdef CONFIG_SCHED_CLASS_MICROQ + if (microq_policy(attr->sched_policy)) { + p->sched_class = µq_sched_class; + return; + } +#endif + if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; else if (rt_prio(p->prio)) @@ -4576,17 +4624,35 @@ static int __sched_setscheduler(struct task_struct *p, if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) return -EINVAL; - /* - * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, - * SCHED_BATCH and SCHED_IDLE is 0. - */ - if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) - return -EINVAL; - if ((dl_policy(policy) && !__checkparam_dl(attr)) || - (rt_policy(policy) != (attr->sched_priority != 0))) - return -EINVAL; +#ifdef CONFIG_SCHED_CLASS_MICROQ + if (microq_policy(policy)) { + int period = microq_period_from_attr(attr); + int runtime = microq_runtime_from_attr(attr); + + if (period == MICROQ_BANDWIDTH_UNDEFINED) { + if (runtime != MICROQ_BANDWIDTH_UNDEFINED) + return -EINVAL; + } else if (period < MICROQ_MIN_PERIOD) { + return -EINVAL; + } else if (runtime < MICROQ_MIN_RUNTIME && + runtime != MICROQ_BANDWIDTH_UNDEFINED) { + return -EINVAL; + } + } else +#endif + { + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. + */ + if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) + return -EINVAL; + if ((dl_policy(policy) && !__checkparam_dl(attr)) || + (rt_fiforr_policy(policy) != (attr->sched_priority != 0))) + return -EINVAL; + } /* * Allow unprivileged RT tasks to decrease priority: @@ -4598,7 +4664,7 @@ static int __sched_setscheduler(struct task_struct *p, return -EPERM; } - if (rt_policy(policy)) { + if (rt_fiforr_policy(policy)) { unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); @@ -4621,6 +4687,11 @@ static int __sched_setscheduler(struct task_struct *p, if (dl_policy(policy)) return -EPERM; +#ifdef CONFIG_SCHED_CLASS_MICROQ + if (microq_policy(policy)) + return -EPERM; +#endif + /* * Treat SCHED_IDLE as nice 20. Only allow a switch to * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 
@@ -4680,13 +4751,20 @@ static int __sched_setscheduler(struct task_struct *p, if (unlikely(policy == p->policy)) { if (fair_policy(policy) && attr->sched_nice != task_nice(p)) goto change; - if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + if (rt_fiforr_policy(policy) && attr->sched_priority != p->rt_priority) goto change; if (dl_policy(policy) && dl_param_changed(p, attr)) goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; +#ifdef CONFIG_SCHED_CLASS_MICROQ + if (microq_policy(policy)) { + __setscheduler_params(p, attr); + microq_adjust_bandwidth(p); + } +#endif + p->sched_reset_on_fork = reset_on_fork; task_rq_unlock(rq, p, &rf); return 0; @@ -4699,7 +4777,7 @@ static int __sched_setscheduler(struct task_struct *p, * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_bandwidth_enabled() && rt_policy(policy) && + if (rt_bandwidth_enabled() && rt_fiforr_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0 && !task_group_is_autogroup(task_group(p))) { task_rq_unlock(rq, p, &rf); @@ -4754,7 +4832,21 @@ static int __sched_setscheduler(struct task_struct *p, * itself. */ new_effective_prio = rt_effective_prio(p, newprio); +#ifdef CONFIG_SCHED_CLASS_MICROQ + /* + * A microq task overloads priorities and pretends to be a cfs task. We need + * a stronger check here. + * + * Priorities has no effect on microq tasks, but the microq class still needs to + * function as a priority inheritance passthrough, thus we cannot simply turn + * off all pi code. + */ + + if (!microq_policy(policy) && !microq_policy(oldpolicy) && + new_effective_prio == oldprio) +#else if (new_effective_prio == oldprio) +#endif queue_flags &= ~DEQUEUE_MOVE; } @@ -5084,8 +5176,13 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; - if (task_has_rt_policy(p)) + if (task_has_rt_fiforr_policy(p)) lp.sched_priority = p->rt_priority; +#ifdef CONFIG_SCHED_CLASS_MICROQ + else if (microq_policy(p->policy)) + lp.sched_priority = microq_pack_bandwidth(p); +#endif + rcu_read_unlock(); /* @@ -5171,8 +5268,12 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; if (task_has_dl_policy(p)) __getparam_dl(p, &attr); - else if (task_has_rt_policy(p)) + else if (task_has_rt_fiforr_policy(p)) attr.sched_priority = p->rt_priority; +#ifdef CONFIG_SCHED_CLASS_MICROQ + else if (microq_policy(p->policy)) + attr.sched_priority = microq_pack_bandwidth(p); +#endif else attr.sched_nice = task_nice(p); @@ -6441,6 +6542,9 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); +#ifdef CONFIG_SCHED_CLASS_MICROQ + init_microq_rq(&rq->microq); +#endif #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f7e4579e746c..9bfd4b106b7e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -611,6 +611,31 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) #undef PU } +#ifdef CONFIG_SCHED_CLASS_MICROQ +void print_microq_rq(struct seq_file *m, int cpu, struct microq_rq *microq_rq) +{ + SEQ_printf(m, "\nmicroq_rq[%d]:\n", cpu); + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(microq_rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(microq_rq->x)) + + P(microq_nr_running); + P(microq_throttled); + PN(microq_time); + 
PN(microq_target_time); + P(microq_runtime); + P(microq_period); + P(last_push_failed); + PN(quanta_start); + P(delta_exec_uncharged); + +#undef PN +#undef P +} +#endif + static void print_cpu(struct seq_file *m, int cpu) { struct rq *rq = cpu_rq(cpu); @@ -670,6 +695,9 @@ do { \ print_cfs_stats(m, cpu); print_rt_stats(m, cpu); print_dl_stats(m, cpu); +#ifdef CONFIG_SCHED_CLASS_MICROQ + print_microq_stats(m, cpu); +#endif print_rq(m, rq, cpu); spin_unlock_irqrestore(&sched_debug_lock, flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 036be95a87e9..31f3f058cec1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7556,7 +7556,7 @@ static void update_blocked_averages(int cpu) } curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class || curr_class == µq_sched_class); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); /* Don't need periodic decay once load/util_avg are null */ @@ -7626,7 +7626,7 @@ static inline void update_blocked_averages(int cpu) update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq); curr_class = rq->curr->sched_class; - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class || curr_class == µq_sched_class); update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); update_irq_load_avg(rq, 0); update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); diff --git a/kernel/sched/microq.c b/kernel/sched/microq.c new file mode 100644 index 000000000000..575499950f18 --- /dev/null +++ b/kernel/sched/microq.c @@ -0,0 +1,750 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Micro Quanta Scheduling Class + * + * Micro quanta is a lightweight scheduling class designed for microsecond + * level scheduling intervals. It provides a flexible way to share cores + * between latency sensitive tasks and other tasks - limiting the cpu share + * of latency sensitive tasks and maintaining low scheduling latency at the + * same time + * + * See sched-microq.txt for more details. + */ + +#include "sched.h" +#include "pelt.h" +#include <linux/slab.h> + +/* + * Global micro quanta period and runtime in ns. + * Can be overridden by per task parameters. + */ +int sysctl_sched_microq_period = 40000; +int sysctl_sched_microq_runtime = 20000; + +int sched_microq_timeslice = 1; + +static DEFINE_PER_CPU(struct callback_head, microq_push_head); + +static enum hrtimer_restart sched_microq_period_timer(struct hrtimer *timer); +static void __task_tick_microq(struct rq *rq, struct task_struct *p, int queued); +static void push_one_microq_task(struct rq *rq); +static void push_microq_tasks(struct rq *rq); + +static inline u64 u_saturation_sub(u64 a, u64 b) +{ + return a > b ? 
a - b : 0; +} + +static inline struct task_struct *microq_task_of(struct sched_microq_entity *microq_se) +{ + return container_of(microq_se, struct task_struct, microq); +} + +static inline struct rq *rq_of_microq_rq(struct microq_rq *microq_rq) +{ + return container_of(microq_rq, struct rq, microq); +} + +static inline struct microq_rq *microq_rq_of_se(struct sched_microq_entity *microq_se) +{ + struct task_struct *p = microq_task_of(microq_se); + struct rq *rq = task_rq(p); + + return &rq->microq; +} + +static inline int on_microq_rq(struct sched_microq_entity *microq_se) +{ + return !list_empty(µq_se->run_list); +} + +int microq_task(struct task_struct *p) +{ + return p->sched_class == µq_sched_class; +} + +/* + * microq bandwidth can be controlled with either global or per task bandwidth + * parameters. If none of the microq tasks on a cpu specifies the bandwidth + * parameters, global parameters take effect. + * + * When there are multiple microq tasks on a run queue, the bandwidth + * parameters of the task with shortest period takes effect. (Note runtime in + * effect is *not* the sum of microq tasks, it is exactly the runtime requested + * by one microq task.) + * + * See also sched-microq.txt + */ +static inline void get_microq_bandwidth(struct microq_rq *microq_rq, int *period, int *runtime) +{ + if (microq_rq->microq_period != MICROQ_BANDWIDTH_UNDEFINED) { + *period = microq_rq->microq_period; + *runtime = microq_rq->microq_runtime; + } else { + *period = READ_ONCE(sysctl_sched_microq_period); + *runtime = READ_ONCE(sysctl_sched_microq_runtime); + *period = max(MICROQ_MIN_PERIOD, *period); + if (*runtime != MICROQ_BANDWIDTH_UNDEFINED) /* if runtime != unlimited */ + *runtime = max(MICROQ_MIN_RUNTIME, *runtime); + } +} + +static inline int microq_timer_needed(struct microq_rq *microq_rq) +{ + struct rq *rq = rq_of_microq_rq(microq_rq); + int period, runtime; + + get_microq_bandwidth(microq_rq, &period, &runtime); + return runtime != MICROQ_BANDWIDTH_UNDEFINED && rq->microq.microq_nr_running && + rq->nr_running > rq->microq.microq_nr_running; +} + +void init_microq_rq(struct microq_rq *microq_rq) +{ + INIT_LIST_HEAD(µq_rq->tasks); + + microq_rq->microq_period = MICROQ_BANDWIDTH_UNDEFINED; + microq_rq->microq_runtime = MICROQ_BANDWIDTH_UNDEFINED; + microq_rq->microq_time = 0; + microq_rq->microq_target_time = 0; + microq_rq->microq_throttled = 0; + + hrtimer_init(µq_rq->microq_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + microq_rq->microq_period_timer.function = sched_microq_period_timer; + microq_rq->quanta_start = 0; + microq_rq->delta_exec_uncharged = 0; + microq_rq->delta_exec_total = 0; +} + +static void account_microq_bandwidth(struct rq *rq) +{ + s64 delta; + int period; + int runtime; + struct microq_rq *microq_rq = &rq->microq; + + delta = sched_clock_cpu(cpu_of(rq)) - microq_rq->quanta_start; + microq_rq->quanta_start += delta; + delta = max(0LL, delta); + + get_microq_bandwidth(microq_rq, &period, &runtime); + if (microq_task(rq->curr)) { + microq_rq->microq_time += delta; + microq_rq->delta_exec_uncharged += delta; + } + microq_rq->delta_exec_total += delta; + microq_rq->microq_target_time += delta*runtime/period; + microq_rq->microq_time = clamp(microq_rq->microq_time, + u_saturation_sub(microq_rq->microq_target_time, period*2), + microq_rq->microq_target_time + period*2); +} + +static void check_microq_timer(struct rq *rq) +{ + struct microq_rq *microq_rq = &rq->microq; + int period, runtime, expire; + + if (hrtimer_active(µq_rq->microq_period_timer)) + return; 
+ + get_microq_bandwidth(microq_rq, &period, &runtime); + account_microq_bandwidth(rq); + if (microq_rq->microq_time < microq_rq->microq_target_time) { + microq_rq->microq_throttled = 0; + expire = microq_rq->microq_target_time - microq_rq->microq_time; + expire = max(runtime, expire); + } else { + microq_rq->microq_throttled = 1; + expire = microq_rq->microq_time - microq_rq->microq_target_time; + expire = max(expire, period - runtime); + } + microq_rq->period_count = 0; + microq_rq->periods_to_jiffies = (jiffies_to_usecs(1) * 1000) / period; + microq_rq->periods_to_jiffies = max(1U, microq_rq->periods_to_jiffies); + + hrtimer_start_range_ns(µq_rq->microq_period_timer, ns_to_ktime(expire), 0, + HRTIMER_MODE_REL_PINNED); +} + +void microq_adjust_bandwidth(struct task_struct *p) +{ + struct microq_rq *microq_rq = &task_rq(p)->microq; + struct sched_microq_entity *microq_se; + + microq_rq->microq_period = MICROQ_BANDWIDTH_UNDEFINED; + microq_rq->microq_runtime = MICROQ_BANDWIDTH_UNDEFINED; + list_for_each_entry(microq_se, µq_rq->tasks, run_list) { + if (microq_se->sched_period >= 0 && + microq_se->sched_runtime >= MICROQ_BANDWIDTH_UNDEFINED && + (microq_rq->microq_period == MICROQ_BANDWIDTH_UNDEFINED || + microq_se->sched_period <= microq_rq->microq_period)) { + microq_rq->microq_period = microq_se->sched_period; + microq_rq->microq_runtime = microq_se->sched_runtime; + } + } + +} + +static void microq_update_load_avg_ratio(struct rq *rq) +{ + struct microq_rq *microq_rq = &rq->microq; + int period; + int runtime; + u64 contrib_ratio; + + /* + * For cpu_power accounting (SCHED_POWER_SCALE) + * + * Note micro's contribution to rt_avg is capped at elapsed_time*runtime/period for + * the desired load balancing behavior. We allow microq to go over its bandwidth limit + * when there are free cpu cycles. However if we don't cap the rt_avg contribution + * the cpu running microq can report cpu_power of 0, such that no cfs task can be + * moved to it by load balancer. 
+ */ + get_microq_bandwidth(microq_rq, &period, &runtime); + if (microq_rq->delta_exec_uncharged * period > microq_rq->delta_exec_total * runtime) + contrib_ratio = runtime * SCHED_FIXEDPOINT_SCALE / period; + + else + contrib_ratio = SCHED_FIXEDPOINT_SCALE; + update_rt_rq_load_avg_ratio(rq_clock_pelt(rq), contrib_ratio, rq, 1); +} + +static void update_curr_microq(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct microq_rq *microq_rq = &rq->microq; + + if (!microq_task(curr)) + return; + + account_microq_bandwidth(rq); + + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, microq_rq->delta_exec_uncharged)); + + curr->se.sum_exec_runtime += microq_rq->delta_exec_uncharged; + account_group_exec_runtime(curr, microq_rq->delta_exec_uncharged); + + curr->se.exec_start = rq_clock_task(rq); + cpuacct_charge(curr, microq_rq->delta_exec_uncharged); + + microq_update_load_avg_ratio(rq), + + microq_rq->delta_exec_uncharged = 0; + microq_rq->delta_exec_total = 0; +} + +static enum hrtimer_restart sched_microq_period_timer(struct hrtimer *timer) +{ + int period; + int runtime; + struct microq_rq *microq_rq; + struct rq *rq; + ktime_t nextslice = {0}; + u64 ns; + + microq_rq = container_of(timer, struct microq_rq, microq_period_timer); + rq = rq_of_microq_rq(microq_rq); + get_microq_bandwidth(microq_rq, &period, &runtime); + nextslice = ns_to_ktime(period); + + raw_spin_lock(&rq->lock); + + account_microq_bandwidth(rq); + if (microq_timer_needed(microq_rq)) { + if (microq_rq->microq_throttled) { + microq_rq->microq_throttled = 0; + ns = u_saturation_sub(microq_rq->microq_target_time, + microq_rq->microq_time); + nextslice = ns_to_ktime(max(ns, (u64)runtime)); + } else { + microq_rq->microq_throttled = 1; + ns = u_saturation_sub(microq_rq->microq_time, + microq_rq->microq_target_time); + /* + * The exact time lower class tasks need to run to maintain the bandwidth + * ratio + */ + ns = ns*period/runtime; + /* + * Don't want to make the next time slice too short for lower class tasks or + * microq tasks. Also don't want the time slice for lower class tasks to + * last too long as microq tasks are latency sensitive. The time slice is + * not limited in this function for microq tasks (but difference between + * microq_time and microq_target_time is clamped). 
+ */ + ns = clamp_val(ns, u_saturation_sub(period, runtime), + u_saturation_sub(period, runtime/2)); + nextslice = ns_to_ktime(ns); + + if (++microq_rq->period_count >= microq_rq->periods_to_jiffies) { + microq_rq->period_count = 0; + microq_rq->periods_to_jiffies = (jiffies_to_usecs(1) * 1000) / period; + microq_rq->periods_to_jiffies = max(1U, microq_rq->periods_to_jiffies); + update_rq_clock(rq); + __task_tick_microq(rq, rq->curr, 0); + } + } + + resched_curr(rq); + hrtimer_set_expires(timer, + ktime_add(hrtimer_cb_get_time(µq_rq->microq_period_timer), nextslice)); + raw_spin_unlock(&rq->lock); + return HRTIMER_RESTART; + } else { + microq_rq->microq_throttled = 0; + raw_spin_unlock(&rq->lock); + return HRTIMER_NORESTART; + } +} + +static void dequeue_microq_entity(struct sched_microq_entity *microq_se) +{ + struct microq_rq *microq_rq = microq_rq_of_se(microq_se); + + list_del_init(µq_se->run_list); + BUG_ON(!microq_rq->microq_nr_running); + microq_rq->microq_nr_running--; +} + +static void enqueue_microq_entity(struct sched_microq_entity *microq_se) +{ + struct microq_rq *microq_rq = microq_rq_of_se(microq_se); + + list_add_tail(µq_se->run_list, µq_rq->tasks); + microq_rq->microq_nr_running++; + microq_rq->last_push_failed = 0; +} + +static void +enqueue_task_microq(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_microq_entity *microq_se = &p->microq; + struct microq_rq *microq_rq = microq_rq_of_se(microq_se); + + /* + * microq keeps uncharged runtime stats internally. Need to mark the end of non mircoq task + * execution here so their runtime won't be counted as microq. + */ + account_microq_bandwidth(rq); + + enqueue_microq_entity(microq_se); + add_nr_running(rq, 1); + + /* The parameters of the task with shortest period takes effect, see get_microq_bandwidth */ + if (microq_se->sched_period >= 0 && + microq_se->sched_runtime >= MICROQ_BANDWIDTH_UNDEFINED && + (microq_rq->microq_period == MICROQ_BANDWIDTH_UNDEFINED || + microq_se->sched_period <= microq_rq->microq_period)) { + microq_rq->microq_period = microq_se->sched_period; + microq_rq->microq_runtime = microq_se->sched_runtime; + } +} + +static void dequeue_task_microq(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_microq_entity *microq_se = &p->microq; + + update_curr_microq(rq); + dequeue_microq_entity(microq_se); + sub_nr_running(rq, 1); + + microq_adjust_bandwidth(p); +} + +/* + * Put task to the head or the end of the run list without the overhead of + * dequeue followed by enqueue. 
+ */ +static void +requeue_microq_entity(struct microq_rq *microq_rq, struct sched_microq_entity *microq_se) +{ + if (on_microq_rq(microq_se)) + list_move_tail(µq_se->run_list, µq_rq->tasks); +} + +static void requeue_task_microq(struct rq *rq, struct task_struct *p) +{ + struct sched_microq_entity *microq_se = &p->microq; + struct microq_rq *microq_rq; + + microq_rq = microq_rq_of_se(microq_se); + requeue_microq_entity(microq_rq, microq_se); +} + +static void yield_task_microq(struct rq *rq) +{ + requeue_task_microq(rq, rq->curr); +} + +static void check_preempt_curr_microq(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static struct task_struct *pick_next_task_microq(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf) +{ + struct sched_microq_entity *microq_se; + struct task_struct *p; + struct microq_rq *microq_rq = &rq->microq; + + if (!microq_rq->microq_nr_running) + return NULL; + + if (microq_timer_needed(microq_rq)) { + check_microq_timer(rq); + if (microq_rq->microq_throttled) + return NULL; + } else { + microq_rq->microq_throttled = 0; + } + + put_prev_task(rq, prev); + + microq_se = list_entry(microq_rq->tasks.next, struct sched_microq_entity, run_list); + BUG_ON(!microq_se); + + p = microq_task_of(microq_se); + +#ifdef CONFIG_SMP + if (rq->microq.microq_nr_running > 1 && !rq->microq.last_push_failed) + queue_balance_callback(rq, &per_cpu(microq_push_head, rq->cpu), + push_one_microq_task); +#endif + + if (rq->curr->sched_class != &rt_sched_class && rq->curr->sched_class != µq_sched_class) + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + return p; +} + +static void put_prev_task_microq(struct rq *rq, struct task_struct *p) +{ + microq_update_load_avg_ratio(rq); +} + +#ifdef CONFIG_SMP + +static int microq_find_rq(struct task_struct *p) +{ + int pcpu = task_cpu(p); + int cpu, tcpu; + struct rq *rq; + int idle_ht = -1; + int lowprio_cpu = -1; + int low_nmicroq_cpu = -1; + unsigned int low_nmicroq = task_rq(p)->microq.microq_nr_running; + + if (!cpumask_test_cpu(pcpu, p->cpus_ptr)) + pcpu = cpumask_first(p->cpus_ptr); + + for (cpu = pcpu;;) { + /* search from current cpu to avoid crowding lower numbered cpus */ + cpu = cpumask_next(cpu, p->cpus_ptr); + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(p->cpus_ptr); + if (cpu == pcpu) + break; + + for_each_cpu(tcpu, topology_sibling_cpumask(cpu)) { + if (!available_idle_cpu(tcpu)) + break; + } + if (tcpu >= nr_cpu_ids) + return cpu; + + if (idle_ht == -1) { + rq = cpu_rq(cpu); + if (idle_cpu(cpu)) { + idle_ht = cpu; + } else if (lowprio_cpu == -1) { + if (rq->nr_running == rq->cfs.h_nr_running) { + lowprio_cpu = cpu; + } else if (rq->microq.microq_nr_running + 1 < low_nmicroq) { + low_nmicroq_cpu = cpu; + low_nmicroq = rq->microq.microq_nr_running; + } + } + } + } + if (idle_ht != -1) + return idle_ht; + if (lowprio_cpu != -1) + return lowprio_cpu; + return low_nmicroq_cpu; +} + +/* Will lock the rq it finds */ +static struct rq *microq_find_potential_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *t_rq = NULL; + int cpu; + + cpu = microq_find_rq(task); + + if (cpu == -1) + return NULL; + + t_rq = cpu_rq(cpu); + + if (double_lock_balance(rq, t_rq)) { + /* + * We had to unlock the run queue. In + * the mean time, task could have + * migrated already or had its affinity changed. + * Also make sure that it wasn't scheduled on its rq. 
+ */ + if (unlikely(task_rq(task) != rq || + !cpumask_test_cpu(t_rq->cpu, + task->cpus_ptr) || + task_running(rq, task) || + !task->on_rq)) { + + double_unlock_balance(rq, t_rq); + t_rq = NULL; + } + } + + return t_rq; +} + +/* + * If the current CPU has more than one microq task, see if a non + * running task can be kicked out. + */ +static int __push_microq_task(struct rq *rq) +{ + struct microq_rq *microq_rq = &rq->microq; + struct task_struct *next_task = NULL; + struct sched_microq_entity *microq_se; + struct rq *t_rq; + int ret = 0; + + if (microq_rq->microq_nr_running <= 1 || microq_rq->last_push_failed) + return 0; + + list_for_each_entry(microq_se, &rq->microq.tasks, run_list) { + struct task_struct *p = microq_task_of(microq_se); + + if (!task_running(rq, p) && p->nr_cpus_allowed > 1) { + next_task = p; + break; + } + } + if (!next_task) + goto out; + + BUG_ON(rq->cpu != task_cpu(next_task)); + BUG_ON(!next_task->on_rq); + + /* We might release rq lock */ + get_task_struct(next_task); + + /* find_lock_rq locks the rq if found */ + t_rq = microq_find_potential_rq(next_task, rq); + + if (t_rq) { + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, t_rq->cpu); + activate_task(t_rq, next_task, 0); + resched_curr(t_rq); + double_unlock_balance(rq, t_rq); + ret = 1; + } + + put_task_struct(next_task); + +out: + if (!ret) + microq_rq->last_push_failed = 1; + return ret; +} + +static void push_one_microq_task(struct rq *rq) +{ + __push_microq_task(rq); +} + +static void push_microq_tasks(struct rq *rq) +{ + while (__push_microq_task(rq)) {} +} + +/* + * If we are not running and we are not going to reschedule soon, we should + * try to push tasks away now + */ +static void task_woken_microq(struct rq *rq, struct task_struct *p) +{ + if (!test_tsk_need_resched(rq->curr) && + p->nr_cpus_allowed > 1 && + rq->microq.microq_nr_running) + push_microq_tasks(rq); +} + +/* Assumes rq->lock is held */ +static void rq_online_microq(struct rq *rq) +{ + struct microq_rq *microq_rq = &rq->microq; + + microq_rq->microq_period = MICROQ_BANDWIDTH_UNDEFINED; + microq_rq->microq_runtime = MICROQ_BANDWIDTH_UNDEFINED; + microq_rq->microq_time = 0; + microq_rq->microq_target_time = 0; + microq_rq->microq_throttled = 0; + microq_rq->quanta_start = 0; + microq_rq->delta_exec_uncharged = 0; + microq_rq->delta_exec_total = 0; +} + +static int select_task_rq_microq(struct task_struct *p, int cpu, int sd_flag, int flags) +{ + struct task_struct *curr; + struct rq *rq; + + if (p->nr_cpus_allowed == 1) + goto out; + + /* For anything but wake ups, just return the task_cpu */ + if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + goto out; + + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = READ_ONCE(rq->curr); /* unlocked access */ + + /* Try to push the incoming microq task if the current rq is busy */ + if (unlikely(rq->microq.microq_nr_running) && (p->nr_cpus_allowed > 1)) { + int target = microq_find_rq(p); + + if (target != -1) + cpu = target; + } + rcu_read_unlock(); + +out: + return cpu; +} + +#endif /* CONFIG_SMP */ + +static void switched_to_microq(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + if (p->on_rq && rq->curr != p) { + rq->microq.last_push_failed = 0; + if (rq->microq.microq_nr_running > 1) + queue_balance_callback(rq, &per_cpu(microq_push_head, rq->cpu), + push_one_microq_task); + } +#endif +} + +static void prio_changed_microq(struct rq *rq, struct task_struct *p, int oldprio) +{ +} + +static void __task_tick_microq(struct rq *rq, struct task_struct *p, int 
queued) +{ + struct sched_microq_entity *microq_se = &p->microq; + + if (microq_timer_needed(&rq->microq)) + check_microq_timer(rq); + + update_curr_microq(rq); + + if (p->microq.time_slice--) + return; + p->microq.time_slice = sched_microq_timeslice; + if (microq_se->run_list.prev != microq_se->run_list.next) { + requeue_task_microq(rq, p); + resched_curr(rq); + return; + } +} + +static void task_tick_microq(struct rq *rq, struct task_struct *p, int queued) +{ + + if (hrtimer_active(&rq->microq.microq_period_timer)) + return; + + __task_tick_microq(rq, p, queued); +} + +static void set_curr_task_microq(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq_clock_task(rq); +} + +static unsigned int get_rr_interval_microq(struct rq *rq, struct task_struct *task) +{ + return sched_microq_timeslice; +} + +const struct sched_class microq_sched_class = { + .next = &fair_sched_class, + .enqueue_task = enqueue_task_microq, + .dequeue_task = dequeue_task_microq, + .yield_task = yield_task_microq, + .check_preempt_curr = check_preempt_curr_microq, + .pick_next_task = pick_next_task_microq, + .put_prev_task = put_prev_task_microq, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_microq, + .set_cpus_allowed = set_cpus_allowed_common, + .rq_online = rq_online_microq, + .task_woken = task_woken_microq, +#endif + .set_curr_task = set_curr_task_microq, + .task_tick = task_tick_microq, + .get_rr_interval = get_rr_interval_microq, + .prio_changed = prio_changed_microq, + .switched_to = switched_to_microq, + .update_curr = update_curr_microq, +}; + +int sched_microq_proc_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_microq_period; + old_runtime = sysctl_sched_microq_runtime; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (!ret && write) { + if (sysctl_sched_microq_period < MICROQ_MIN_PERIOD) { + sysctl_sched_microq_period = old_period; + ret = -EINVAL; + } + if (sysctl_sched_microq_runtime < MICROQ_MIN_RUNTIME && + sysctl_sched_microq_runtime != MICROQ_BANDWIDTH_UNDEFINED) { + sysctl_sched_microq_runtime = old_runtime; + ret = -EINVAL; + } + } + + mutex_unlock(&mutex); + return ret; +} + +#ifdef CONFIG_SCHED_DEBUG +extern void print_microq_rq(struct seq_file *m, int cpu, struct microq_rq *microq_rq); + +void print_microq_stats(struct seq_file *m, int cpu) +{ + rcu_read_lock(); + print_microq_rq(m, cpu, &cpu_rq(cpu)->microq); + rcu_read_unlock(); +} +#endif /* CONFIG_SCHED_DEBUG */ diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index a96db50d40e0..486bf7e6ee31 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -107,7 +107,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) * n=1 */ static __always_inline u32 -accumulate_sum(u64 delta, struct sched_avg *sa, +accumulate_sum(u64 delta, u64 contrib_ratio, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ @@ -134,6 +134,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, } sa->period_contrib = delta; + contrib = contrib * contrib_ratio / SCHED_FIXEDPOINT_SCALE; if (load) sa->load_sum += load * contrib; if (runnable) @@ -173,7 +174,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa, * = u_0 + u_1*y + u_2*y^2 + ... 
[re-labeling u_i --> u_{i+1}] */ static __always_inline int -___update_load_sum(u64 now, struct sched_avg *sa, +___update_load_sum_ratio(u64 now, u64 contrib_ratio, struct sched_avg *sa, unsigned long load, unsigned long runnable, int running) { u64 delta; @@ -217,12 +218,20 @@ ___update_load_sum(u64 now, struct sched_avg *sa, * Step 1: accumulate *_sum since last_update_time. If we haven't * crossed period boundaries, finish. */ - if (!accumulate_sum(delta, sa, load, runnable, running)) + if (!accumulate_sum(delta, contrib_ratio, sa, load, runnable, running)) return 0; return 1; } +static __always_inline int +___update_load_sum(u64 now, struct sched_avg *sa, + unsigned long load, unsigned long runnable, int running) +{ + return ___update_load_sum_ratio(now, SCHED_FIXEDPOINT_SCALE, sa, load, runnable, running); +} + + static __always_inline void ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable) { @@ -329,6 +338,21 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +int update_rt_rq_load_avg_ratio(u64 now, u64 contrib_ratio, struct rq *rq, int running) +{ + if (___update_load_sum_ratio(now, contrib_ratio, &rq->avg_rt, + running, + running, + running)) { + + ___update_load_avg(&rq->avg_rt, 1, 1); + trace_pelt_rt_tp(rq); + return 1; + } + + return 0; +} + /* * dl_rq: * diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index afff644da065..52840dbb02ab 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -5,6 +5,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); +int update_rt_rq_load_avg_ratio(u64 now, u64 contrib_ratio, struct rq *rq, int running); int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); #ifdef CONFIG_HAVE_SCHED_AVG_IRQ @@ -152,6 +153,11 @@ update_rt_rq_load_avg(u64 now, struct rq *rq, int running) return 0; } +int update_rt_rq_load_avg_ratio(u64 now, u64 contrib_ratio, struct rq *rq, int running) +{ + return 0; +} + static inline int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a532558a5176..a50afb692ce1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1586,7 +1586,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * utilization. 
We only care of the case where we start to schedule a * rt task */ - if (rq->curr->sched_class != &rt_sched_class) + if (rq->curr->sched_class != &rt_sched_class && rq->curr->sched_class != µq_sched_class) update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); return p; @@ -2371,7 +2371,11 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) } const struct sched_class rt_sched_class = { +#ifdef CONFIG_SCHED_CLASS_MICROQ + .next = µq_sched_class, +#else .next = &fair_sched_class, +#endif .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 802b1f3405f2..7e3beb49ad39 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -159,7 +159,23 @@ static inline int fair_policy(int policy) static inline int rt_policy(int policy) { - return policy == SCHED_FIFO || policy == SCHED_RR; + if (policy == SCHED_FIFO || policy == SCHED_RR || policy == SCHED_MICROQ) + return 1; + return 0; +} + +static inline int rt_fiforr_policy(int policy) +{ + if (policy == SCHED_FIFO || policy == SCHED_RR) + return 1; + return 0; +} + +static inline int microq_policy(int policy) +{ + if (policy == SCHED_MICROQ) + return 1; + return 0; } static inline int dl_policy(int policy) @@ -182,6 +198,11 @@ static inline int task_has_rt_policy(struct task_struct *p) return rt_policy(p->policy); } +static inline int task_has_rt_fiforr_policy(struct task_struct *p) +{ + return rt_fiforr_policy(p->policy); +} + static inline int task_has_dl_policy(struct task_struct *p) { return dl_policy(p->policy); @@ -685,6 +706,37 @@ struct dl_rq { #define entity_is_task(se) 1 #endif +#ifdef CONFIG_SCHED_CLASS_MICROQ + +/* + * If combined with a valid period, runtime == MICROQ_BANDWIDTH_UNDEFINED also + * indicates unlimited runtime + */ +#define MICROQ_BANDWIDTH_UNDEFINED (-1) +#define MICROQ_MIN_RUNTIME (1000) +#define MICROQ_MIN_PERIOD (2000) + +#define DEFAULT_MICROQ_RT_PRIORITY (0) + +/* Micro Quanta class */ +struct microq_rq { + struct list_head tasks; + unsigned int microq_nr_running; + int last_push_failed; + int microq_runtime; + int microq_period; + int microq_throttled; + u64 microq_time; + u64 microq_target_time; + struct hrtimer microq_period_timer; /* Nests inside the rq lock */ + u64 quanta_start; + u64 delta_exec_uncharged; + u64 delta_exec_total; + unsigned int period_count; + unsigned int periods_to_jiffies; +}; +#endif + #ifdef CONFIG_SMP /* * XXX we want to get rid of these helpers and use the full load resolution. 
@@ -877,6 +929,9 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; +#ifdef CONFIG_SCHED_CLASS_MICROQ + struct microq_rq microq; +#endif #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ @@ -1773,6 +1828,7 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr) extern const struct sched_class stop_sched_class; extern const struct sched_class dl_sched_class; +extern const struct sched_class microq_sched_class; extern const struct sched_class rt_sched_class; extern const struct sched_class fair_sched_class; extern const struct sched_class idle_sched_class; @@ -2151,6 +2207,11 @@ extern void init_cfs_rq(struct cfs_rq *cfs_rq); extern void init_rt_rq(struct rt_rq *rt_rq); extern void init_dl_rq(struct dl_rq *dl_rq); +#ifdef CONFIG_SCHED_CLASS_MICROQ +extern void print_microq_stats(struct seq_file *m, int cpu); +extern void init_microq_rq(struct microq_rq *microq_rq); +#endif + extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1c1ad1e14f21..aba857c78c2d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -452,6 +452,29 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rr_handler, }, +#ifdef CONFIG_SCHED_CLASS_MICROQ + { + .procname = "sched_microq_period_ns", + .data = &sysctl_sched_microq_period, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_microq_proc_handler, + }, + { + .procname = "sched_microq_runtime_ns", + .data = &sysctl_sched_microq_runtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sched_microq_proc_handler, + }, + { + .procname = "sched_microq_timeslice", + .data = &sched_microq_timeslice, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif #ifdef CONFIG_UCLAMP_TASK { .procname = "sched_util_clamp_min", -- 2.23.0.187.g17f5b7556c-goog
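
For illustration, a minimal userspace sketch of how a thread could request
SCHED_MICROQ under this patch. Per microq_period_from_attr() and
microq_runtime_from_attr() in the kernel/sched/core.c changes above, the per
task period and runtime are packed into sched_priority in microseconds as
(period_us << 16) | runtime_us, with 0xffff in either field meaning
"undefined" (fall back to the global sysctls). The 20us/16us values mirror the
sample configuration in sched-microq.txt; the error handling and the fallback
define below are illustrative, not part of the patch.

/* sketch: request SCHED_MICROQ with a 20us period and 16us runtime */
#include <sched.h>
#include <stdio.h>

#ifndef SCHED_MICROQ
#define SCHED_MICROQ 7  /* matches the include/uapi/linux/sched.h addition */
#endif

int main(void)
{
        unsigned int period_us = 20;    /* must be >= MICROQ_MIN_PERIOD (2us) */
        unsigned int runtime_us = 16;   /* must be >= MICROQ_MIN_RUNTIME (1us) */
        struct sched_param sp = {
                .sched_priority = (int)((period_us << 16) | runtime_us),
        };

        /* pid 0 = calling thread; needs privileges, as with other RT-like policies */
        if (sched_setscheduler(0, SCHED_MICROQ, &sp)) {
                perror("sched_setscheduler(SCHED_MICROQ)");
                return 1;
        }

        /* ... latency sensitive work ... */
        return 0;
}

If no microq task on a cpu supplies per task parameters, the global knobs added
to kern_table apply instead: /proc/sys/kernel/sched_microq_period_ns and
/proc/sys/kernel/sched_microq_runtime_ns (both in nanoseconds), while
/proc/sys/kernel/sched_microq_timeslice sets the round-robin interval, in
ticks, among microq tasks sharing a cpu.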