On CPUs that implement per-core frequency scaling (rather than
per-socket scaling), it is beneficial to try to use cores running at a
higher frequency. One way to do this on fork/exec/wakeup is to keep the
targeted thread on its parent/previous/waker core, since that core
should already be running at a high frequency.

This works as follows:
- choose new_cpu as usual in select_task_rq_fair()
- if this is a fork/exec/wakeup and new_cpu runs at a low frequency,
  place the thread on its parent/previous/waker core instead and arm a
  timer (50us by default)
- if the thread is scheduled before the timer fires, cancel the timer
- if the timer expires, migrate the thread to new_cpu

This way, if the parent/previous/waker cpu turns out to be too busy to
run the thread, the thread still ends up on another cpu. This is
particularly useful in fork/wait patterns, where the child is placed on
an idle core (low frequency) while the parent waits and leaves its own
core idle (high frequency). This patch avoids using a low frequency
core when a higher frequency one is available.

There are two configuration parameters for this feature:
- the frequency threshold, in kHz, at or below which a core is
  considered to be running at a low frequency
  (/proc/sys/kernel/sched_lowfreq). This defaults to 0, which disables
  the delayed thread migration feature.
- the delay of the timer, in ns
  (/proc/sys/kernel/sched_delayed_placement)

Co-developed-by: Damien Carver <carverdamien@xxxxxxxxx>
Signed-off-by: Damien Carver <carverdamien@xxxxxxxxx>
Signed-off-by: Redha Gouicem <redha.gouicem@xxxxxxxxx>
---
 include/linux/sched.h        |  4 ++++
 include/linux/sched/sysctl.h |  3 +++
 kernel/sched/core.c          | 32 +++++++++++++++++++++++++++
 kernel/sched/fair.c          | 42 +++++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h         |  3 +++
 kernel/sysctl.c              | 14 ++++++++++++
 6 files changed, 97 insertions(+), 1 deletion(-)

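Not part of the patch: below is a minimal userspace sketch of how the
two knobs could be set once the patch is applied. The write_sysctl()
helper and the 2 GHz threshold are illustrative only; a real setup
would pick a threshold from the hardware's frequency range (e.g. the
cpufreq files under /sys/devices/system/cpu/cpu*/cpufreq/, which also
use kHz).

	/* Example only: enable delayed placement with a 2 GHz threshold. */
	#include <stdio.h>
	#include <stdlib.h>

	static void write_sysctl(const char *path, unsigned int val)
	{
		FILE *f = fopen(path, "w");

		if (!f) {
			perror(path);
			exit(EXIT_FAILURE);
		}
		fprintf(f, "%u\n", val);
		fclose(f);
	}

	int main(void)
	{
		/* cores at or below 2000000 kHz count as "low frequency" */
		write_sysctl("/proc/sys/kernel/sched_lowfreq", 2000000);
		/* keep the default 50000 ns delay before falling back to new_cpu */
		write_sysctl("/proc/sys/kernel/sched_delayed_placement", 50000);
		return 0;
	}
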
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2bf0af19a62a..ae823d458f94 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -662,6 +662,10 @@ struct task_struct {
 	unsigned long			wakee_flip_decay_ts;
 	struct task_struct		*last_wakee;
 
+	/* Delayed placement */
+	struct hrtimer			delay_placement_timer;
+	int				delay_placement_cpu;
+
 	/*
 	 * recent_used_cpu is initially set as the last CPU used by a task
 	 * that wakes affine another task. Waker/wakee relationships can
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 3c31ba88aca5..97a1f4489910 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -52,6 +52,9 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
 		void *buffer, size_t *length, loff_t *ppos);
 #endif
 
+extern __read_mostly unsigned int sysctl_sched_delayed_placement;
+extern __read_mostly unsigned int sysctl_sched_lowfreq;
+
 /*
  * control realtime throttling:
  *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d6d27a6fc23c..9958b38a5b6f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3217,6 +3217,33 @@ int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
 static inline void init_schedstats(void) {}
 #endif /* CONFIG_SCHEDSTATS */
 
+static enum hrtimer_restart delayed_placement_fn(struct hrtimer *data)
+{
+	struct task_struct *p = container_of(data, struct task_struct,
+					     delay_placement_timer);
+	struct rq *rq;
+	struct rq_flags rf;
+	bool queued, running;
+
+	/*
+	 * If p has already been migrated to its target cpu, there is nothing
+	 * to do. This can happen because of load balancing, for example.
+	 */
+	if (task_cpu(p) == p->delay_placement_cpu)
+		return HRTIMER_NORESTART;
+
+	rq = task_rq_lock(p, &rf);
+
+	queued = task_on_rq_queued(p);
+	running = task_current(rq, p);
+	if (queued && !running)
+		rq = __migrate_task(rq, &rf, p, p->delay_placement_cpu);
+
+	task_rq_unlock(rq, p, &rf);
+
+	return HRTIMER_NORESTART;
+}
+
 /*
  * fork()/clone()-time setup:
  */
@@ -3299,6 +3326,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
 #endif
+
+	hrtimer_init(&p->delay_placement_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
+	p->delay_placement_timer.function = delayed_placement_fn;
+
 	return 0;
 }
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 33699db27ed5..99c42c215477 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -84,6 +84,15 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
 
+/*
+ * After fork, exec or wakeup, thread placement is delayed. This option gives
+ * the delay in nanoseconds.
+ *
+ * (default: 50us)
+ */
+unsigned int sysctl_sched_delayed_placement = 50000;
+unsigned int sysctl_sched_lowfreq;
+
 int sched_thermal_decay_shift;
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
@@ -6656,6 +6665,13 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
 	return -1;
 }
 
+static bool is_cpu_low_freq(int cpu)
+{
+	if (!sysctl_sched_lowfreq)
+		return false;
+	return cpu_rq(cpu)->freq <= sysctl_sched_lowfreq;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -6683,7 +6699,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	if (sched_energy_enabled()) {
 		new_cpu = find_energy_efficient_cpu(p, prev_cpu);
 		if (new_cpu >= 0)
-			return new_cpu;
+			goto local;
 		new_cpu = prev_cpu;
 	}
 
@@ -6724,6 +6740,28 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	}
 	rcu_read_unlock();
 
+local:
+	if (!is_cpu_low_freq(new_cpu))
+		goto end;
+	/*
+	 * On fork/exec/wakeup, place the thread locally instead and arm a timer
+	 * (50us by default) to migrate it to new_cpu if it is not scheduled by then.
+	 */
+	if (new_cpu == task_cpu(p))
+		goto end;
+
+	if (sd_flag & (SD_BALANCE_FORK | SD_BALANCE_WAKE | SD_BALANCE_EXEC)) {
+		p->delay_placement_cpu = new_cpu;
+		new_cpu = task_cpu(current);
+
+		/* Arm the fallback timer (default 50us) */
+		hrtimer_start(&p->delay_placement_timer,
+			      ktime_set(0,
+					sysctl_sched_delayed_placement),
+			      HRTIMER_MODE_REL);
+	}
+
+end:
 	return new_cpu;
 }
 
@@ -7085,6 +7123,8 @@ done: __maybe_unused;
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
+	hrtimer_try_to_cancel(&p->delay_placement_timer);
+
 	update_misfit_status(p, rq);
 
 	return p;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7d794ab756d2..02da9ca69b4a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2018,6 +2018,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
 extern const_debug unsigned int sysctl_sched_nr_migrate;
 extern const_debug unsigned int sysctl_sched_migration_cost;
 
+extern unsigned int sysctl_sched_delayed_placement;
+extern unsigned int sysctl_sched_lowfreq;
+
 #ifdef CONFIG_SCHED_HRTICK
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 287862f91717..e8cc36624330 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1712,6 +1712,20 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "sched_delayed_placement",
+		.data		= &sysctl_sched_delayed_placement,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "sched_lowfreq",
+		.data		= &sysctl_sched_lowfreq,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",
-- 
2.28.0