From: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>

inject_delay() allows pausing the current task before it returns to
userspace, at a point where the kernel holds no locks, so the wait
cannot introduce any priority-inversion problems.

This code abuses the existing task-work machinery and the 'TASK_PARKED'
state. Parked tasks are killable and do not contribute to CPU load.

Together with percpu_ratelimit this could be used in this manner:

	if (percpu_ratelimit_charge(&ratelimit, events))
		inject_delay(percpu_ratelimit_target(&ratelimit));

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>
---
 include/linux/sched.h        |  7 ++++
 include/trace/events/sched.h |  7 ++++
 kernel/sched/core.c          | 66 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          | 12 ++++++++
 4 files changed, 92 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef..2363918 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1132,6 +1132,7 @@ struct sched_statistics {
 	u64			iowait_sum;
 
 	u64			sleep_start;
+	u64			delay_start;
 	u64			sleep_max;
 	s64			sum_sleep_runtime;
 
@@ -1662,6 +1663,10 @@ struct task_struct {
 	unsigned long timer_slack_ns;
 	unsigned long default_timer_slack_ns;
 
+	/* Pause the task until this time before returning to userspace */
+	ktime_t delay_injection_target;
+	struct callback_head delay_injection_work;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	/* Index of current stored address in ret_stack */
 	int curr_ret_stack;
@@ -2277,6 +2282,8 @@ extern void set_curr_task(int cpu, struct task_struct *p);
 
 void yield(void);
 
+extern void inject_delay(ktime_t target);
+
 /*
  * The default (Linux) execution domain.
  */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf..d35154e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -365,6 +365,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
 	     TP_ARGS(tsk, delay));
 
 /*
+ * Tracepoint for accounting delay-injection
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_delayed,
+	     TP_PROTO(struct task_struct *tsk, u64 delay),
+	     TP_ARGS(tsk, delay));
+
+/*
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
  */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc0..7a9d6a1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -65,6 +65,7 @@
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
+#include <linux/task_work.h>
 #include <linux/tick.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
@@ -8377,3 +8378,68 @@ void dump_cpu_task(int cpu)
 	pr_info("Task dump for CPU %d:\n", cpu);
 	sched_show_task(cpu_curr(cpu));
 }
+
+#define DELAY_INJECTION_SLACK_NS	(NSEC_PER_SEC / 50)
+
+static enum hrtimer_restart delay_injection_wakeup(struct hrtimer *timer)
+{
+	struct hrtimer_sleeper *t =
+		container_of(timer, struct hrtimer_sleeper, timer);
+	struct task_struct *task = t->task;
+
+	t->task = NULL;
+	if (task)
+		wake_up_state(task, TASK_PARKED);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Here the delayed task sleeps in the 'P'arked state.
+ */
+static void delay_injection_sleep(struct callback_head *head)
+{
+	struct task_struct *task = current;
+	struct hrtimer_sleeper t;
+
+	head->func = NULL;
+	__set_task_state(task, TASK_WAKEKILL | TASK_PARKED);
+	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_set_expires_range_ns(&t.timer, current->delay_injection_target,
+				     DELAY_INJECTION_SLACK_NS);
+
+	t.timer.function = delay_injection_wakeup;
+	t.task = task;
+
+	hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+	if (!hrtimer_active(&t.timer))
+		t.task = NULL;
+
+	if (likely(t.task))
+		schedule();
+
+	hrtimer_cancel(&t.timer);
+	destroy_hrtimer_on_stack(&t.timer);
+
+	__set_task_state(task, TASK_RUNNING);
+}
+
+/*
+ * inject_delay - inject a delay before returning to userspace
+ * @target: absolute CLOCK_MONOTONIC timestamp to sleep until;
+ *	    the task will not return to userspace before this time
+ */
+void inject_delay(ktime_t target)
+{
+	struct task_struct *task = current;
+
+	if (ktime_after(target, task->delay_injection_target)) {
+		task->delay_injection_target = target;
+		if (!task->delay_injection_work.func) {
+			init_task_work(&task->delay_injection_work,
+				       delay_injection_sleep);
+			task_work_add(task, &task->delay_injection_work, true);
+		}
+	}
+}
+EXPORT_SYMBOL(inject_delay);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cb..2e3269b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2944,6 +2944,15 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			account_scheduler_latency(tsk, delta >> 10, 0);
 		}
 	}
+	if (se->statistics.delay_start) {
+		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.delay_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		se->statistics.delay_start = 0;
+		trace_sched_stat_delayed(tsk, delta);
+	}
 #endif
 }
 
@@ -3095,6 +3104,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 			se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
 		if (tsk->state & TASK_UNINTERRUPTIBLE)
 			se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+		if ((tsk->state & TASK_PARKED) &&
+		    tsk->delay_injection_target.tv64)
+			se->statistics.delay_start = rq_clock(rq_of(cfs_rq));
 	}
 #endif
 }
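
For context, here is a slightly fuller sketch of the intended usage than
the changelog snippet above. It assumes the percpu_ratelimit API from the
companion patch in this series; the example_*() names and the call site
are illustrative assumptions, not part of this patch.

	/*
	 * Sketch only: throttle a task that produced 'events' units of
	 * work (e.g. dirtied pages) against a shared percpu_ratelimit.
	 * Only percpu_ratelimit_charge()/_target() and inject_delay()
	 * come from this series; the rest is hypothetical.
	 */
	static struct percpu_ratelimit example_rl;

	static void example_throttle(unsigned int events)
	{
		/*
		 * charge() consumes budget and returns true once the
		 * caller must be throttled; target() then reports the
		 * absolute CLOCK_MONOTONIC time at which enough budget
		 * will have been replenished.
		 */
		if (percpu_ratelimit_charge(&example_rl, events))
			inject_delay(percpu_ratelimit_target(&example_rl));

		/*
		 * Nothing sleeps here: the delay is applied later by the
		 * task_work on the return-to-userspace path, where no
		 * kernel locks are held.
		 */
	}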
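
Note also that inject_delay() only ever moves the deadline forward (the
ktime_after() check) and queues the task_work at most once per return to
userspace, so several charges within one syscall collapse into a single
sleep. A hypothetical illustration:

	/*
	 * Illustration only: all three calls share one task_work and
	 * produce a single sleep lasting until now + 50ms.
	 */
	static void example_repeated_charges(void)
	{
		ktime_t now = ktime_get();

		/* sets the target and queues the task_work */
		inject_delay(ktime_add_ns(now, 10 * NSEC_PER_MSEC));
		/* only extends the already-queued target */
		inject_delay(ktime_add_ns(now, 50 * NSEC_PER_MSEC));
		/* ignored: earlier than the current target */
		inject_delay(ktime_add_ns(now, 20 * NSEC_PER_MSEC));
	}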