[PATCH 5/6] delay-injection: resource management via procrastination

From: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>

inject_delay() allows pausing the current task before it returns to
userspace, at a point where the kernel holds no locks, so the wait
cannot introduce any priority-inversion problems.
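
For illustration only, a hypothetical caller could compute an absolute
CLOCK_MONOTONIC deadline and pass it in (the function name and the 10ms
value below are made up for this sketch, they are not part of the patch):

static void example_throttle_point(void)
{
	/* Ask for the return to userspace to be delayed by ~10 ms. */
	ktime_t target = ktime_add_ns(ktime_get(), 10 * NSEC_PER_MSEC);

	/*
	 * The sleep itself happens later, from task-work, right before
	 * returning to userspace, where no locks are held.
	 */
	inject_delay(target);
}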

This code abuses the existing task-work mechanism and the 'TASK_PARKED'
state. Parked tasks are killable and do not contribute to CPU load.
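
What "killable and off the CPU load accounting" means in practice is the
following state combination; a minimal sketch only, the real code in
delay_injection_sleep() below additionally arms an hrtimer that wakes the
task at the target time:

	/* TASK_WAKEKILL: a fatal signal can end the pause early. */
	__set_task_state(current, TASK_WAKEKILL | TASK_PARKED);
	schedule();	/* parked, not uninterruptible: no loadavg impact */
	__set_task_state(current, TASK_RUNNING);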

Together with percpu_ratelimit, this can be used in the following manner:

if (percpu_ratelimit_charge(&ratelimit, events))
        inject_delay(percpu_ratelimit_target(&ratelimit));
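
As implemented below, repeated calls on the same return path only move the
deadline forward to the latest target and queue a single task-work, so
several throttling sites can stack, for example (illustrative names):

	inject_delay(percpu_ratelimit_target(&io_limit));
	inject_delay(percpu_ratelimit_target(&dirty_limit));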

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>
---
 include/linux/sched.h        |    7 ++++
 include/trace/events/sched.h |    7 ++++
 kernel/sched/core.c          |   66 ++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/fair.c          |   12 ++++++++
 4 files changed, 92 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8db31ef..2363918 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1132,6 +1132,7 @@ struct sched_statistics {
 	u64			iowait_sum;
 
 	u64			sleep_start;
+	u64			delay_start;
 	u64			sleep_max;
 	s64			sum_sleep_runtime;
 
@@ -1662,6 +1663,10 @@ struct task_struct {
 	unsigned long timer_slack_ns;
 	unsigned long default_timer_slack_ns;
 
+	/* Pause task till this time before returning into userspace */
+	ktime_t delay_injection_target;
+	struct callback_head delay_injection_work;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	/* Index of current stored address in ret_stack */
 	int curr_ret_stack;
@@ -2277,6 +2282,8 @@ extern void set_curr_task(int cpu, struct task_struct *p);
 
 void yield(void);
 
+extern void inject_delay(ktime_t target);
+
 /*
  * The default (Linux) execution domain.
  */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 30fedaf..d35154e 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -365,6 +365,13 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
 	     TP_ARGS(tsk, delay));
 
 /*
+ * Tracepoint for accounting delay-injection
+ */
+DEFINE_EVENT(sched_stat_template, sched_stat_delayed,
+	     TP_PROTO(struct task_struct *tsk, u64 delay),
+	     TP_ARGS(tsk, delay));
+
+/*
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
  */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c0accc0..7a9d6a1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -65,6 +65,7 @@
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
+#include <linux/task_work.h>
 #include <linux/tick.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
@@ -8377,3 +8378,68 @@ void dump_cpu_task(int cpu)
 	pr_info("Task dump for CPU %d:\n", cpu);
 	sched_show_task(cpu_curr(cpu));
 }
+
+#define DELAY_INJECTION_SLACK_NS	(NSEC_PER_SEC / 50)
+
+static enum hrtimer_restart delay_injection_wakeup(struct hrtimer *timer)
+{
+	struct hrtimer_sleeper *t =
+		container_of(timer, struct hrtimer_sleeper, timer);
+	struct task_struct *task = t->task;
+
+	t->task = NULL;
+	if (task)
+		wake_up_state(task, TASK_PARKED);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Here the delayed task sleeps in the 'P'arked state.
+ */
+static void delay_injection_sleep(struct callback_head *head)
+{
+	struct task_struct *task = current;
+	struct hrtimer_sleeper t;
+
+	head->func = NULL;
+	__set_task_state(task, TASK_WAKEKILL | TASK_PARKED);
+	hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	hrtimer_set_expires_range_ns(&t.timer, current->delay_injection_target,
+				     DELAY_INJECTION_SLACK_NS);
+
+	t.timer.function = delay_injection_wakeup;
+	t.task = task;
+
+	hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
+	if (!hrtimer_active(&t.timer))
+		t.task = NULL;
+
+	if (likely(t.task))
+		schedule();
+
+	hrtimer_cancel(&t.timer);
+	destroy_hrtimer_on_stack(&t.timer);
+
+	__set_task_state(task, TASK_RUNNING);
+}
+
+/*
+ * inject_delay - inject a delay before returning to userspace
+ * @target: absolute monotonic timestamp to sleep until,
+ *	    the task will not return to userspace before this time
+ */
+void inject_delay(ktime_t target)
+{
+	struct task_struct *task = current;
+
+	if (ktime_after(target, task->delay_injection_target)) {
+		task->delay_injection_target = target;
+		if (!task->delay_injection_work.func) {
+			init_task_work(&task->delay_injection_work,
+					delay_injection_sleep);
+			task_work_add(task, &task->delay_injection_work, true);
+		}
+	}
+}
+EXPORT_SYMBOL(inject_delay);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40667cb..2e3269b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2944,6 +2944,15 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			account_scheduler_latency(tsk, delta >> 10, 0);
 		}
 	}
+	if (se->statistics.delay_start) {
+		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.delay_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		se->statistics.delay_start = 0;
+		trace_sched_stat_delayed(tsk, delta);
+	}
 #endif
 }
 
@@ -3095,6 +3104,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
 			if (tsk->state & TASK_UNINTERRUPTIBLE)
 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
+			if ((tsk->state & TASK_PARKED) &&
+			    tsk->delay_injection_target.tv64)
+				se->statistics.delay_start = rq_clock(rq_of(cfs_rq));
 		}
 #endif
 	}

--