The patch titled
     CFS scheduler, -v16
has been added to the -mm tree.  Its filename is
     cfs-scheduler-v16.patch

*** Remember to use Documentation/SubmitChecklist when testing your code ***

See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt
to find out what to do about this

------------------------------------------------------
Subject: CFS scheduler, -v16
From: Ingo Molnar <mingo@xxxxxxx>

-v16 includes smaller fixes. Work continued on precise /proc CPU
accounting of both SCHED_OTHER and RT tasks (Dmitry Adamushko, Balbir
Singh). Reniced tasks should now disturb nice-0 tasks even less. Also,
I have changed SCHED_BATCH back to its current mainline meaning and
added a SCHED_IDLEPRIO policy instead (first introduced by Con Kolivas
in staircase/RSDL/SD).

Changes since -v15:

 - more /proc CPU stats accounting improvements (Dmitry Adamushko,
   Balbir Singh)
 - fix SCHED_BATCH (reported by Con Kolivas)
 - update_load_fair(): use 64-bit arithmetic (Dmitry Adamushko)
 - fix the RT->NORMAL accounting issue raised by Srivatsa Vaddagiri:
   do correct exec_start stamping (Dmitry Adamushko)
 - check for negative deltas in task_sched_runtime() (Dmitry Adamushko)
 - check for large forward jumps of sched_clock()
 - cleanup: remove task_struct::last_ran (Dmitry Adamushko)
 - /proc/sched_debug printk fixes (Andrew Morton)
 - add SCHED_IDLEPRIO
 - consolidate the granularity settings and make them scale together
 - improve /proc/sched_debug output
 - remove the yield workarounds - the default behaviour seems to be
   working now
 - introduce lower and upper limits for the granularity tunables;
   setting them to zero accidentally broke nice levels
 - various small fixes/cleanups
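[ Editorial illustration, not part of the patch: a userspace task can opt
  into the new policy with an ordinary sched_setscheduler() call.  The
  minimal sketch below assumes the SCHED_IDLEPRIO value 5 from the patched
  include/linux/sched.h (the glibc headers of the day do not define it, so
  it is defined locally) and a priority of 0, which is the only priority
  the patched sched_setscheduler() accepts for this class.  On a kernel
  without this patch the call simply fails with EINVAL. ]

/* Illustrative sketch only -- not part of the patch. */
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

#ifndef SCHED_IDLEPRIO
#define SCHED_IDLEPRIO	5	/* value from the patched include/linux/sched.h */
#endif

int main(void)
{
	/* As with SCHED_NORMAL and SCHED_BATCH, the only valid priority is 0. */
	struct sched_param param = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_IDLEPRIO, &param) == -1) {
		/* A kernel without this patch rejects the policy with EINVAL. */
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return 1;
	}

	printf("now running under SCHED_IDLEPRIO\n");
	return 0;
}

[ Tasks that only want reduced wakeup preemption, rather than pure
  idle-time scheduling, should keep using SCHED_BATCH, which in -v16 again
  has its mainline meaning. ]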
Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
Signed-off-by: Dmitry Adamushko <dmitry.adamushko@xxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 drivers/char/drm/radeon_cp.c |    5 -
 fs/proc/array.c              |   32 ++++--
 include/linux/sched.h        |   10 +-
 kernel/posix-cpu-timers.c    |    2 
 kernel/sched.c               |  127 +++++++++++++++-----------
 kernel/sched_debug.c         |   38 +++++--
 kernel/sched_fair.c          |  160 +++++++++++++--------------------
 kernel/sched_rt.c            |   36 ++++++-
 kernel/sysctl.c              |   34 +++----
 9 files changed, 247 insertions(+), 197 deletions(-)

diff -puN drivers/char/drm/radeon_cp.c~cfs-scheduler-v16 drivers/char/drm/radeon_cp.c --- a/drivers/char/drm/radeon_cp.c~cfs-scheduler-v16 +++ a/drivers/char/drm/radeon_cp.c @@ -2267,11 +2267,6 @@ int radeon_driver_load(struct drm_device DRM_DEBUG("%s card detected\n", ((dev_priv->flags & RADEON_IS_AGP) ? "AGP" : (((dev_priv->flags & RADEON_IS_PCIE) ? "PCIE" : "PCI")))); - if (sysctl_sched_yield_bug_workaround == -1) { - sysctl_sched_yield_bug_workaround = 1; - printk(KERN_WARNING "quirk installed: turning on " - "sys_sched_yield() workaround for Radeon DRM.\n"); - } return ret; }
diff -puN fs/proc/array.c~cfs-scheduler-v16 fs/proc/array.c --- a/fs/proc/array.c~cfs-scheduler-v16 +++ a/fs/proc/array.c @@ -172,8 +172,8 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - p->tgid, p->pid, - pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, pid_alive(p) && p->ptrace ?
rcu_dereference(p->parent)->pid : 0, p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); @@ -322,24 +322,38 @@ int proc_pid_status(struct task_struct * static clock_t task_utime(struct task_struct *p) { + clock_t utime = cputime_to_clock_t(p->utime), + total = utime + cputime_to_clock_t(p->stime); + /* * Use CFS's precise accounting, if available: */ - if (!has_rt_policy(p) && !(sysctl_sched_load_smoothing & 128)) - return nsec_to_clock_t(p->sum_exec_runtime); + if (!(sysctl_sched_features & 128)) { + u64 temp = (u64)nsec_to_clock_t(p->sum_exec_runtime); + + if (total) { + temp *= utime; + do_div(temp, total); + } + utime = (clock_t)temp; + } - return cputime_to_clock_t(p->utime); + return utime; } static clock_t task_stime(struct task_struct *p) { + clock_t stime = cputime_to_clock_t(p->stime); + /* - * Use CFS's precise accounting, if available: + * Use CFS's precise accounting, if available (we subtract + * utime from the total, to make sure the total observed + * by userspace grows monotonically - apps rely on that): */ - if (!has_rt_policy(p) && !(sysctl_sched_load_smoothing & 128)) - return 0; + if (!(sysctl_sched_features & 128)) + stime = nsec_to_clock_t(p->sum_exec_runtime) - task_utime(p); - return cputime_to_clock_t(p->stime); + return stime; } diff -puN include/linux/sched.h~cfs-scheduler-v16 include/linux/sched.h --- a/include/linux/sched.h~cfs-scheduler-v16 +++ a/include/linux/sched.h @@ -34,6 +34,8 @@ #define SCHED_FIFO 1 #define SCHED_RR 2 #define SCHED_BATCH 3 +#define SCHED_ISO 4 +#define SCHED_IDLEPRIO 5 #ifdef __KERNEL__ @@ -876,7 +878,6 @@ struct task_struct { u64 block_max; u64 exec_max; u64 wait_max; - u64 last_ran; s64 wait_runtime; u64 sum_exec_runtime; @@ -1265,7 +1266,7 @@ static inline int set_cpus_allowed(struc extern unsigned long long sched_clock(void); extern void sched_clock_unstable_event(void); extern unsigned long long -current_sched_runtime(const struct task_struct *current_task); +task_sched_runtime(struct task_struct *task); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP @@ -1284,11 +1285,10 @@ extern void sched_idle_next(void); extern char * sched_print_task_state(struct task_struct *p, char *buffer); extern unsigned int sysctl_sched_granularity; -extern unsigned int sysctl_sched_wakeup_granularity; +extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_runtime_limit; extern unsigned int sysctl_sched_child_runs_first; -extern unsigned int sysctl_sched_load_smoothing; -extern int sysctl_sched_yield_bug_workaround; +extern unsigned int sysctl_sched_features; #ifdef CONFIG_RT_MUTEXES extern int rt_mutex_getprio(struct task_struct *p); diff -puN kernel/posix-cpu-timers.c~cfs-scheduler-v16 kernel/posix-cpu-timers.c --- a/kernel/posix-cpu-timers.c~cfs-scheduler-v16 +++ a/kernel/posix-cpu-timers.c @@ -161,7 +161,7 @@ static inline cputime_t virt_ticks(struc } static inline unsigned long long sched_ns(struct task_struct *p) { - return (p == current) ?
current_sched_runtime(p) : p->sum_exec_runtime; + return task_sched_runtime(p); } int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) diff -puN kernel/sched.c~cfs-scheduler-v16 kernel/sched.c --- a/kernel/sched.c~cfs-scheduler-v16 +++ a/kernel/sched.c @@ -154,12 +154,12 @@ struct rq { u64 clock, prev_clock_raw; s64 clock_max_delta; - u64 fair_clock, prev_fair_clock; - u64 exec_clock, prev_exec_clock; + u64 fair_clock, delta_fair_clock; + u64 exec_clock, delta_exec_clock; s64 wait_runtime; unsigned long wait_runtime_overruns, wait_runtime_underruns; - unsigned int clock_warps; + unsigned int clock_warps, clock_overflows; unsigned int clock_unstable_events; struct sched_class *load_balance_class; @@ -271,9 +271,17 @@ static inline unsigned long long __rq_cl clock++; rq->clock_warps++; } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; + /* + * Catch too large forward jumps too: + */ + if (delta > 2*TICK_NSEC) { + clock++; + rq->clock_overflows++; + } else { + if (unlikely(delta > rq->clock_max_delta)) + rq->clock_max_delta = delta; + clock += delta; + } } rq->prev_clock_raw = now; @@ -613,9 +621,9 @@ static void set_load_weight(struct task_ return; } /* - * SCHED_BATCH tasks get minimal weight: + * SCHED_IDLEPRIO tasks get minimal weight: */ - if (p->policy == SCHED_BATCH) { + if (p->policy == SCHED_IDLEPRIO) { p->load_weight = 1; return; } @@ -1275,7 +1283,7 @@ static void task_running_tick(struct rq */ static void __sched_fork(struct task_struct *p) { - p->wait_start_fair = p->wait_start = p->exec_start = p->last_ran = 0; + p->wait_start_fair = p->wait_start = p->exec_start = 0; p->sum_exec_runtime = 0; p->wait_runtime = 0; @@ -1579,37 +1587,34 @@ unsigned long nr_active(void) static void update_load_fair(struct rq *this_rq) { unsigned long this_load, fair_delta, exec_delta, idle_delta; + u64 fair_delta64, exec_delta64, tmp64; unsigned int i, scale; - s64 fair_delta64, exec_delta64; - unsigned long tmp; - u64 tmp64; this_rq->nr_load_updates++; - if (!(sysctl_sched_load_smoothing & 64)) { + if (!(sysctl_sched_features & 64)) { this_load = this_rq->raw_weighted_load; goto do_avg; } - fair_delta64 = this_rq->fair_clock - this_rq->prev_fair_clock + 1; - this_rq->prev_fair_clock = this_rq->fair_clock; + fair_delta64 = this_rq->delta_fair_clock + 1; + this_rq->delta_fair_clock = 0; - exec_delta64 = this_rq->exec_clock - this_rq->prev_exec_clock + 1; - this_rq->prev_exec_clock = this_rq->exec_clock; + exec_delta64 = this_rq->delta_exec_clock + 1; + this_rq->delta_exec_clock = 0; - if (fair_delta64 > (s64)LONG_MAX) - fair_delta64 = (s64)LONG_MAX; + if (fair_delta64 > (u64)LONG_MAX) + fair_delta64 = (u64)LONG_MAX; fair_delta = (unsigned long)fair_delta64; - if (exec_delta64 > (s64)LONG_MAX) - exec_delta64 = (s64)LONG_MAX; + if (exec_delta64 > (u64)TICK_NSEC) + exec_delta64 = (u64)TICK_NSEC; exec_delta = (unsigned long)exec_delta64; - if (exec_delta > TICK_NSEC) - exec_delta = TICK_NSEC; idle_delta = TICK_NSEC - exec_delta; - tmp = (SCHED_LOAD_SCALE * exec_delta) / fair_delta; - tmp64 = (u64)tmp * (u64)exec_delta; + tmp64 = SCHED_LOAD_SCALE * exec_delta64; + do_div(tmp64, fair_delta); + tmp64 *= exec_delta64; do_div(tmp64, TICK_NSEC); this_load = (unsigned long)tmp64; @@ -2821,17 +2826,23 @@ DEFINE_PER_CPU(struct kernel_stat, kstat EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return current->sum_exec_runtime plus any more ns on the sched_clock - * that have not yet been banked. 
+ * Return p->sum_exec_runtime plus any more ns on the sched_clock + * that have not yet been banked in case the task is currently running. */ -unsigned long long current_sched_runtime(const struct task_struct *p) +unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long long ns; unsigned long flags; + u64 ns, delta_exec; + struct rq *rq; - local_irq_save(flags); - ns = p->sum_exec_runtime + sched_clock() - p->last_ran; - local_irq_restore(flags); + rq = task_rq_lock(p, &flags); + ns = p->sum_exec_runtime; + if (rq->curr == p) { + delta_exec = rq_clock(rq) - p->exec_start; + if ((s64)delta_exec > 0) + ns += delta_exec; + } + task_rq_unlock(rq, &flags); return ns; } @@ -3565,7 +3576,7 @@ void set_user_nice(struct task_struct *p * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is - * not SCHED_NORMAL/SCHED_BATCH: + * SCHED_FIFO/SCHED_RR: */ if (has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); @@ -3714,6 +3725,7 @@ __setscheduler(struct rq *rq, struct tas switch (p->policy) { case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_IDLEPRIO: p->sched_class = &fair_sched_class; break; case SCHED_FIFO: @@ -3751,12 +3763,13 @@ recheck: if (policy < 0) policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH) + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLEPRIO) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and - * SCHED_BATCH is 0. + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLEPRIO is 0. */ if (param->sched_priority < 0 || (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || @@ -4310,6 +4323,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_IDLEPRIO: ret = 0; break; } @@ -4334,6 +4348,7 @@ asmlinkage long sys_sched_get_priority_m break; case SCHED_NORMAL: case SCHED_BATCH: + case SCHED_IDLEPRIO: ret = 0; } return ret; @@ -4496,6 +4511,29 @@ void __cpuinit init_idle(struct task_str */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; +/* + * Increase the granularity value when there are more CPUs, + * because with more CPUs the 'effective latency' as visible + * to users decreases. But the relationship is not linear, + * so pick a second-best guess by going with the log2 of the + * number of CPUs. + * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static inline void sched_init_granularity(void) +{ + unsigned int factor = 1 + ilog2(num_online_cpus()); + const unsigned long gran_limit = 10000000; + + sysctl_sched_granularity *= factor; + sysctl_sched_runtime_limit *= factor; + + if (sysctl_sched_granularity > gran_limit) + sysctl_sched_granularity = gran_limit; + + sysctl_sched_runtime_limit = sysctl_sched_granularity * 2; +} + #ifdef CONFIG_SMP /* * This is how migration works: @@ -5900,25 +5938,12 @@ void __init sched_init_smp(void) /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); - /* - * Increase the granularity value when there are more CPUs, - * because with more CPUs the 'effective latency' as visible - * to users decreases. But the relationship is not linear, - * so pick a second-best guess by going with the log2 of the - * number of CPUs. 
- * - * This idea comes from the SD scheduler of Con Kolivas: - */ - { - unsigned int factor = 1 + ilog2(num_online_cpus()); - - sysctl_sched_granularity *= factor; - sysctl_sched_runtime_limit *= factor; - } + sched_init_granularity(); } #else void __init sched_init_smp(void) { + sched_init_granularity(); } #endif /* CONFIG_SMP */ diff -puN kernel/sched_debug.c~cfs-scheduler-v16 kernel/sched_debug.c --- a/kernel/sched_debug.c~cfs-scheduler-v16 +++ a/kernel/sched_debug.c @@ -54,8 +54,7 @@ print_task(struct seq_file *m, struct rq static void print_rq(struct seq_file *m, struct rq *rq, u64 now) { - struct task_struct *p; - struct rb_node *curr; + struct task_struct *g, *p; SEQ_printf(m, "\nrunnable tasks:\n" @@ -68,13 +67,16 @@ static void print_rq(struct seq_file *m, "------------------------------------------------" "--------------------------------\n"); - curr = first_fair(rq); - while (curr) { - p = rb_entry(curr, struct task_struct, run_node); + read_lock_irq(&tasklist_lock); + + do_each_thread(g, p) { + if (!p->on_rq) + continue; + print_task(m, rq, p, now); + } while_each_thread(g, p); - curr = rb_next(curr); - } + read_unlock_irq(&tasklist_lock); } static void print_rq_runtime_sum(struct seq_file *m, struct rq *rq) @@ -117,13 +119,13 @@ static void print_cpu(struct seq_file *m P(clock); P(prev_clock_raw); P(clock_warps); + P(clock_overflows); P(clock_unstable_events); P(clock_max_delta); - rq->clock_max_delta = 0; P(fair_clock); - P(prev_fair_clock); + P(delta_fair_clock); P(exec_clock); - P(prev_exec_clock); + P(delta_exec_clock); P(wait_runtime); P(wait_runtime_overruns); P(wait_runtime_underruns); @@ -188,6 +190,18 @@ __initcall(init_sched_debug_procfs); void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { + unsigned long flags; + int num_threads = 1; + + rcu_read_lock(); + if (lock_task_sighand(p, &flags)) { + num_threads = atomic_read(&p->signal->count); + unlock_task_sighand(p, &flags); + } + rcu_read_unlock(); + + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); + SEQ_printf(m, "----------------------------------------------\n"); #define P(F) \ SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F) @@ -201,11 +215,13 @@ void proc_sched_show_task(struct task_st P(block_max); P(exec_max); P(wait_max); - P(last_ran); P(wait_runtime); P(wait_runtime_overruns); P(wait_runtime_underruns); P(sum_exec_runtime); + P(load_weight); + P(policy); + P(prio); #undef P { diff -puN kernel/sched_fair.c~cfs-scheduler-v16 kernel/sched_fair.c --- a/kernel/sched_fair.c~cfs-scheduler-v16 +++ a/kernel/sched_fair.c @@ -1,5 +1,10 @@ /* * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) + * + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@xxxxxxxxxx> + * + * Cleanups and fixes by Dmitry Adamushko. + * (C) 2007 Dmitry Adamushko <dmitry.adamushko@xxxxxxxxx> */ /* @@ -16,33 +21,24 @@ * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) */ -unsigned int sysctl_sched_granularity __read_mostly = 3000000000ULL/HZ; +unsigned int sysctl_sched_granularity __read_mostly = 2000000000ULL/HZ; /* - * Wake-up granularity. - * (default: 0, units: nanoseconds) + * SCHED_BATCH wake-up granularity. + * (default: 1 msec, units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. 
*/ -unsigned int sysctl_sched_wakeup_granularity __read_mostly = 0; - -unsigned int sysctl_sched_runtime_limit __read_mostly = 6000000000ULL/HZ; - -unsigned int sysctl_sched_load_smoothing __read_mostly = 1 | 2 | 4 | 8 | 0; - +unsigned int sysctl_sched_batch_wakeup_granularity __read_mostly = + 1000000000ULL/HZ; /* - * sys_sched_yield unfairness bug workaround switch. - * (default: -1:auto-detect+disabled. Other values: 0:disabled, 1:enabled) - * - * This option switches the unfair yield implementation of the - * old scheduler back on. Needed for good performance of certain - * apps like 3D games on Radeon cards. + * Initialized in sched_init_granularity(): */ -int sysctl_sched_yield_bug_workaround __read_mostly = 1; +unsigned int sysctl_sched_runtime_limit __read_mostly; -EXPORT_SYMBOL_GPL(sysctl_sched_yield_bug_workaround); +unsigned int sysctl_sched_features __read_mostly = 1 | 2 | 4 | 8 | 0 | 0; extern struct sched_class fair_sched_class; @@ -193,14 +189,14 @@ static inline void update_curr(struct rq u64 delta_exec, delta_fair, delta_mine; struct task_struct *curr = rq->curr; - if (curr->sched_class != &fair_sched_class || curr == rq->idle) + if (curr->sched_class != &fair_sched_class || curr == rq->idle || !load) return; /* * Get the amount of time the current task was running * since the last time we changed raw_weighted_load: */ delta_exec = now - curr->exec_start; - if (unlikely(delta_exec < 0)) + if (unlikely((s64)delta_exec < 0)) delta_exec = 0; if (unlikely(delta_exec > curr->exec_max)) curr->exec_max = delta_exec; @@ -209,22 +205,24 @@ static inline void update_curr(struct rq curr->exec_start = now; rq->exec_clock += delta_exec; - if (!load) - return; + delta_fair = delta_exec * NICE_0_LOAD; + delta_fair += load >> 1; /* rounding */ + do_div(delta_fair, load); + + /* Load-balancing accounting. 
*/ + rq->delta_fair_clock += delta_fair; + rq->delta_exec_clock += delta_exec; + /* * Task already marked for preemption, do not burden * it with the cost of not having left the CPU yet: */ - if (unlikely(sysctl_sched_load_smoothing & 1)) + if (unlikely(sysctl_sched_features & 1)) if (unlikely(test_tsk_thread_flag(curr, TIF_NEED_RESCHED))) return; - delta_fair = delta_exec * NICE_0_LOAD; - delta_fair += load >> 1; - do_div(delta_fair, load); - delta_mine = delta_exec * curr->load_weight; - delta_mine += load >> 1; + delta_mine += load >> 1; /* rounding */ do_div(delta_mine, load); rq->fair_clock += delta_fair; @@ -352,7 +350,7 @@ static void distribute_fair_add(struct r struct task_struct *curr = rq->curr; s64 delta_fair = 0; - if (!(sysctl_sched_load_smoothing & 2)) + if (!(sysctl_sched_features & 2)) return; if (rq->nr_running) { @@ -361,7 +359,8 @@ static void distribute_fair_add(struct r * The currently running task's next wait_runtime value does * not depend on the fair_clock, so fix it up explicitly: */ - add_wait_runtime(rq, curr, -delta_fair); + if (curr->sched_class == &fair_sched_class) + add_wait_runtime(rq, curr, -delta_fair); } rq->fair_clock -= delta_fair; } @@ -375,7 +374,7 @@ static void enqueue_sleeper(struct rq *r unsigned long load = rq->raw_weighted_load; s64 delta_fair, prev_runtime; - if (!(sysctl_sched_load_smoothing & 4)) + if (p->policy == SCHED_BATCH || !(sysctl_sched_features & 4)) goto out; delta_fair = rq->fair_clock - p->sleep_start_fair; @@ -384,7 +383,9 @@ static void enqueue_sleeper(struct rq *r * Fix up delta_fair with the effect of us running * during the whole sleep period: */ - delta_fair = div64_s(delta_fair * load, load + p->load_weight); + if (!(sysctl_sched_features & 32)) + delta_fair = div64_s(delta_fair * load, load + p->load_weight); + delta_fair = div64_s(delta_fair * p->load_weight, NICE_0_LOAD); prev_runtime = p->wait_runtime; __add_wait_runtime(rq, p, delta_fair); @@ -476,85 +477,39 @@ dequeue_task_fair(struct rq *rq, struct static void yield_task_fair(struct rq *rq, struct task_struct *p, struct task_struct *p_to) { - struct rb_node *curr, *next, *first; struct task_struct *p_next; - s64 yield_key; u64 now; + now = __rq_clock(rq); /* - * Bug workaround for 3D apps running on the radeon 3D driver: + * Dequeue and enqueue the task to update its + * position within the tree: */ - if (unlikely(sysctl_sched_yield_bug_workaround > 0)) { - if (sysctl_sched_yield_bug_workaround == 2) { - resched_task(p); - return; - } - now = __rq_clock(rq); - /* - * Dequeue and enqueue the task to update its - * position within the tree: - */ - dequeue_task_fair(rq, p, 0, now); - p->on_rq = 0; - enqueue_task_fair(rq, p, 0, now); - p->on_rq = 1; - - /* - * Reschedule if another task tops the current one. 
- */ - p_next = __pick_next_task_fair(rq); - if (p_next != p) - resched_task(p); - return; - } + dequeue_task_fair(rq, p, 0, now); + p->on_rq = 0; + enqueue_task_fair(rq, p, 0, now); + p->on_rq = 1; /* * yield-to support: if we are on the same runqueue then * give half of our wait_runtime (if it's positive) to the other task: */ - if (p_to && rq == task_rq(p_to) && p->wait_runtime > 0) { + if (p_to && rq == task_rq(p_to) && + p_to->sched_class == &fair_sched_class + && p->wait_runtime > 0) { + s64 delta = p->wait_runtime >> 1; __add_wait_runtime(rq, p_to, delta); __add_wait_runtime(rq, p, -delta); } - curr = &p->run_node; - first = first_fair(rq); - /* - * Move this task to the second place in the tree: - */ - if (unlikely(curr != first)) { - next = first; - } else { - next = rb_next(curr); - /* - * We were the last one already - nothing to do, return - * and reschedule: - */ - if (unlikely(!next)) - return; - } - - p_next = rb_entry(next, struct task_struct, run_node); /* - * Minimally necessary key value to be the second in the tree: - */ - yield_key = p_next->fair_key + 1; - - now = __rq_clock(rq); - dequeue_task_fair(rq, p, 0, now); - p->on_rq = 0; - - /* - * Only update the key if we need to move more backwards - * than the minimally necessary position to be the second: + * Reschedule if another task tops the current one. */ - if (p->fair_key < yield_key) - p->fair_key = yield_key; - - __enqueue_task_fair(rq, p); - p->on_rq = 1; + p_next = __pick_next_task_fair(rq); + if (p_next != p) + resched_task(p); } /* @@ -581,16 +536,23 @@ __check_preempt_curr_fair(struct rq *rq, static void check_preempt_curr_fair(struct rq *rq, struct task_struct *p) { struct task_struct *curr = rq->curr; + unsigned long granularity; if ((curr == rq->idle) || rt_prio(p->prio)) { - if (sysctl_sched_load_smoothing & 8) { + if (sysctl_sched_features & 8) { if (rt_prio(p->prio)) update_curr(rq, rq_clock(rq)); } resched_task(curr); } else { - __check_preempt_curr_fair(rq, p, curr, - sysctl_sched_wakeup_granularity); + /* + * Batch tasks prefer throughput over latency: + */ + granularity = 0; + if (unlikely(p->policy == SCHED_BATCH)) + granularity = sysctl_sched_batch_wakeup_granularity; + + __check_preempt_curr_fair(rq, p, curr, granularity); } } @@ -624,7 +586,7 @@ static void put_prev_task_fair(struct rq * preempted), update its position within the tree and * start the wait period: */ - if (sysctl_sched_load_smoothing & 16) { + if (sysctl_sched_features & 16) { if (prev->on_rq && test_tsk_thread_flag(prev, TIF_NEED_RESCHED)) { @@ -735,6 +697,12 @@ static void task_new_fair(struct rq *rq, */ p->wait_start_fair = 0; + /* + * The statistical average of wait_runtime is about + * -granularity/2, so initialize the task with that: + */ +// p->wait_runtime = -(s64)(sysctl_sched_granularity / 2); + __enqueue_task_fair(rq, p); p->on_rq = 1; inc_nr_running(p, rq); diff -puN kernel/sched_rt.c~cfs-scheduler-v16 kernel/sched_rt.c --- a/kernel/sched_rt.c~cfs-scheduler-v16 +++ a/kernel/sched_rt.c @@ -3,6 +3,28 @@ * policies) */ +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. 
+ */ +static inline void update_curr_rt(struct rq *rq, u64 now) +{ + struct task_struct *curr = rq->curr; + u64 delta_exec; + + if (!has_rt_policy(curr)) + return; + + delta_exec = now - curr->exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + if (unlikely(delta_exec > curr->exec_max)) + curr->exec_max = delta_exec; + + curr->sum_exec_runtime += delta_exec; + curr->exec_start = now; +} + static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, u64 now) { @@ -20,6 +42,8 @@ dequeue_task_rt(struct rq *rq, struct ta { struct prio_array *array = &rq->active; + update_curr_rt(rq, now); + list_del(&p->run_list); if (list_empty(array->queue + p->prio)) __clear_bit(p->prio, array->bitmap); @@ -54,6 +78,7 @@ static void check_preempt_curr_rt(struct static struct task_struct * pick_next_task_rt(struct rq *rq, u64 now) { struct prio_array *array = &rq->active; + struct task_struct *next; struct list_head *queue; int idx; @@ -62,14 +87,17 @@ static struct task_struct * pick_next_ta return NULL; queue = array->queue + idx; - return list_entry(queue->next, struct task_struct, run_list); + next = list_entry(queue->next, struct task_struct, run_list); + + next->exec_start = now; + + return next; } -/* - * No accounting done when RT tasks are descheduled: - */ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, u64 now) { + update_curr_rt(rq, now); + p->exec_start = 0; } /* diff -puN kernel/sysctl.c~cfs-scheduler-v16 kernel/sysctl.c --- a/kernel/sysctl.c~cfs-scheduler-v16 +++ a/kernel/sysctl.c @@ -207,6 +207,9 @@ static ctl_table root_table[] = { { .ctl_name = 0 } }; +static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ +static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */ + static ctl_table kern_table[] = { { .ctl_name = CTL_UNNUMBERED, @@ -214,15 +217,21 @@ static ctl_table kern_table[] = { .data = &sysctl_sched_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_wakeup_granularity_ns", - .data = &sysctl_sched_wakeup_granularity, + .procname = "sched_batch_wakeup_granularity_ns", + .data = &sysctl_sched_batch_wakeup_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, }, { .ctl_name = CTL_UNNUMBERED, @@ -230,7 +239,10 @@ static ctl_table kern_table[] = { .data = &sysctl_sched_runtime_limit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, }, { .ctl_name = CTL_UNNUMBERED, @@ -242,16 +254,8 @@ static ctl_table kern_table[] = { }, { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_load_smoothing", - .data = &sysctl_sched_load_smoothing, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_yield_bug_workaround", - .data = &sysctl_sched_yield_bug_workaround, + .procname = "sched_features", + .data = &sysctl_sched_features, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec, _ Patches currently in -mm 
which might be from mingo@xxxxxxx are

rt-mutex-fix-stale-return-value.patch
rt-mutex-fix-chain-walk-early-wakeup-bug.patch
pi-futex-fix-exit-races-and-locking-problems.patch
git-acpi-add-exports.patch
git-kvm.patch
git-selinux.patch
x86_64-irq-check-remote-irr-bit-before-migrating-level-triggered-irq-v3.patch
only-allow-nonlinear-vmas-for-ram-backed-filesystems.patch
cpuset-remove-sched-domain-hooks-from-cpusets.patch
introduce-write_trylock_irqsave.patch
use-write_trylock_irqsave-in-ptrace_attach.patch
fix-stop_machine_run-problem-with-naughty-real-time-process.patch
cpu-hotplug-fix-ksoftirqd-termination-on-cpu-hotplug-with-naughty-realtime-process.patch
cpu-hotplug-fix-ksoftirqd-termination-on-cpu-hotplug-with-naughty-realtime-process-fix.patch
pie-randomization.patch
vdso-print-fatal-signals.patch
remove-clockevents_releaserequest_device.patch
add-a-flag-to-indicate-deferrable-timers-in-proc-timer_stats.patch
introduce-o_cloexec-take-2.patch
introduce-o_cloexec-parisc-fix.patch
o_cloexec-for-scm_rights.patch
o_cloexec-for-scm_rights-fix.patch
o_cloexec-for-scm_rights-fix-2.patch
futex-tidy-up-the-code.patch
improve-behaviour-of-spurious-irq-detect.patch
improve-behaviour-of-spurious-irq-detect-fix.patch
lock-debugging-loop-nicer-in-mark_rt_mutex_waiters.patch
cfs-scheduler.patch
cfs-scheduler-vs-detach-schedh-from-mmh.patch
cfs-scheduler-v14-rc2-mm1.patch
cfs-scheduler-warning-fixes.patch
cfs-scheduler-v15-rc3-mm1.patch
fs-proc-basec-make-a-struct-static.patch
cfs-warning-fixes.patch
schedstats-fix-printk-format.patch
cfs-scheduler-v16.patch
sched-add-above-background-load-function.patch
mm-implement-swap-prefetching.patch
fix-raw_spinlock_t-vs-lockdep.patch
lockdep-sanitise-config_prove_locking.patch
lockdep-reduce-the-ifdeffery.patch
lockstat-core-infrastructure.patch
lockstat-core-infrastructure-fix.patch
lockstat-core-infrastructure-fix-fix.patch
lockstat-core-infrastructure-fix-fix-fix.patch
lockstat-human-readability-tweaks.patch
lockstat-hook-into-spinlock_t-rwlock_t-rwsem-and-mutex.patch
detect-atomic-counter-underflows.patch
make-frame_pointer-default=y.patch
mutex-subsystem-synchro-test-module.patch
lockdep-show-held-locks-when-showing-a-stackdump.patch
kmap_atomic-debugging.patch
random-warning-squishes.patch

-
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html