+ cfs-v19.patch added to -mm tree

The patch titled
     CFS -v19
has been added to the -mm tree.  Its filename is
     cfs-v19.patch

*** Remember to use Documentation/SubmitChecklist when testing your code ***

See http://www.zip.com.au/~akpm/linux/patches/stuff/added-to-mm.txt to find
out what to do about this

------------------------------------------------------
Subject: CFS -v19
From: Ingo Molnar <mingo@xxxxxxx>

The biggest user-visible change in -v19 is reworked sleeper fairness:
it's similar in behavior to -v18 but works more consistently across nice
levels. Fork-happy workloads (like kernel builds) should behave better
as well. There are also a handful of speedups: unsigned math, 32-bit
optimizations, O(1) task pickup, debloating and other micro-optimizations.

Changes since -v18:

 - merged the group-scheduling CFS-core changes from Srivatsa Vaddagiri.
   This makes up the bulk of the changes in -v19 but has no
   behavioral impact. The final group-fairness enabler patch is now a
   small and lean add-on patch to CFS.
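
   As a rough sketch of the core change (condensed from the sched_fair.c
   hunks below; no new code, just the pattern pulled together), every
   scheduling entity can now have a parent entity and its own cfs_rq, and
   the fair-class methods simply walk that hierarchy:

        #ifdef CONFIG_FAIR_GROUP_SCHED
        /* walk up the entity hierarchy: task -> group -> ... */
        # define for_each_sched_entity(se)     for (; se; se = se->parent)
        #else
        /* without group scheduling, the "hierarchy" is just the task itself */
        # define for_each_sched_entity(se)     for (; se; se = NULL)
        #endif

        static void
        enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
        {
                struct sched_entity *se = &p->se;

                for_each_sched_entity(se) {
                        if (se->on_rq)
                                break;  /* parent entities are already queued */
                        enqueue_entity(cfs_rq_of(se), se, wakeup, now);
                }
        }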

 - fix the bloat noticed by Andrew. On 32-bit it's now this:

      text    data     bss     dec     hex   filename
     24362    3905      24   28291    6e83   sched.o-rc7
     33015    2538      20   35573    8af5   sched.o-v18
     25805    2426      20   28251    6e5b   sched.o-v19

   so it's a net win compared to vanilla. On 64-bit it's even better:

      text    data     bss     dec     hex   filename
     35732   40314    2168   78214   13186   sched.o.x64-rc7
     41397   37642    2168   81207   13d37   sched.o.x64-v18
     36132   37410    2168   75710   127be   sched.o.x64-v19

   ( and there's also a +1.5K data win per CPU on x32, which is not
     shown here. [+3.0K data win per CPU on x64.] )

 - a good number of core code updates, cleanups and streamlining.
   (Mike Galbraith, Srivatsa Vaddagiri, Dmitry Adamushko, me.)

 - use unsigned data types almost everywhere in CFS. This produces
   faster and smaller code, and simplifies the logic.

 - turn as many 'u64' data types into 'unsigned long' as possible, to
   reduce the 32-bit footprint and to reduce 64-bit arithmetic.
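
   The pattern (see calc_delta_mine() in the kernel/sched.c hunk below)
   is to keep deltas in 'unsigned long' and only widen to u64 around the
   one multiplication that can overflow; condensed:

        static inline unsigned long
        calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                        struct load_weight *lw)
        {
                u64 tmp;

                if (unlikely(!lw->inv_weight))
                        lw->inv_weight = WMULT_CONST / lw->weight;

                /* only this intermediate product needs 64 bits: */
                tmp = (u64)delta_exec * weight;
                if (unlikely(tmp > WMULT_CONST))
                        tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
                                        >> (WMULT_SHIFT/2);
                else
                        tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;

                return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
        }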

 - replaced the nr_running-based 'sleep fairness' logic with a more
   robust concept. The end result is similar in behavior to v18, but
   negative nice levels are handled much better in this scheme.
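
   Sleep credit is now tracked in a per-runqueue cfs_rq->sleeper_bonus
   field instead of being derived from nr_running; roughly (fragments
   condensed from the sched_fair.c hunks below):

        /* on wakeup: credit the sleeper and remember how much was granted */
        __add_wait_runtime(cfs_rq, se, delta_fair);
        cfs_rq->sleeper_bonus += delta_fair;

        /*
         * in __update_curr(): the bonus is gradually deducted from what
         * the currently running task earns:
         */
        if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) {
                delta = calc_delta_mine(cfs_rq->sleeper_bonus,
                                        curr->load.weight, lw);
                if (unlikely(delta > cfs_rq->sleeper_bonus))
                        delta = cfs_rq->sleeper_bonus;

                cfs_rq->sleeper_bonus -= delta;
                delta_mine -= delta;
        }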

 - speedup: O(1) task pickup by Srivatsa Vaddagiri. [sleep/wakeup is
   O(log2(nr_running)).] This gives 5-10% better hackbench 100/500
   results on a 4-way box.
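
   The O(1) pickup comes from keeping the leftmost rbtree node cached and
   maintaining it as the tree is updated, instead of falling back to
   rb_first(); condensed from the sched_fair.c hunks below:

        static inline void
        __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
        {
                /* keep the cached leftmost node up to date: O(log n) here... */
                if (cfs_rq->rb_leftmost == &se->run_node)
                        cfs_rq->rb_leftmost = rb_next(&se->run_node);
                rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
                update_load_sub(&cfs_rq->load, se->load.weight);
                cfs_rq->nr_running--;
        }

        static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
        {
                /* ...so picking the next task is a plain pointer read */
                return cfs_rq->rb_leftmost;
        }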

 - fix: set idle->sched_class back to &idle_sched_class in
   migration_call(). (Dmitry Adamushko)

 - cleanup: use an enum for the sched_feature flags. (suggested by
   Andrew Morton)
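
   Feature tests in the code now use named bits instead of magic numbers;
   from the sched_fair.c hunk below:

        enum {
                SCHED_FEAT_FAIR_SLEEPERS        = 1,
                SCHED_FEAT_SLEEPER_AVG          = 2,
                SCHED_FEAT_SLEEPER_LOAD_AVG     = 4,
                SCHED_FEAT_PRECISE_CPU_LOAD     = 8,
                SCHED_FEAT_START_DEBIT          = 16,
                SCHED_FEAT_SKIP_INITIAL         = 32,
        };

        /* e.g. instead of 'sysctl_sched_features & 128': */
        if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
                p->se.wait_runtime = -(sysctl_sched_granularity / 2);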

 - cleanup: turn the priority macros into inlines. (suggested by
   Andrew Morton)

 - (other cleanups suggested by Andrew Morton)

 - debug: split out the debugging data into CONFIG_SCHED_DEBUG.
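
   With CONFIG_SCHED_DEBUG disabled, /proc/<pid>/sched and sched_debug.c
   are compiled out and the proc helpers become empty inline stubs (see
   the include/linux/sched.h hunk below), e.g.:

        #ifdef CONFIG_SCHED_DEBUG
        extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
        extern void proc_sched_set_task(struct task_struct *p);
        #else
        static inline void
        proc_sched_show_task(struct task_struct *p, struct seq_file *m)
        {
        }
        static inline void proc_sched_set_task(struct task_struct *p)
        {
        }
        #endif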

Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 Makefile                |    0 
 fs/proc/array.c         |   24 -
 fs/proc/base.c          |    7 
 include/linux/sched.h   |  157 ++++++----
 kernel/exit.c           |    2 
 kernel/sched.c          |  313 ++++++++++++--------
 kernel/sched_debug.c    |   34 +-
 kernel/sched_fair.c     |  571 ++++++++++++++++++++++++++------------
 kernel/sched_idletask.c |   13 
 kernel/sched_rt.c       |   62 +++-
 lib/Kconfig.debug       |    9 
 11 files changed, 804 insertions(+), 388 deletions(-)

diff -puN fs/proc/array.c~cfs-v19 fs/proc/array.c
--- a/fs/proc/array.c~cfs-v19
+++ a/fs/proc/array.c
@@ -324,19 +324,18 @@ static clock_t task_utime(struct task_st
 {
 	clock_t utime = cputime_to_clock_t(p->utime),
 		total = utime + cputime_to_clock_t(p->stime);
+	u64 temp;
 
 	/*
-	 * Use CFS's precise accounting, if available:
+	 * Use CFS's precise accounting:
 	 */
-	if (!(sysctl_sched_features & 128)) {
-		u64 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
+	temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
 
-		if (total) {
-			temp *= utime;
-			do_div(temp, total);
-		}
-		utime = (clock_t)temp;
+	if (total) {
+		temp *= utime;
+		do_div(temp, total);
 	}
+	utime = (clock_t)temp;
 
 	return utime;
 }
@@ -346,12 +345,11 @@ static clock_t task_stime(struct task_st
 	clock_t stime = cputime_to_clock_t(p->stime);
 
 	/*
-	 * Use CFS's precise accounting, if available (we subtract
-	 * utime from the total, to make sure the total observed
-	 * by userspace grows monotonically - apps rely on that):
+	 * Use CFS's precise accounting. (we subtract utime from
+	 * the total, to make sure the total observed by userspace
+	 * grows monotonically - apps rely on that):
 	 */
-	if (!(sysctl_sched_features & 128))
-		stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
+	stime = nsec_to_clock_t(p->se.sum_exec_runtime) - task_utime(p);
 
 	return stime;
 }
diff -puN fs/proc/base.c~cfs-v19 fs/proc/base.c
--- a/fs/proc/base.c~cfs-v19
+++ a/fs/proc/base.c
@@ -832,6 +832,7 @@ static const struct file_operations proc
 };
 #endif
 
+#ifdef CONFIG_SCHED_DEBUG
 /*
  * Print out various scheduling related per-task fields:
  */
@@ -892,6 +893,8 @@ static const struct file_operations proc
 	.release	= seq_release,
 };
 
+#endif
+
 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
@@ -1927,7 +1930,9 @@ static const struct pid_entry tgid_base_
 	INF("environ",    S_IRUSR, pid_environ),
 	INF("auxv",       S_IRUSR, pid_auxv),
 	INF("status",     S_IRUGO, pid_status),
+#ifdef CONFIG_SCHED_DEBUG
 	REG("sched",      S_IRUGO|S_IWUSR, pid_sched),
+#endif
 	INF("cmdline",    S_IRUGO, pid_cmdline),
 	INF("stat",       S_IRUGO, tgid_stat),
 	INF("statm",      S_IRUGO, pid_statm),
@@ -2216,7 +2221,9 @@ static const struct pid_entry tid_base_s
 	INF("environ",   S_IRUSR, pid_environ),
 	INF("auxv",      S_IRUSR, pid_auxv),
 	INF("status",    S_IRUGO, pid_status),
+#ifdef CONFIG_SCHED_DEBUG
 	REG("sched",     S_IRUGO|S_IWUSR, pid_sched),
+#endif
 	INF("cmdline",   S_IRUGO, pid_cmdline),
 	INF("stat",      S_IRUGO, tid_stat),
 	INF("statm",     S_IRUGO, pid_statm),
diff -puN include/linux/sched.h~cfs-v19 include/linux/sched.h
--- a/include/linux/sched.h~cfs-v19
+++ a/include/linux/sched.h
@@ -136,8 +136,25 @@ extern unsigned long nr_iowait(void);
 extern unsigned long weighted_cpuload(const int cpu);
 
 struct seq_file;
+struct cfs_rq;
+#ifdef CONFIG_SCHED_DEBUG
 extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
 extern void proc_sched_set_task(struct task_struct *p);
+extern void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now);
+#else
+static inline void
+proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{
+}
+static inline void proc_sched_set_task(struct task_struct *p)
+{
+}
+static inline void
+print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
+{
+}
+#endif
 
 /*
  * Task state bitmask. NOTE! These bits are also
@@ -535,31 +552,6 @@ struct signal_struct {
 #define SIGNAL_STOP_CONTINUED	0x00000004 /* SIGCONT since WCONTINUED reap */
 #define SIGNAL_GROUP_EXIT	0x00000008 /* group exit in progress */
 
-
-/*
- * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
- * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
- * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space.  This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
- */
-
-#define MAX_USER_RT_PRIO	100
-#define MAX_RT_PRIO		MAX_USER_RT_PRIO
-
-#define MAX_PRIO		(MAX_RT_PRIO + 40)
-#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
-
-#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
-#define rt_task(p)		rt_prio((p)->prio)
-#define is_rt_policy(p)		((p) == SCHED_FIFO || (p) == SCHED_RR)
-#define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
-
 /*
  * Some day this will be a full-fledged user tracking system..
  */
@@ -653,8 +645,7 @@ static inline int sched_info_on(void)
 #endif
 }
 
-enum cpu_idle_type
-{
+enum cpu_idle_type {
 	CPU_IDLE,
 	CPU_NOT_IDLE,
 	CPU_NEWLY_IDLE,
@@ -669,7 +660,7 @@ enum cpu_idle_type
  * Increase resolution of nice-level calculations:
  */
 #define SCHED_LOAD_SHIFT	10
-#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)
+#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
 
 #define SCHED_LOAD_SCALE_FUZZ	(SCHED_LOAD_SCALE >> 5)
 
@@ -838,6 +829,7 @@ struct pipe_inode_info;
 struct uts_namespace;
 
 struct rq;
+struct sched_domain;
 
 struct sched_class {
 	struct sched_class *next;
@@ -853,8 +845,13 @@ struct sched_class {
 	struct task_struct * (*pick_next_task) (struct rq *rq, u64 now);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p, u64 now);
 
-	struct task_struct * (*load_balance_start) (struct rq *rq);
-	struct task_struct * (*load_balance_next) (struct rq *rq);
+	int (*load_balance) (struct rq *this_rq, int this_cpu,
+			struct rq *busiest,
+			unsigned long max_nr_move, unsigned long max_load_move,
+			struct sched_domain *sd, enum cpu_idle_type idle,
+			int *all_pinned, unsigned long *total_load_moved);
+
+	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
 };
@@ -863,30 +860,53 @@ struct load_weight {
 	unsigned long weight, inv_weight;
 };
 
-/* CFS stats for a schedulable entity (task, task-group etc) */
+/*
+ * CFS stats for a schedulable entity (task, task-group etc)
+ *
+ * Current field usage histogram:
+ *
+ *     4 se->block_start
+ *     4 se->run_node
+ *     4 se->sleep_start
+ *     4 se->sleep_start_fair
+ *     6 se->load.weight
+ *     7 se->delta_fair
+ *    15 se->wait_runtime
+ */
 struct sched_entity {
-	struct load_weight load;	/* for nice- load-balancing purposes */
-	int on_rq;
-	struct rb_node run_node;
-	unsigned long delta_exec;
-	s64 delta_fair;
-
-	u64 wait_start_fair;
-	u64 wait_start;
-	u64 exec_start;
-	u64 sleep_start, sleep_start_fair;
-	u64 block_start;
-	u64 sleep_max;
-	u64 block_max;
-	u64 exec_max;
-	u64 wait_max;
-	u64 last_ran;
-
-	s64 wait_runtime;
-	u64 sum_exec_runtime;
-	s64 fair_key;
-	s64 sum_wait_runtime, sum_sleep_runtime;
-	unsigned long wait_runtime_overruns, wait_runtime_underruns;
+	long			wait_runtime;
+	unsigned long		delta_fair_run;
+	unsigned long		delta_fair_sleep;
+	unsigned long		delta_exec;
+	s64			fair_key;
+	struct load_weight	load;		/* for load-balancing */
+	struct rb_node		run_node;
+	unsigned int		on_rq;
+
+	u64			wait_start_fair;
+	u64			wait_start;
+	u64			exec_start;
+	u64			sleep_start;
+	u64			sleep_start_fair;
+	u64			block_start;
+	u64			sleep_max;
+	u64			block_max;
+	u64			exec_max;
+	u64			wait_max;
+	u64			last_ran;
+
+	u64			sum_exec_runtime;
+	s64			sum_wait_runtime;
+	s64			sum_sleep_runtime;
+	unsigned long		wait_runtime_overruns;
+	unsigned long		wait_runtime_underruns;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity	*parent;
+	/* rq on which this entity is (to be) queued: */
+	struct cfs_rq		*cfs_rq;
+	/* rq "owned" by this entity/group: */
+	struct cfs_rq		*my_q;
+#endif
 };
 
 struct task_struct {
@@ -1147,6 +1167,37 @@ struct task_struct {
 #endif
 };
 
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space.  This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO	100
+#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+
+static inline int rt_prio(int prio)
+{
+	if (unlikely(prio < MAX_RT_PRIO))
+		return 1;
+	return 0;
+}
+
+static inline int rt_task(struct task_struct *p)
+{
+	return rt_prio(p->prio);
+}
+
 static inline pid_t process_group(struct task_struct *tsk)
 {
 	return tsk->signal->pgrp;
diff -puN kernel/exit.c~cfs-v19 kernel/exit.c
--- a/kernel/exit.c~cfs-v19
+++ a/kernel/exit.c
@@ -292,7 +292,7 @@ static void reparent_to_kthreadd(void)
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
 
-	if (!has_rt_policy(current) && (task_nice(current) < 0))
+	if (task_nice(current) < 0)
 		set_user_nice(current, 0);
 	/* cpus_allowed? */
 	/* rt_priority? */
diff -puN kernel/sched.c~cfs-v19 kernel/sched.c
--- a/kernel/sched.c~cfs-v19
+++ a/kernel/sched.c
@@ -108,6 +108,18 @@ unsigned long long __attribute__((weak))
 #define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
 #define DEF_TIMESLICE		(100 * HZ / 1000)
 
+static inline int rt_policy(int policy)
+{
+	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+		return 1;
+	return 0;
+}
+
+static inline int task_has_rt_policy(struct task_struct *p)
+{
+	return rt_policy(p->policy);
+}
+
 /*
  * This is the priority-queue data structure of the RT scheduling class:
  */
@@ -119,7 +131,7 @@ struct prio_array {
 struct load_stat {
 	struct load_weight load;
 	u64 load_update_start, load_update_last;
-	u64 delta_fair, delta_exec, delta_stat;
+	unsigned long delta_fair, delta_exec, delta_stat;
 };
 
 /* CFS-related fields in a runqueue */
@@ -127,14 +139,31 @@ struct cfs_rq {
 	struct load_weight load;
 	unsigned long nr_running;
 
-	u64 fair_clock;
+	s64 fair_clock;
 	u64 exec_clock;
 	s64 wait_runtime;
+	u64 sleeper_bonus;
 	unsigned long wait_runtime_overruns, wait_runtime_underruns;
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
 	struct rb_node *rb_load_balance_curr;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/* 'curr' points to currently running entity on this cfs_rq.
+	 * It is set to NULL otherwise (i.e when none are currently running).
+	 */
+	struct sched_entity *curr;
+	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
+
+	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+	 * (like users, containers etc.)
+	 *
+	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
+	 * list is used during load balance.
+	 */
+	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
+#endif
 };
 
 /* Real-Time classes' related field in a runqueue: */
@@ -158,7 +187,7 @@ struct rq {
 	 * nr_running and cpu_load should be in the same cacheline because
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
-	long nr_running;
+	unsigned long nr_running;
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
 	unsigned char idle_at_tick;
@@ -170,6 +199,9 @@ struct rq {
 	u64 nr_switches;
 
 	struct cfs_rq cfs;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
+#endif
 	struct rt_rq  rt;
 
 	/*
@@ -344,6 +376,18 @@ static inline unsigned long long rq_cloc
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Change a task's ->cfs_rq if it moves across CPUs */
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+	p->se.cfs_rq = &task_rq(p)->cfs;
+}
+#else
+static inline void set_task_cfs_rq(struct task_struct *p)
+{
+}
+#endif
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
@@ -571,19 +615,6 @@ static u64 div64_likely32(u64 divident, 
 #endif
 }
 
-static s64 div64_s(s64 divident, unsigned long divisor)
-{
-	u64 tmp;
-
-	if (divident < 0) {
-		tmp = -divident;
-		return -(s64)div64_likely32(tmp, divisor);
-	} else {
-		tmp = divident;
-		return (s64)div64_likely32(tmp, divisor);
-	}
-}
-
 #if BITS_PER_LONG == 32
 # define WMULT_CONST	(~0UL)
 #else
@@ -592,16 +623,31 @@ static s64 div64_s(s64 divident, unsigne
 
 #define WMULT_SHIFT	32
 
-static inline u64
-calc_delta_mine(u64 delta_exec, unsigned long weight, struct load_weight *lw)
+static inline unsigned long
+calc_delta_mine(unsigned long delta_exec, unsigned long weight,
+		struct load_weight *lw)
 {
+	u64 tmp;
+
 	if (unlikely(!lw->inv_weight))
 		lw->inv_weight = WMULT_CONST / lw->weight;
 
-	return (delta_exec * weight * lw->inv_weight) >> WMULT_SHIFT;
+	tmp = (u64)delta_exec * weight;
+	/*
+	 * Check whether we'd overflow the 64-bit multiplication:
+	 */
+	if (unlikely(tmp > WMULT_CONST)) {
+		tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
+				>> (WMULT_SHIFT/2);
+	} else {
+		tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
+	}
+
+	return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
 }
 
-static inline u64 calc_delta_fair(u64 delta_exec, struct load_weight *lw)
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 {
 	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
 }
@@ -642,7 +688,7 @@ static void __update_curr_load(struct rq
  * This function is called /before/ updating rq->ls.load
  * and when switching tasks.
  */
-static inline void update_curr_load(struct rq *rq, u64 now)
+static void update_curr_load(struct rq *rq, u64 now)
 {
 	struct load_stat *ls = &rq->ls;
 	u64 start;
@@ -658,7 +704,6 @@ static inline void update_curr_load(stru
 		__update_curr_load(rq, ls);
 }
 
-
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -741,11 +786,26 @@ static inline void dec_nr_running(struct
 
 static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
 
+struct rq_iterator {
+	void *arg;
+	struct task_struct *(*start)(void *);
+	struct task_struct *(*next)(void *);
+};
+
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum cpu_idle_type idle,
+		      int *all_pinned, unsigned long *load_moved,
+		      int this_best_prio, int best_prio, int best_prio_seen,
+		      struct rq_iterator *iterator);
+
 #include "sched_stats.h"
 #include "sched_rt.c"
 #include "sched_fair.c"
 #include "sched_idletask.c"
-#include "sched_debug.c"
+#ifdef CONFIG_SCHED_DEBUG
+# include "sched_debug.c"
+#endif
 
 #define sched_class_highest (&rt_sched_class)
 
@@ -754,7 +814,7 @@ static void set_load_weight(struct task_
 	task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
 	p->se.wait_runtime = 0;
 
-	if (has_rt_policy(p)) {
+	if (task_has_rt_policy(p)) {
 		p->se.load.weight = prio_to_weight[0] * 2;
 		p->se.load.inv_weight = prio_to_wmult[0] >> 1;
 		return;
@@ -807,7 +867,7 @@ static inline int normal_prio(struct tas
 {
 	int prio;
 
-	if (has_rt_policy(p))
+	if (task_has_rt_policy(p))
 		prio = MAX_RT_PRIO-1 - p->rt_priority;
 	else
 		prio = __normal_prio(p);
@@ -896,6 +956,7 @@ unsigned long weighted_cpuload(const int
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 	task_thread_info(p)->cpu = cpu;
+	set_task_cfs_rq(p);
 }
 
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -907,7 +968,6 @@ void set_task_cpu(struct task_struct *p,
 	clock_offset = old_rq->clock - new_rq->clock;
 	fair_clock_offset = old_rq->cfs.fair_clock -
 						 new_rq->cfs.fair_clock;
-
 	if (p->se.wait_start)
 		p->se.wait_start -= clock_offset;
 	if (p->se.wait_start_fair)
@@ -921,6 +981,7 @@ void set_task_cpu(struct task_struct *p,
 
 	task_thread_info(p)->cpu = new_cpu;
 
+	set_task_cfs_rq(p);
 }
 
 struct migration_req {
@@ -1478,17 +1539,25 @@ int fastcall wake_up_state(struct task_s
  */
 static void __sched_fork(struct task_struct *p)
 {
-	p->se.wait_start_fair = p->se.wait_start = p->se.exec_start = 0;
-	p->se.sum_exec_runtime = 0;
-	p->se.delta_exec = 0;
-	p->se.delta_fair = 0;
-
-	p->se.wait_runtime = 0;
-
-	p->se.sum_wait_runtime = p->se.sum_sleep_runtime = 0;
-	p->se.sleep_start = p->se.sleep_start_fair = p->se.block_start = 0;
-	p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
-	p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
+	p->se.wait_start_fair		= 0;
+	p->se.wait_start		= 0;
+	p->se.exec_start		= 0;
+	p->se.sum_exec_runtime		= 0;
+	p->se.delta_exec		= 0;
+	p->se.delta_fair_run		= 0;
+	p->se.delta_fair_sleep		= 0;
+	p->se.wait_runtime		= 0;
+	p->se.sum_wait_runtime		= 0;
+	p->se.sum_sleep_runtime		= 0;
+	p->se.sleep_start		= 0;
+	p->se.sleep_start_fair		= 0;
+	p->se.block_start		= 0;
+	p->se.sleep_max			= 0;
+	p->se.block_max			= 0;
+	p->se.exec_max			= 0;
+	p->se.wait_max			= 0;
+	p->se.wait_runtime_overruns	= 0;
+	p->se.wait_runtime_underruns	= 0;
 
 	INIT_LIST_HEAD(&p->run_list);
 	p->se.on_rq = 0;
@@ -1801,7 +1870,7 @@ static void update_cpu_load(struct rq *t
 	int i, scale;
 
 	this_rq->nr_load_updates++;
-	if (sysctl_sched_features & 64)
+	if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
 		goto do_avg;
 
 	/* Update delta_fair/delta_exec fields first */
@@ -2005,89 +2074,26 @@ int can_migrate_task(struct task_struct 
 	return 1;
 }
 
-/*
- * Load-balancing iterator: iterate through the hieararchy of scheduling
- * classes, starting with the highest-prio one:
- */
-
-struct task_struct * load_balance_start(struct rq *rq)
-{
-	struct sched_class *class = sched_class_highest;
-	struct task_struct *p;
-
-	do {
-		p = class->load_balance_start(rq);
-		if (p) {
-			rq->load_balance_class = class;
-			return p;
-		}
-		class = class->next;
-	} while (class);
-
-	return NULL;
-}
-
-struct task_struct * load_balance_next(struct rq *rq)
-{
-	struct sched_class *class = rq->load_balance_class;
-	struct task_struct *p;
-
-	p = class->load_balance_next(rq);
-	if (p)
-		return p;
-	/*
-	 * Pick up the next class (if any) and attempt to start
-	 * the iterator there:
-	 */
-	while ((class = class->next)) {
-		p = class->load_balance_start(rq);
-		if (p) {
-			rq->load_balance_class = class;
-			return p;
-		}
-	}
-	return NULL;
-}
-
-#define rq_best_prio(rq) (rq)->curr->prio
-
-/*
- * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
- * load from busiest to this_rq, as part of a balancing operation within
- * "domain". Returns the number of tasks moved.
- *
- * Called with both runqueues locked.
- */
-static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		      unsigned long max_nr_move, unsigned long max_load_move,
 		      struct sched_domain *sd, enum cpu_idle_type idle,
-		      int *all_pinned)
+		      int *all_pinned, unsigned long *load_moved,
+		      int this_best_prio, int best_prio, int best_prio_seen,
+		      struct rq_iterator *iterator)
 {
-	int pulled = 0, pinned = 0, this_best_prio, best_prio,
-	    best_prio_seen, skip_for_load;
+	int pulled = 0, pinned = 0, skip_for_load;
 	struct task_struct *p;
-	long rem_load_move;
+	long rem_load_move = max_load_move;
 
 	if (max_nr_move == 0 || max_load_move == 0)
 		goto out;
 
-	rem_load_move = max_load_move;
 	pinned = 1;
-	this_best_prio = rq_best_prio(this_rq);
-	best_prio = rq_best_prio(busiest);
-	/*
-	 * Enable handling of the case where there is more than one task
-	 * with the best priority.   If the current running task is one
-	 * of those with prio==best_prio we know it won't be moved
-	 * and therefore it's safe to override the skip (based on load) of
-	 * any task we find with that prio.
-	 */
-	best_prio_seen = best_prio == busiest->curr->prio;
 
 	/*
 	 * Start the load-balancing iterator:
 	 */
-	p = load_balance_start(busiest);
+	p = iterator->start(iterator->arg);
 next:
 	if (!p)
 		goto out;
@@ -2104,7 +2110,7 @@ next:
 	    !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
 
 		best_prio_seen |= p->prio == best_prio;
-		p = load_balance_next(busiest);
+		p = iterator->next(iterator->arg);
 		goto next;
 	}
 
@@ -2119,7 +2125,7 @@ next:
 	if (pulled < max_nr_move && rem_load_move > 0) {
 		if (p->prio < this_best_prio)
 			this_best_prio = p->prio;
-		p = load_balance_next(busiest);
+		p = iterator->next(iterator->arg);
 		goto next;
 	}
 out:
@@ -2132,10 +2138,40 @@ out:
 
 	if (all_pinned)
 		*all_pinned = pinned;
+	*load_moved = max_load_move - rem_load_move;
 	return pulled;
 }
 
 /*
+ * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
+ *
+ * Called with both runqueues locked.
+ */
+static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum cpu_idle_type idle,
+		      int *all_pinned)
+{
+	struct sched_class *class = sched_class_highest;
+	unsigned long load_moved, total_nr_moved = 0, nr_moved;
+	long rem_load_move = max_load_move;
+
+	do {
+		nr_moved = class->load_balance(this_rq, this_cpu, busiest,
+				max_nr_move, (unsigned long)rem_load_move,
+				sd, idle, all_pinned, &load_moved);
+		total_nr_moved += nr_moved;
+		max_nr_move -= nr_moved;
+		rem_load_move -= load_moved;
+		class = class->next;
+	} while (class && max_nr_move && rem_load_move > 0);
+
+	return total_nr_moved;
+}
+
+/*
  * find_busiest_group finds and returns the busiest CPU group within the
  * domain. It calculates and returns the amount of weighted load which
  * should be moved to restore balance via the imbalance parameter.
@@ -3043,6 +3079,19 @@ static inline void idle_balance(int cpu,
 {
 }
 
+/* Avoid "used but not defined" warning on UP */
+static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		      unsigned long max_nr_move, unsigned long max_load_move,
+		      struct sched_domain *sd, enum cpu_idle_type idle,
+		      int *all_pinned, unsigned long *load_moved,
+		      int this_best_prio, int best_prio, int best_prio_seen,
+		      struct rq_iterator *iterator)
+{
+	*load_moved = 0;
+
+	return 0;
+}
+
 #endif	/* CONFIG_SMP */
 
 DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3803,7 +3852,7 @@ void set_user_nice(struct task_struct *p
 	 * it wont have any effect on scheduling until the task is
 	 * SCHED_FIFO/SCHED_RR:
 	 */
-	if (has_rt_policy(p)) {
+	if (task_has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -4001,16 +4050,15 @@ recheck:
 	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
 	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if (is_rt_policy(policy) != (param->sched_priority != 0))
+	if (rt_policy(policy) != (param->sched_priority != 0))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (!capable(CAP_SYS_NICE)) {
-		if (is_rt_policy(policy)) {
+		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio;
-			unsigned long flags;
 
 			if (!lock_task_sighand(p, &flags))
 				return -ESRCH;
@@ -4653,7 +4701,9 @@ void show_state_filter(unsigned long sta
 	 */
 	if (state_filter == -1)
 		debug_show_all_locks();
+#ifdef CONFIG_SCHED_DEBUG
 	sysrq_sched_debug_show();
+#endif
 }
 
 void __cpuinit init_idle_bootup_task(struct task_struct *idle)
@@ -4728,6 +4778,7 @@ static inline void sched_init_granularit
 		sysctl_sched_granularity = gran_limit;
 
 	sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
+	sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
 }
 
 #ifdef CONFIG_SMP
@@ -5233,6 +5284,7 @@ migration_call(struct notifier_block *nf
 		deactivate_task(rq, rq->idle, 0);
 		rq->idle->static_prio = MAX_PRIO;
 		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
+		rq->idle->sched_class = &idle_sched_class;
 		migrate_dead_tasks(cpu);
 		task_rq_unlock(rq, &flags);
 		migrate_nr_uninterruptible(rq);
@@ -5841,7 +5893,6 @@ static void init_sched_groups_power(int 
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
-	struct sched_domain *sd;
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	int sd_allnodes = 0;
@@ -5986,6 +6037,7 @@ static int build_sched_domains(const cpu
 		sched_group_nodes[i] = sg;
 		for_each_cpu_mask(j, nodemask) {
 			struct sched_domain *sd;
+
 			sd = &per_cpu(node_domains, j);
 			sd->groups = sg;
 		}
@@ -6030,19 +6082,22 @@ static int build_sched_domains(const cpu
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		sd = &per_cpu(cpu_domains, i);
+		struct sched_domain *sd = &per_cpu(cpu_domains, i);
+
 		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu_mask(i, *cpu_map) {
-		sd = &per_cpu(core_domains, i);
+		struct sched_domain *sd = &per_cpu(core_domains, i);
+
 		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu_mask(i, *cpu_map) {
-		sd = &per_cpu(phys_domains, i);
+		struct sched_domain *sd = &per_cpu(phys_domains, i);
+
 		init_sched_groups_power(i, sd);
 	}
 
@@ -6270,8 +6325,18 @@ int in_sched_functions(unsigned long add
 		&& addr < (unsigned long)__sched_text_end);
 }
 
+static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+{
+	cfs_rq->tasks_timeline = RB_ROOT;
+	cfs_rq->fair_clock = 1;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->rq = rq;
+#endif
+}
+
 void __init sched_init(void)
 {
+	u64 now = sched_clock();
 	int highest_cpu = 0;
 	int i, j;
 
@@ -6290,10 +6355,14 @@ void __init sched_init(void)
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
-		rq->cfs.tasks_timeline = RB_ROOT;
-		rq->clock = rq->cfs.fair_clock = 1;
-		rq->ls.load_update_last = sched_clock();
-		rq->ls.load_update_start = sched_clock();
+		rq->clock = 1;
+		init_cfs_rq(&rq->cfs, rq);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+		list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+#endif
+		rq->ls.load_update_last = now;
+		rq->ls.load_update_start = now;
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
diff -puN kernel/sched_debug.c~cfs-v19 kernel/sched_debug.c
--- a/kernel/sched_debug.c~cfs-v19
+++ a/kernel/sched_debug.c
@@ -80,15 +80,17 @@ static void print_rq(struct seq_file *m,
 	read_unlock_irq(&tasklist_lock);
 }
 
-static void print_rq_runtime_sum(struct seq_file *m, struct rq *rq)
+static void
+print_cfs_rq_runtime_sum(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 {
 	s64 wait_runtime_rq_sum = 0;
 	struct task_struct *p;
 	struct rb_node *curr;
 	unsigned long flags;
+	struct rq *rq = &per_cpu(runqueues, cpu);
 
 	spin_lock_irqsave(&rq->lock, flags);
-	curr = first_fair(&rq->cfs);
+	curr = first_fair(cfs_rq);
 	while (curr) {
 		p = rb_entry(curr, struct task_struct, se.run_node);
 		wait_runtime_rq_sum += p->se.wait_runtime;
@@ -101,6 +103,24 @@ static void print_rq_runtime_sum(struct 
 		(long long)wait_runtime_rq_sum);
 }
 
+void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq, u64 now)
+{
+	SEQ_printf(m, "\ncfs_rq %p\n", cfs_rq);
+
+#define P(x) \
+	SEQ_printf(m, "  .%-30s: %Ld\n", #x, (long long)(cfs_rq->x))
+
+	P(fair_clock);
+	P(exec_clock);
+	P(wait_runtime);
+	P(wait_runtime_overruns);
+	P(wait_runtime_underruns);
+	P(sleeper_bonus);
+#undef P
+
+	print_cfs_rq_runtime_sum(m, cpu, cfs_rq);
+}
+
 static void print_cpu(struct seq_file *m, int cpu, u64 now)
 {
 	struct rq *rq = &per_cpu(runqueues, cpu);
@@ -136,18 +156,14 @@ static void print_cpu(struct seq_file *m
 	P(clock_overflows);
 	P(clock_unstable_events);
 	P(clock_max_delta);
-	P(cfs.fair_clock);
-	P(cfs.exec_clock);
-	P(cfs.wait_runtime);
-	P(cfs.wait_runtime_overruns);
-	P(cfs.wait_runtime_underruns);
 	P(cpu_load[0]);
 	P(cpu_load[1]);
 	P(cpu_load[2]);
 	P(cpu_load[3]);
 	P(cpu_load[4]);
 #undef P
-	print_rq_runtime_sum(m, rq);
+
+	print_cfs_stats(m, cpu, now);
 
 	print_rq(m, rq, cpu, now);
 }
@@ -157,7 +173,7 @@ static int sched_debug_show(struct seq_f
 	u64 now = ktime_to_ns(ktime_get());
 	int cpu;
 
-	SEQ_printf(m, "Sched Debug Version: v0.03, cfs-v18, %s %.*s\n",
+	SEQ_printf(m, "Sched Debug Version: v0.04, cfs-v19, %s %.*s\n",
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
diff -puN kernel/sched_fair.c~cfs-v19 kernel/sched_fair.c
--- a/kernel/sched_fair.c~cfs-v19
+++ a/kernel/sched_fair.c
@@ -61,8 +61,25 @@ unsigned int sysctl_sched_stat_granulari
  */
 unsigned int sysctl_sched_runtime_limit __read_mostly;
 
+/*
+ * Debugging: various feature bits
+ */
+enum {
+	SCHED_FEAT_FAIR_SLEEPERS	= 1,
+	SCHED_FEAT_SLEEPER_AVG		= 2,
+	SCHED_FEAT_SLEEPER_LOAD_AVG	= 4,
+	SCHED_FEAT_PRECISE_CPU_LOAD	= 8,
+	SCHED_FEAT_START_DEBIT		= 16,
+	SCHED_FEAT_SKIP_INITIAL		= 32,
+};
+
 unsigned int sysctl_sched_features __read_mostly =
-			0 | 2 | 4 | 8 | 0 | 0 | 0 | 0;
+		SCHED_FEAT_FAIR_SLEEPERS	*1 |
+		SCHED_FEAT_SLEEPER_AVG		*1 |
+		SCHED_FEAT_SLEEPER_LOAD_AVG	*1 |
+		SCHED_FEAT_PRECISE_CPU_LOAD	*1 |
+		SCHED_FEAT_START_DEBIT		*1 |
+		SCHED_FEAT_SKIP_INITIAL		*0;
 
 extern struct sched_class fair_sched_class;
 
@@ -70,6 +87,31 @@ extern struct sched_class fair_sched_cla
  * CFS operations on generic schedulable entities:
  */
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* cpu runqueue to which this cfs_rq is attached */
+static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->rq;
+}
+
+/* currently running entity (if any) on this cfs_rq */
+static inline struct sched_entity *cfs_rq_curr(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->curr;
+}
+
+/* An entity is a task if it doesn't "own" a runqueue */
+#define entity_is_task(se)	(!se->my_q)
+
+static inline void
+set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	cfs_rq->curr = se;
+}
+
+#else	/* CONFIG_FAIR_GROUP_SCHED */
+
 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 {
 	return container_of(cfs_rq, struct rq, cfs);
@@ -87,6 +129,11 @@ static inline struct sched_entity *cfs_r
 
 #define entity_is_task(se)	1
 
+static inline void
+set_cfs_rq_curr(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
 static inline struct task_struct *task_of(struct sched_entity *se)
 {
 	return container_of(se, struct task_struct, se);
@@ -119,7 +166,7 @@ __enqueue_entity(struct cfs_rq *cfs_rq, 
 		 * We dont care about collisions. Nodes with
 		 * the same key stay together.
 		 */
-		if ((s64)(key - entry->fair_key) < 0) {
+		if (key - entry->fair_key < 0) {
 			link = &parent->rb_left;
 		} else {
 			link = &parent->rb_right;
@@ -145,7 +192,7 @@ static inline void
 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	if (cfs_rq->rb_leftmost == &se->run_node)
-		cfs_rq->rb_leftmost = NULL;
+		cfs_rq->rb_leftmost = rb_next(&se->run_node);
 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	cfs_rq->nr_running--;
@@ -154,10 +201,6 @@ __dequeue_entity(struct cfs_rq *cfs_rq, 
 
 static inline struct rb_node * first_fair(struct cfs_rq *cfs_rq)
 {
-	if (cfs_rq->rb_leftmost)
-		return cfs_rq->rb_leftmost;
-	/* Cache the value returned by rb_first() */
-	cfs_rq->rb_leftmost = rb_first(&cfs_rq->tasks_timeline);
 	return cfs_rq->rb_leftmost;
 }
 
@@ -174,7 +217,7 @@ static struct sched_entity * __pick_next
  * We rescale the rescheduling granularity of tasks according to their
  * nice level, but only linearly, not exponentially:
  */
-static s64
+static long
 niced_granularity(struct sched_entity *curr, unsigned long granularity)
 {
 	u64 tmp;
@@ -182,20 +225,24 @@ niced_granularity(struct sched_entity *c
 	/*
 	 * Negative nice levels get the same granularity as nice-0:
 	 */
-	if (curr->load.weight >= NICE_0_LOAD)
+	if (likely(curr->load.weight >= NICE_0_LOAD))
 		return granularity;
 	/*
 	 * Positive nice level tasks get linearly finer
 	 * granularity:
 	 */
 	tmp = curr->load.weight * (u64)granularity;
-	return (s64) (tmp >> NICE_0_SHIFT);
+
+	/*
+	 * It will always fit into 'long':
+	 */
+	return (long) (tmp >> NICE_0_SHIFT);
 }
 
 static inline void
 limit_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	s64 limit = sysctl_sched_runtime_limit;
+	long limit = sysctl_sched_runtime_limit;
 
 	/*
 	 * Niced tasks have the same history dynamic range as
@@ -214,7 +261,7 @@ limit_wait_runtime(struct cfs_rq *cfs_rq
 }
 
 static inline void
-__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, s64 delta)
+__add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
 {
 	se->wait_runtime += delta;
 	schedstat_add(se, sum_wait_runtime, delta);
@@ -222,7 +269,7 @@ __add_wait_runtime(struct cfs_rq *cfs_rq
 }
 
 static void
-add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, s64 delta)
+add_wait_runtime(struct cfs_rq *cfs_rq, struct sched_entity *se, long delta)
 {
 	schedstat_add(cfs_rq, wait_runtime, -se->wait_runtime);
 	__add_wait_runtime(cfs_rq, se, delta);
@@ -236,33 +283,35 @@ add_wait_runtime(struct cfs_rq *cfs_rq, 
 static inline void
 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
 {
+	unsigned long delta, delta_exec, delta_fair;
+	long delta_mine;
 	struct load_weight *lw = &cfs_rq->load;
-	u64 delta_exec, delta_fair, delta_mine;
-	struct rq *rq = rq_of(cfs_rq);
-	struct task_struct *curtask = rq->curr;
 	unsigned long load = lw->weight;
 
 	if (unlikely(!load))
 		return;
 
 	delta_exec = curr->delta_exec;
+#ifdef CONFIG_SCHEDSTATS
 	if (unlikely(delta_exec > curr->exec_max))
 		curr->exec_max = delta_exec;
+#endif
 
 	curr->sum_exec_runtime += delta_exec;
 	cfs_rq->exec_clock += delta_exec;
 
 	delta_fair = calc_delta_fair(delta_exec, lw);
+	delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
 
-	/*
-	 * Task already marked for preemption, do not burden
-	 * it with the cost of not having left the CPU yet:
-	 */
-	if (unlikely(sysctl_sched_features & 1))
-		if (unlikely(test_tsk_thread_flag(curtask, TIF_NEED_RESCHED)))
-			return;
+	if (cfs_rq->sleeper_bonus > sysctl_sched_stat_granularity) {
+		delta = calc_delta_mine(cfs_rq->sleeper_bonus,
+					curr->load.weight, lw);
+		if (unlikely(delta > cfs_rq->sleeper_bonus))
+			delta = cfs_rq->sleeper_bonus;
 
-	delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw);
+		cfs_rq->sleeper_bonus -= delta;
+		delta_mine -= delta;
+	}
 
 	cfs_rq->fair_clock += delta_fair;
 	/*
@@ -285,13 +334,14 @@ static void update_curr(struct cfs_rq *c
 
 	/*
 	 * Get the amount of time the current task was running
-	 * since the last time we changed load:
+	 * since the last time we changed load (this cannot
+	 * overflow on 32 bits):
 	 */
-	delta_exec = now - curr->exec_start;
+	delta_exec = (unsigned long)(now - curr->exec_start);
 
 	curr->delta_exec += delta_exec;
 
-	if (curr->delta_exec > sysctl_sched_stat_granularity) {
+	if (unlikely(curr->delta_exec > sysctl_sched_stat_granularity)) {
 		__update_curr(cfs_rq, curr, now);
 		curr->delta_exec = 0;
 	}
@@ -305,19 +355,10 @@ update_stats_wait_start(struct cfs_rq *c
 	se->wait_start = now;
 }
 
-static inline s64 weight_s64(s64 calc, unsigned long weight, int shift)
-{
-	if (calc < 0) {
-		calc = - calc * weight;
-		return - (calc >> shift);
-	}
-	return (calc * weight) >> shift;
-}
-
 /*
  * Task is being enqueued - update stats:
  */
-static inline void
+static void
 update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
 	s64 key;
@@ -339,15 +380,15 @@ update_stats_enqueue(struct cfs_rq *cfs_
 	if (likely(se->load.weight == NICE_0_LOAD)) {
 		key -= se->wait_runtime;
 	} else {
-		s64 tmp;
+		u64 tmp;
 
 		if (se->wait_runtime < 0) {
 			tmp = -se->wait_runtime;
 			key += (tmp * se->load.inv_weight) >>
 					(WMULT_SHIFT - NICE_0_SHIFT);
 		} else {
-			tmp = se->wait_runtime * se->load.weight;
-			key -= tmp >> NICE_0_SHIFT;
+			tmp = se->wait_runtime;
+			key -= (tmp * se->load.weight) >> NICE_0_SHIFT;
 		}
 	}
 
@@ -360,17 +401,18 @@ update_stats_enqueue(struct cfs_rq *cfs_
 static inline void
 __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
-	s64 delta_fair, delta_wait;
-
-	delta_wait = now - se->wait_start;
-	if (unlikely(delta_wait > se->wait_max))
-		se->wait_max = delta_wait;
+	unsigned long delta_fair = se->delta_fair_run;
 
-	delta_fair = se->delta_fair;
+#ifdef CONFIG_SCHEDSTATS
+	{
+		s64 delta_wait = now - se->wait_start;
+		if (unlikely(delta_wait > se->wait_max))
+			se->wait_max = delta_wait;
+	}
+#endif
 
 	if (unlikely(se->load.weight != NICE_0_LOAD))
-		delta_fair = weight_s64(delta_fair, se->load.weight,
-						NICE_0_SHIFT);
+		delta_fair = (u64)delta_fair * se->load.weight >> NICE_0_SHIFT;
 
 	add_wait_runtime(cfs_rq, se, delta_fair);
 }
@@ -378,12 +420,16 @@ __update_stats_wait_end(struct cfs_rq *c
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
-	s64 delta_fair = cfs_rq->fair_clock - se->wait_start_fair;
+	unsigned long delta_fair;
+
+	delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
+			(u64)(cfs_rq->fair_clock - se->wait_start_fair));
 
-	se->delta_fair += delta_fair;
-	if (unlikely(se->delta_fair >= sysctl_sched_stat_granularity)) {
+	se->delta_fair_run += delta_fair;
+	if (unlikely(abs(se->delta_fair_run) >=
+				sysctl_sched_stat_granularity)) {
 		__update_stats_wait_end(cfs_rq, se, now);
-		se->delta_fair = 0;
+		se->delta_fair_run = 0;
 	}
 
 	se->wait_start_fair = 0;
@@ -423,36 +469,6 @@ update_stats_curr_end(struct cfs_rq *cfs
 	se->exec_start = 0;
 }
 
-/*
- * A task gets added back to the runnable tasks and gets
- * a small credit for the CPU time it missed out on, so
- * fix up all other runnable task's wait_runtime so that
- * the sum stays constant (around 0).
- *
- * Instead of looping over all runnable tasks in an O(N)
- * manner we move the fair clock back by a proportional
- * amount of the new wait_runtime this task adds to the pool.
- */
-static void distribute_fair_add(struct cfs_rq *cfs_rq, s64 delta)
-{
-	struct sched_entity *curr = cfs_rq_curr(cfs_rq);
-	s64 delta_fair = 0;
-
-	if (!(sysctl_sched_features & 2))
-		return;
-
-	if (cfs_rq->nr_running) {
-		delta_fair = div64_s(delta, cfs_rq->nr_running);
-		/*
-		 * The currently running task's next wait_runtime value does
-		 * not depend on the fair_clock, so fix it up explicitly:
-		 */
-		 if (curr)
-			add_wait_runtime(cfs_rq, curr, -delta_fair);
-	}
-	cfs_rq->fair_clock -= delta_fair;
-}
-
 /**************************************************
  * Scheduling class queueing methods:
  */
@@ -460,30 +476,33 @@ static void distribute_fair_add(struct c
 static void
 __enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
-	unsigned long load = cfs_rq->load.weight;
-	s64 delta_fair, prev_runtime;
+	unsigned long load = cfs_rq->load.weight, delta_fair;
+	long prev_runtime;
+
+	if (sysctl_sched_features & SCHED_FEAT_SLEEPER_LOAD_AVG)
+		load = rq_of(cfs_rq)->cpu_load[2];
 
-	delta_fair = cfs_rq->fair_clock - se->sleep_start_fair;
+	delta_fair = se->delta_fair_sleep;
 
 	/*
 	 * Fix up delta_fair with the effect of us running
 	 * during the whole sleep period:
 	 */
-	if (!(sysctl_sched_features & 32))
-		delta_fair = div64_s(delta_fair * load, load + se->load.weight);
+	if (sysctl_sched_features & SCHED_FEAT_SLEEPER_AVG)
+		delta_fair = div64_likely32((u64)delta_fair * load,
+						load + se->load.weight);
 
-	delta_fair = weight_s64(delta_fair, se->load.weight, NICE_0_SHIFT);
+	if (unlikely(se->load.weight != NICE_0_LOAD))
+		delta_fair = (u64)delta_fair * se->load.weight >> NICE_0_SHIFT;
 
 	prev_runtime = se->wait_runtime;
 	__add_wait_runtime(cfs_rq, se, delta_fair);
 	delta_fair = se->wait_runtime - prev_runtime;
 
 	/*
-	 * We move the fair clock back by a load-proportional
-	 * amount of the new wait_runtime this task adds to
-	 * the 'pool':
+	 * Track the amount of bonus we've given to sleepers:
 	 */
-	distribute_fair_add(cfs_rq, delta_fair);
+	cfs_rq->sleeper_bonus += delta_fair;
 
 	schedstat_add(cfs_rq, wait_runtime, se->wait_runtime);
 }
@@ -492,17 +511,20 @@ static void
 enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
 	struct task_struct *tsk = task_of(se);
-	s64 delta_fair;
+	unsigned long delta_fair;
 
 	if ((entity_is_task(se) && tsk->policy == SCHED_BATCH) ||
-						 !(sysctl_sched_features & 4))
+			 !(sysctl_sched_features & SCHED_FEAT_FAIR_SLEEPERS))
 		return;
 
-	delta_fair = cfs_rq->fair_clock - se->sleep_start_fair;
-	se->delta_fair += delta_fair;
-	if (unlikely(se->delta_fair >= sysctl_sched_stat_granularity)) {
+	delta_fair = (unsigned long)min((u64)(2*sysctl_sched_runtime_limit),
+		(u64)(cfs_rq->fair_clock - se->sleep_start_fair));
+
+	se->delta_fair_sleep += delta_fair;
+	if (unlikely(abs(se->delta_fair_sleep) >=
+				sysctl_sched_stat_granularity)) {
 		__enqueue_sleeper(cfs_rq, se, now);
-		se->delta_fair = 0;
+		se->delta_fair_sleep = 0;
 	}
 
 	se->sleep_start_fair = 0;
@@ -535,8 +557,9 @@ enqueue_sleeper(struct cfs_rq *cfs_rq, s
 #endif
 }
 
-static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-							 int wakeup, u64 now)
+static void
+enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+	       int wakeup, u64 now)
 {
 	/*
 	 * Update the fair clock.
@@ -550,8 +573,9 @@ static void enqueue_entity(struct cfs_rq
 	__enqueue_entity(cfs_rq, se);
 }
 
-static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-							 int sleep, u64 now)
+static void
+dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+	       int sleep, u64 now)
 {
 	update_stats_dequeue(cfs_rq, se, now);
 	if (sleep) {
@@ -574,7 +598,7 @@ static void dequeue_entity(struct cfs_rq
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static inline void
+static void
 __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se,
 			  struct sched_entity *curr, unsigned long granularity)
 {
@@ -589,10 +613,9 @@ __check_preempt_curr_fair(struct cfs_rq 
 		resched_task(rq_of(cfs_rq)->curr);
 }
 
-static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
+static inline void
+set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
 {
-	struct sched_entity *se = __pick_next_entity(cfs_rq);
-
 	/*
 	 * Any task has to be enqueued before it get to execute on
 	 * a CPU. So account for the time it spent waiting on the
@@ -602,6 +625,14 @@ static struct sched_entity * pick_next_e
 	 */
 	update_stats_wait_end(cfs_rq, se, now);
 	update_stats_curr_start(cfs_rq, se, now);
+	set_cfs_rq_curr(cfs_rq, se);
+}
+
+static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq, u64 now)
+{
+	struct sched_entity *se = __pick_next_entity(cfs_rq);
+
+	set_next_entity(cfs_rq, se, now);
 
 	return se;
 }
@@ -609,42 +640,24 @@ static struct sched_entity * pick_next_e
 static void
 put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev, u64 now)
 {
-	int updated = 0;
-
-	/*
-	 * If the task is still waiting for the CPU (it just got
-	 * preempted), update its position within the tree and
-	 * start the wait period:
-	 */
-	if ((sysctl_sched_features & 16) && entity_is_task(prev))  {
-		struct task_struct *prevtask = task_of(prev);
-
-		if (prev->on_rq &&
-			test_tsk_thread_flag(prevtask, TIF_NEED_RESCHED)) {
-
-			dequeue_entity(cfs_rq, prev, 0, now);
-			enqueue_entity(cfs_rq, prev, 0, now);
-			updated = 1;
-		}
-	}
-
 	/*
 	 * If still on the runqueue then deactivate_task()
 	 * was not called and update_curr() has to be done:
 	 */
-	if (prev->on_rq && !updated)
+	if (prev->on_rq)
 		update_curr(cfs_rq, now);
 
 	update_stats_curr_end(cfs_rq, prev, now);
 
 	if (prev->on_rq)
 		update_stats_wait_start(cfs_rq, prev, now);
+	set_cfs_rq_curr(cfs_rq, NULL);
 }
 
 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-	struct sched_entity *next;
 	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *next;
 	u64 now = __rq_clock(rq);
 
 	/*
@@ -661,16 +674,6 @@ static void entity_tick(struct cfs_rq *c
 	if (next == curr)
 		return;
 
-	if (entity_is_task(curr)) {
-		struct task_struct *curtask = task_of(curr),
-				   *nexttask = task_of(next);
-
-		if ((rt_prio(nexttask->prio) &&
-				(nexttask->prio < curtask->prio))) {
-			resched_task(curtask);
-			return;
-		}
-	}
 	__check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity);
 }
 
@@ -678,11 +681,90 @@ static void entity_tick(struct cfs_rq *c
  * CFS operations on tasks:
  */
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/* Walk up scheduling entities hierarchy */
+#define for_each_sched_entity(se) \
+		for (; se; se = se->parent)
+
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return p->se.cfs_rq;
+}
+
+/* runqueue on which this entity is (to be) queued */
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	return se->cfs_rq;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return grp->my_q;
+}
+
+/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
+ * another cpu ('this_cpu')
+ */
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	/* A later patch will take group into account */
+	return &cpu_rq(this_cpu)->cfs;
+}
+
+/* Iterate thr' all leaf cfs_rq's on a runqueue */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+	list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+
+/* Do the two (enqueued) tasks belong to the same group ? */
+static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
+{
+	if (curr->se.cfs_rq == p->se.cfs_rq)
+		return 1;
+
+	return 0;
+}
+
+#else	/* CONFIG_FAIR_GROUP_SCHED */
+
+#define for_each_sched_entity(se) \
+		for (; se; se = NULL)
+
 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 {
 	return &task_rq(p)->cfs;
 }
 
+static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
+{
+	struct task_struct *p = task_of(se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->cfs;
+}
+
+/* runqueue "owned" by this group */
+static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
+{
+	return NULL;
+}
+
+static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
+{
+	return &cpu_rq(this_cpu)->cfs;
+}
+
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
+
+static inline int is_same_group(struct task_struct *curr, struct task_struct *p)
+{
+	return 1;
+}
+
+#endif	/* CONFIG_FAIR_GROUP_SCHED */
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -691,10 +773,15 @@ static inline struct cfs_rq *task_cfs_rq
 static void
 enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 
-	enqueue_entity(cfs_rq, se, wakeup, now);
+	for_each_sched_entity(se) {
+		if (se->on_rq)
+			break;
+		cfs_rq = cfs_rq_of(se);
+		enqueue_entity(cfs_rq, se, wakeup, now);
+	}
 }
 
 /*
@@ -705,30 +792,32 @@ enqueue_task_fair(struct rq *rq, struct 
 static void
 dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep, u64 now)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 
-	dequeue_entity(cfs_rq, se, sleep, now);
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		dequeue_entity(cfs_rq, se, sleep, now);
+		/* Don't dequeue parent if it has other entities besides us */
+		if (cfs_rq->load.weight)
+			break;
+	}
 }
 
 /*
- * sched_yield() support is very simple via the rbtree: we just
- * dequeue the task and move it after the next task, which
- * causes tasks to roundrobin.
+ * sched_yield() support is very simple - we dequeue and enqueue
  */
 static void yield_task_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
-	struct sched_entity *se = &p->se;
-	struct rb_node *first, *curr, *next;
+	u64 now = __rq_clock(rq);
 
-	curr = &se->run_node;
-	next = rb_next(curr);
-	first = rb_first(&cfs_rq->tasks_timeline);
-	if ((first == curr) && next)
-		cfs_rq->rb_leftmost = next;
-	else
-		cfs_rq->rb_leftmost = first;
+	/*
+	 * Dequeue and enqueue the task to update its
+	 * position within the tree:
+	 */
+	dequeue_entity(cfs_rq, &p->se, 0, now);
+	enqueue_entity(cfs_rq, &p->se, 0, now);
 }
 
 /*
@@ -741,10 +830,7 @@ static void check_preempt_curr_fair(stru
 	unsigned long gran;
 
 	if (unlikely(rt_prio(p->prio))) {
-		if (sysctl_sched_features & 8) {
-			if (rt_prio(p->prio))
-				update_curr(cfs_rq, rq_clock(rq));
-		}
+		update_curr(cfs_rq, rq_clock(rq));
 		resched_task(curr);
 		return;
 	}
@@ -756,7 +842,8 @@ static void check_preempt_curr_fair(stru
 	if (unlikely(p->policy == SCHED_BATCH))
 		gran = sysctl_sched_batch_wakeup_granularity;
 
-	__check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
+	if (is_same_group(curr, p))
+		__check_preempt_curr_fair(cfs_rq, &p->se, &curr->se, gran);
 }
 
 static struct task_struct * pick_next_task_fair(struct rq *rq, u64 now)
@@ -767,7 +854,10 @@ static struct task_struct * pick_next_ta
 	if (unlikely(!cfs_rq->nr_running))
 		return NULL;
 
-	se = pick_next_entity(cfs_rq, now);
+	do {
+		se = pick_next_entity(cfs_rq, now);
+		cfs_rq = group_cfs_rq(se);
+	} while (cfs_rq);
 
 	return task_of(se);
 }
@@ -777,7 +867,13 @@ static struct task_struct * pick_next_ta
  */
 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, u64 now)
 {
-	put_prev_entity(task_cfs_rq(prev), &prev->se, now);
+	struct sched_entity *se = &prev->se;
+	struct cfs_rq *cfs_rq;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		put_prev_entity(cfs_rq, se, now);
+	}
 }
 
 /**************************************************
@@ -792,7 +888,7 @@ static void put_prev_task_fair(struct rq
  * the current task:
  */
 static inline struct task_struct *
-__load_balance_iterator(struct rq *rq, struct rb_node *curr)
+__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
 {
 	struct task_struct *p;
 
@@ -800,19 +896,104 @@ __load_balance_iterator(struct rq *rq, s
 		return NULL;
 
 	p = rb_entry(curr, struct task_struct, se.run_node);
-	rq->cfs.rb_load_balance_curr = rb_next(curr);
+	cfs_rq->rb_load_balance_curr = rb_next(curr);
 
 	return p;
 }
 
-static struct task_struct * load_balance_start_fair(struct rq *rq)
+static struct task_struct *load_balance_start_fair(void *arg)
 {
-	return __load_balance_iterator(rq, first_fair(&rq->cfs));
+	struct cfs_rq *cfs_rq = arg;
+
+	return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
 }
 
-static struct task_struct * load_balance_next_fair(struct rq *rq)
+static struct task_struct *load_balance_next_fair(void *arg)
 {
-	return __load_balance_iterator(rq, rq->cfs.rb_load_balance_curr);
+	struct cfs_rq *cfs_rq = arg;
+
+	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
+}
+
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+{
+	struct sched_entity *curr;
+	struct task_struct *p;
+
+	if (!cfs_rq->nr_running)
+		return MAX_PRIO;
+
+	curr = __pick_next_entity(cfs_rq);
+	p = task_of(curr);
+
+	return p->prio;
+}
+
+static int
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+			unsigned long max_nr_move, unsigned long max_load_move,
+			struct sched_domain *sd, enum cpu_idle_type idle,
+			int *all_pinned, unsigned long *total_load_moved)
+{
+	struct cfs_rq *busy_cfs_rq;
+	unsigned long load_moved, total_nr_moved = 0, nr_moved;
+	long rem_load_move = max_load_move;
+	struct rq_iterator cfs_rq_iterator;
+
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+
+	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+		struct cfs_rq *this_cfs_rq;
+		long imbalance;
+		unsigned long maxload;
+		int this_best_prio, best_prio, best_prio_seen = 0;
+
+		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+
+		imbalance = busy_cfs_rq->load.weight -
+						 this_cfs_rq->load.weight;
+		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+		if (imbalance <= 0)
+			continue;
+
+		/* Don't pull more than imbalance/2 */
+		imbalance /= 2;
+		maxload = min(rem_load_move, imbalance);
+
+		this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+		best_prio = cfs_rq_best_prio(busy_cfs_rq);
+
+		/*
+		 * Enable handling of the case where there is more than one task
+		 * with the best priority. If the current running task is one
+		 * of those with prio==best_prio we know it won't be moved
+		 * and therefore it's safe to override the skip (based on load)
+		 * of any task we find with that prio.
+		 */
+		if (cfs_rq_curr(busy_cfs_rq) == &busiest->curr->se)
+			best_prio_seen = 1;
+
+		/* pass busy_cfs_rq argument into
+		 * load_balance_[start|next]_fair iterators
+		 */
+		cfs_rq_iterator.arg = busy_cfs_rq;
+		nr_moved = balance_tasks(this_rq, this_cpu, busiest,
+				max_nr_move, maxload, sd, idle, all_pinned,
+				&load_moved, this_best_prio, best_prio,
+				best_prio_seen, &cfs_rq_iterator);
+
+		total_nr_moved += nr_moved;
+		max_nr_move -= nr_moved;
+		rem_load_move -= load_moved;
+
+		if (max_nr_move <= 0 || rem_load_move <= 0)
+			break;
+	}
+
+	*total_load_moved = max_load_move - rem_load_move;
+
+	return total_nr_moved;
 }
 
 /*
@@ -820,7 +1001,13 @@ static struct task_struct * load_balance
  */
 static void task_tick_fair(struct rq *rq, struct task_struct *curr)
 {
-	entity_tick(task_cfs_rq(curr), &curr->se);
+	struct cfs_rq *cfs_rq;
+	struct sched_entity *se = &curr->se;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		entity_tick(cfs_rq, se);
+	}
 }
 
 /*
@@ -844,26 +1031,50 @@ static void task_new_fair(struct rq *rq,
 	 * until it reschedules once. We set up the key so that
 	 * it will preempt the parent:
 	 */
-	p->se.fair_key = current->se.fair_key - niced_granularity(&rq->curr->se,
-						sysctl_sched_granularity) - 1;
+	p->se.fair_key = current->se.fair_key -
+		niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1;
 	/*
 	 * The first wait is dominated by the child-runs-first logic,
 	 * so do not credit it with that waiting time yet:
 	 */
-	if (sysctl_sched_features & 256)
+	if (sysctl_sched_features & SCHED_FEAT_SKIP_INITIAL)
 		p->se.wait_start_fair = 0;
 
 	/*
 	 * The statistical average of wait_runtime is about
 	 * -granularity/2, so initialize the task with that:
 	 */
-	if (sysctl_sched_features & 128)
-		p->se.wait_runtime = -(s64)(sysctl_sched_granularity / 2);
+	if (sysctl_sched_features & SCHED_FEAT_START_DEBIT)
+		p->se.wait_runtime = -(sysctl_sched_granularity / 2);
 
 	__enqueue_entity(cfs_rq, se);
 	inc_nr_running(p, rq, now);
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set cfs_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_fair(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	struct sched_entity *se = &curr->se;
+	u64 now = rq_clock(rq);
+	struct cfs_rq *cfs_rq;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		set_next_entity(cfs_rq, se, now);
+	}
+}
+#else
+static void set_curr_task_fair(struct rq *rq)
+{
+}
+#endif
+
 /*
  * All the scheduling class methods:
  */
@@ -877,8 +1088,20 @@ struct sched_class fair_sched_class __re
 	.pick_next_task		= pick_next_task_fair,
 	.put_prev_task		= put_prev_task_fair,
 
-	.load_balance_start	= load_balance_start_fair,
-	.load_balance_next	= load_balance_next_fair,
+	.load_balance		= load_balance_fair,
+
+	.set_curr_task          = set_curr_task_fair,
 	.task_tick		= task_tick_fair,
 	.task_new		= task_new_fair,
 };
+
+#ifdef CONFIG_SCHED_DEBUG
+void print_cfs_stats(struct seq_file *m, int cpu, u64 now)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct cfs_rq *cfs_rq;
+
+	for_each_leaf_cfs_rq(rq, cfs_rq)
+		print_cfs_rq(m, cpu, cfs_rq, now);
+}
+#endif
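
( Illustrative aside, not part of the patch: the do/while loop added to
  pick_next_task_fair() above walks down the group hierarchy -- it picks
  the leftmost entity of the current cfs_rq and, if that entity is itself
  a group, descends into the group's runqueue until a task-level entity
  is reached. Below is a minimal, self-contained toy model of that walk;
  the struct names mirror the kernel's, but the fields, helpers and
  main() setup are made up for illustration only. )

/* Toy model of the hierarchical pick in pick_next_task_fair(). */
#include <stdio.h>

struct cfs_rq;

struct sched_entity {
	const char *name;
	struct cfs_rq *my_q;		/* non-NULL if this entity is a group */
};

struct cfs_rq {
	struct sched_entity *best;	/* stand-in for the leftmost rbtree entity */
};

static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	return cfs_rq->best;
}

/* Mirrors the do/while loop added in the hunk above. */
static struct sched_entity *pick_hierarchical(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se;

	do {
		se = pick_next_entity(cfs_rq);
		cfs_rq = se->my_q;	/* group_cfs_rq(se) in the patch */
	} while (cfs_rq);

	return se;
}

int main(void)
{
	struct sched_entity task = { "task-A", NULL };
	struct cfs_rq group_q = { &task };
	struct sched_entity group = { "group-1", &group_q };
	struct cfs_rq top_q = { &group };

	printf("picked: %s\n", pick_hierarchical(&top_q)->name);
	return 0;
}
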
diff -puN kernel/sched_idletask.c~cfs-v19 kernel/sched_idletask.c
--- a/kernel/sched_idletask.c~cfs-v19
+++ a/kernel/sched_idletask.c
@@ -37,9 +37,13 @@ static void put_prev_task_idle(struct rq
 {
 }
 
-static struct task_struct *load_balance_start_idle(struct rq *rq)
+static int
+load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
+			unsigned long max_nr_move, unsigned long max_load_move,
+			struct sched_domain *sd, enum cpu_idle_type idle,
+			int *all_pinned, unsigned long *total_load_moved)
 {
-	return NULL;
+	return 0;
 }
 
 static void task_tick_idle(struct rq *rq, struct task_struct *curr)
@@ -49,7 +53,7 @@ static void task_tick_idle(struct rq *rq
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
-struct sched_class idle_sched_class __read_mostly = {
+static struct sched_class idle_sched_class __read_mostly = {
 	/* no enqueue/yield_task for idle tasks */
 
 	/* dequeue is not valid, we print a debug message there: */
@@ -60,8 +64,7 @@ struct sched_class idle_sched_class __re
 	.pick_next_task		= pick_next_task_idle,
 	.put_prev_task		= put_prev_task_idle,
 
-	.load_balance_start	= load_balance_start_idle,
-	/* no .load_balance_next for idle tasks */
+	.load_balance		= load_balance_idle,
 
 	.task_tick		= task_tick_idle,
 	/* no .task_new for idle tasks */
diff -puN kernel/sched_rt.c~cfs-v19 kernel/sched_rt.c
--- a/kernel/sched_rt.c~cfs-v19
+++ a/kernel/sched_rt.c
@@ -12,7 +12,7 @@ static inline void update_curr_rt(struct
 	struct task_struct *curr = rq->curr;
 	u64 delta_exec;
 
-	if (!has_rt_policy(curr))
+	if (!task_has_rt_policy(curr))
 		return;
 
 	delta_exec = now - curr->se.exec_start;
@@ -107,8 +107,9 @@ static void put_prev_task_rt(struct rq *
  * achieve that by always pre-iterating before returning
  * the current task:
  */
-static struct task_struct * load_balance_start_rt(struct rq *rq)
+static struct task_struct *load_balance_start_rt(void *arg)
 {
+	struct rq *rq = arg;
 	struct prio_array *array = &rq->rt.active;
 	struct list_head *head, *curr;
 	struct task_struct *p;
@@ -132,8 +133,9 @@ static struct task_struct * load_balance
 	return p;
 }
 
-static struct task_struct * load_balance_next_rt(struct rq *rq)
+static struct task_struct *load_balance_next_rt(void *arg)
 {
+	struct rq *rq = arg;
 	struct prio_array *array = &rq->rt.active;
 	struct list_head *head, *curr;
 	struct task_struct *p;
@@ -170,6 +172,44 @@ static struct task_struct * load_balance
 	return p;
 }
 
+static int
+load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
+			unsigned long max_nr_move, unsigned long max_load_move,
+			struct sched_domain *sd, enum cpu_idle_type idle,
+			int *all_pinned, unsigned long *load_moved)
+{
+	int this_best_prio, best_prio, best_prio_seen = 0;
+	int nr_moved;
+	struct rq_iterator rt_rq_iterator;
+
+	best_prio = sched_find_first_bit(busiest->rt.active.bitmap);
+	this_best_prio = sched_find_first_bit(this_rq->rt.active.bitmap);
+
+	/*
+	 * Enable handling of the case where there is more than one task
+	 * with the best priority.   If the current running task is one
+	 * of those with prio==best_prio we know it won't be moved
+	 * and therefore it's safe to override the skip (based on load)
+	 * of any task we find with that prio.
+	 */
+	if (busiest->curr->prio == best_prio)
+		best_prio_seen = 1;
+
+	rt_rq_iterator.start = load_balance_start_rt;
+	rt_rq_iterator.next = load_balance_next_rt;
+	/* pass 'busiest' rq argument into
+	 * load_balance_[start|next]_rt iterators
+	 */
+	rt_rq_iterator.arg = busiest;
+
+	nr_moved = balance_tasks(this_rq, this_cpu, busiest, max_nr_move,
+			max_load_move, sd, idle, all_pinned, load_moved,
+			this_best_prio, best_prio, best_prio_seen,
+			&rt_rq_iterator);
+
+	return nr_moved;
+}
+
 static void task_tick_rt(struct rq *rq, struct task_struct *p)
 {
 	/*
@@ -179,13 +219,14 @@ static void task_tick_rt(struct rq *rq, 
 	if (p->policy != SCHED_RR)
 		return;
 
-	if (!(--p->time_slice)) {
-		p->time_slice = static_prio_timeslice(p->static_prio);
-		set_tsk_need_resched(p);
+	if (--p->time_slice)
+		return;
 
-		/* put it at the end of the queue: */
-		requeue_task_rt(rq, p);
-	}
+	p->time_slice = static_prio_timeslice(p->static_prio);
+	set_tsk_need_resched(p);
+
+	/* put it at the end of the queue: */
+	requeue_task_rt(rq, p);
 }
 
 /*
@@ -207,8 +248,7 @@ static struct sched_class rt_sched_class
 	.pick_next_task		= pick_next_task_rt,
 	.put_prev_task		= put_prev_task_rt,
 
-	.load_balance_start	= load_balance_start_rt,
-	.load_balance_next	= load_balance_next_rt,
+	.load_balance		= load_balance_rt,
 
 	.task_tick		= task_tick_rt,
 	.task_new		= task_new_rt,
diff -puN lib/Kconfig.debug~cfs-v19 lib/Kconfig.debug
--- a/lib/Kconfig.debug~cfs-v19
+++ a/lib/Kconfig.debug
@@ -105,6 +105,15 @@ config DETECT_SOFTLOCKUP
 	   can be detected via the NMI-watchdog, on platforms that
 	   support it.)
 
+config SCHED_DEBUG
+	bool "Collect scheduler debugging info"
+	depends on DEBUG_KERNEL && PROC_FS
+	default y
+	help
+	  If you say Y here, the /proc/sched_debug file will be provided
+	  that can help debug the scheduler. The runtime overhead of this
+	  option is minimal.
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
diff -puN Makefile~cfs-v19 Makefile
_
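
( Another illustrative aside, not part of the patch: the new per-class
  ->load_balance() methods above all drive the generic balance_tasks()
  walker through a struct rq_iterator -- .start/.next callbacks plus an
  opaque .arg -- which is what lets the fair class iterate per-group
  runqueues and the RT class iterate its priority array through one
  interface. The sketch below is a compilable toy model of that callback
  pattern; the toy_* names, the simplified "runqueue" and toy_balance()
  are made up for illustration, only the three-member shape of
  struct rq_iterator follows the patch, with task pointers replaced by
  name strings. )

#include <stdio.h>
#include <stddef.h>

struct rq_iterator {
	void *arg;				/* opaque per-class cookie */
	const char *(*start)(void *arg);	/* first candidate task */
	const char *(*next)(void *arg);		/* subsequent candidates */
};

/* Toy per-group "runqueue": a NULL-terminated list of task names. */
struct toy_rq {
	const char **tasks;
	size_t cursor;
};

static const char *toy_start(void *arg)
{
	struct toy_rq *rq = arg;

	rq->cursor = 0;
	return rq->tasks[0];
}

static const char *toy_next(void *arg)
{
	struct toy_rq *rq = arg;

	if (!rq->tasks[rq->cursor])
		return NULL;
	return rq->tasks[++rq->cursor];
}

/* Stand-in for balance_tasks(): pull up to max_nr_move tasks. */
static int toy_balance(struct rq_iterator *it, int max_nr_move)
{
	const char *p;
	int moved = 0;

	for (p = it->start(it->arg); p && moved < max_nr_move;
	     p = it->next(it->arg)) {
		printf("pulling %s\n", p);
		moved++;
	}
	return moved;
}

int main(void)
{
	const char *names[] = { "loop0", "loop1", "loop2", NULL };
	struct toy_rq group = { names, 0 };
	struct rq_iterator it;

	it.start = toy_start;
	it.next = toy_next;
	it.arg = &group;	/* like cfs_rq_iterator.arg = busy_cfs_rq above */

	printf("moved %d tasks\n", toy_balance(&it, 2));
	return 0;
}
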

Patches currently in -mm which might be from mingo@xxxxxxx are

origin.patch
git-acpi.patch
git-kvm.patch
git-selinux.patch
s390-rename-cpu_idle-to-s390_cpu_idle.patch
x86_64-irq-check-remote-irr-bit-before-migrating-level-triggered-irq-v3.patch
nohz-fix-nohz-x86-dyntick-idle-handling.patch
tick-management-spread-timer-interrupt.patch
highres-improve-debug-output.patch
highres-improve-debug-output-fix.patch
hrtimer-speedup-hrtimer_enqueue.patch
pcspkr-use-the-global-pit-lock.patch
ntp-move-the-cmos-update-code-into-ntpc.patch
ntp-move-the-cmos-update-code-into-ntpc-fix.patch
ntp-move-the-cmos-update-code-into-ntpc-fix-fix.patch
i386-pit-stop-only-when-in-periodic-or-oneshot-mode.patch
i386-remove-volatile-in-apicc.patch
i386-hpet-assumes-boot-cpu-is-0.patch
i386-move-pit-function-declarations-and-constants-to-correct-header-file.patch
x86_64-untangle-asm-hpeth-from-asm-timexh.patch
x86_64-use-generic-cmos-update.patch
x86_64-remove-dead-code-and-other-janitor-work-in-tscc.patch
x86_64-fix-apic-typo.patch
x86_64-convert-to-cleckevents.patch
acpi-remove-the-useless-ifdef-code.patch
x86_64-hpet-restore-vread.patch
x86_64-restore-restore-nohpet-cmdline.patch
x86_64-block-irq-balancing-for-timer.patch
x86_64-prep-idle-loop-for-dynticks.patch
x86_64-enable-high-resolution-timers-and-dynticks.patch
x86_64-dynticks-disable-hpet_id_legsup-hpets.patch
ich-force-hpet-make-generic-time-capable-of-switching-broadcast-timer.patch
ich-force-hpet-restructure-hpet-generic-clock-code.patch
ich-force-hpet-ich7-or-later-quirk-to-force-detect-enable.patch
ich-force-hpet-ich7-or-later-quirk-to-force-detect-enable-fix.patch
ich-force-hpet-late-initialization-of-hpet-after-quirk.patch
ich-force-hpet-ich5-quirk-to-force-detect-enable.patch
ich-force-hpet-ich5-quirk-to-force-detect-enable-fix.patch
ich-force-hpet-ich5-fix-a-bug-with-suspend-resume.patch
ich-force-hpet-add-ich7_0-pciid-to-quirk-list.patch
geode-mfgpt-clock-event-device-support.patch
only-allow-nonlinear-vmas-for-ram-backed-filesystems.patch
add-generic-exit-time-stack-depth-checking-to-config_debug_stack_usage.patch
cpuset-remove-sched-domain-hooks-from-cpusets.patch
introduce-write_trylock_irqsave.patch
use-write_trylock_irqsave-in-ptrace_attach.patch
fix-stop_machine_run-problem-with-naughty-real-time-process.patch
cpu-hotplug-fix-ksoftirqd-termination-on-cpu-hotplug-with-naughty-realtime-process.patch
cpu-hotplug-fix-ksoftirqd-termination-on-cpu-hotplug-with-naughty-realtime-process-fix.patch
pie-randomization.patch
vdso-print-fatal-signals.patch
remove-clockevents_releaserequest_device.patch
add-a-flag-to-indicate-deferrable-timers-in-proc-timer_stats.patch
introduce-o_cloexec-take-2.patch
introduce-o_cloexec-parisc-fix.patch
o_cloexec-for-scm_rights.patch
o_cloexec-for-scm_rights-fix.patch
o_cloexec-for-scm_rights-fix-2.patch
improve-behaviour-of-spurious-irq-detect.patch
improve-behaviour-of-spurious-irq-detect-fix.patch
allow-softlockup-to-be-runtime-disabled.patch
sys_time-speedup.patch
sys_time-speedup-build-fixes.patch
futex-tidy-up-the-code-v2.patch
modules-remove-modlist_lock.patch
lockdep-debugging-give-stacktrace-for-init_error.patch
stacktrace-fix-header-file-for-config_stacktrace.patch
cfs-scheduler.patch
cfs-scheduler-vs-detach-schedh-from-mmh.patch
cfs-scheduler-v14-rc2-mm1.patch
cfs-scheduler-warning-fixes.patch
cfs-scheduler-v15-rc3-mm1.patch
fs-proc-basec-make-a-struct-static.patch
cfs-warning-fixes.patch
schedstats-fix-printk-format.patch
cfs-scheduler-v16.patch
sched-cfs-v2.6.22-git-v18.patch
sched-add-above-background-load-function.patch
mm-implement-swap-prefetching.patch
cfs-v19.patch
cfs-kernel-schedc-make-2-functions-static.patch
fix-raw_spinlock_t-vs-lockdep.patch
lockdep-sanitise-config_prove_locking.patch
lockdep-reduce-the-ifdeffery.patch
lockstat-core-infrastructure.patch
lockstat-core-infrastructure-fix.patch
lockstat-core-infrastructure-fix-fix.patch
lockstat-core-infrastructure-fix-fix-fix.patch
lockstat-human-readability-tweaks.patch
lockstat-hook-into-spinlock_t-rwlock_t-rwsem-and-mutex.patch
lockdep-various-fixes.patch
lockdep-various-fixes-checkpatch.patch
lockdep-fixup-sk_callback_lock-annotation.patch
lockstat-measure-lock-bouncing.patch
lockstat-measure-lock-bouncing-checkpatch.patch
lockstat-better-class-name-representation.patch
detect-atomic-counter-underflows.patch
make-frame_pointer-default=y.patch
mutex-subsystem-synchro-test-module.patch
lockdep-show-held-locks-when-showing-a-stackdump.patch
kmap_atomic-debugging.patch
random-warning-squishes.patch

