On Wed, 2010-12-08 at 12:55 -0500, Rik van Riel wrote: > > > > Right, so another approach might be to simply swap the vruntime between > > curr and p. > > Doesn't that run into the same scale issue you described > above? Not really, but its tricky on SMP because vruntime only has meaning within a cfs_rq. The below is quickly cobbled together from a few patches I had laying about dealing with similar issues. The avg_vruntime() stuff takes the weighted average of the vruntimes on a cfs runqueue, this weighted average corresponds to the 0-lag point, that is the point where an ideal scheduler would have all tasks. Using the 0-lag point you can compute the lag of a task, the lag is a measure of service for a task, negative lag means the task is owed services, positive lag means its got too much service (at least, thats the sign convention used here, I always forget what the standard is). What we do is, when @p the target task, is owed less service than current, we flip lags such that p will become more eligible. The trouble with all this is that computing the weighted average is terribly expensive (it increases cost of all scheduler hot paths). The dyn_vruntime stuff mixed in there is an attempt at computing something similar, although its not used and I haven't tested the quality of the approximation in a while. Anyway, complete untested and such.. --- include/linux/sched.h | 2 + kernel/sched.c | 39 ++++++++++ kernel/sched_debug.c | 31 ++++----- kernel/sched_fair.c | 179 ++++++++++++++++++++++++++++++++++++++--------- kernel/sched_features.h | 8 +-- 5 files changed, 203 insertions(+), 56 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b0fc8ee..538559e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1095,6 +1095,8 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_move_group) (struct task_struct *p, int on_rq); #endif + + void (*yield_to) (struct task_struct *p); }; struct load_weight { diff --git a/kernel/sched.c b/kernel/sched.c index 0debad9..fe0adb0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -313,6 +313,9 @@ struct cfs_rq { struct load_weight load; unsigned long nr_running; + s64 avg_vruntime; + u64 avg_load; + u64 exec_clock; u64 min_vruntime; @@ -5062,6 +5065,42 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +void yield_to(struct task_struct *p) +{ + struct task_struct *curr = current; + struct rq *p_rq, *rq; + unsigned long flags; + int on_rq; + + local_irq_save(flags); + rq = this_rq(); +again: + p_rq = task_rq(p); + double_rq_lock(rq, p_rq); + if (p_rq != task_rq(p)) { + double_rq_unlock(rq, p_rq); + goto again; + } + + update_rq_clock(rq); + update_rq_clock(p_rq); + + on_rq = p->se.on_rq; + if (on_rq) + dequeue_task(p_rq, p, 0); + + ret = 0; + if (p->sched_class == curr->sched_class && curr->sched_class->yield_to) + curr->sched_class->yield_to(p); + + if (on_rq) + enqueue_task(p_rq, p, 0); + + double_rq_unlock(rq, p_rq); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(yield_to); + static inline int should_resched(void) { return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 1dfae3d..b5cc773 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -138,10 +138,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, - spread, rq0_min_vruntime, spread0; + s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread; struct rq *rq = cpu_rq(cpu); - struct sched_entity *last; + struct sched_entity *last, *first; unsigned long flags; SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); @@ -149,28 +148,26 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_lock_irqsave(&rq->lock, flags); - if (cfs_rq->rb_leftmost) - MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; + first = __pick_first_entity(cfs_rq); + if (first) + left_vruntime = first->vruntime; last = __pick_last_entity(cfs_rq); if (last) - max_vruntime = last->vruntime; + right_vruntime = last->vruntime; min_vruntime = cfs_rq->min_vruntime; - rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; raw_spin_unlock_irqrestore(&rq->lock, flags); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", - SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", + SPLIT_NS(left_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", SPLIT_NS(min_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", - SPLIT_NS(max_vruntime)); - spread = max_vruntime - MIN_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", - SPLIT_NS(spread)); - spread0 = min_vruntime - rq0_min_vruntime; - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", - SPLIT_NS(spread0)); SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", + SPLIT_NS(avg_vruntime(cfs_rq))); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", + SPLIT_NS(right_vruntime)); + spread = right_vruntime - left_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c886717..8689bcd 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -346,25 +346,90 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) return se->vruntime - cfs_rq->min_vruntime; } -static void update_min_vruntime(struct cfs_rq *cfs_rq) +/* + * Compute virtual time from the per-task service numbers: + * + * Fair schedulers conserve lag: \Sum lag_i = 0 + * + * lag_i = S_i - s_i = w_i * (V - v_i) + * + * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0 + * + * From which we solve V: + * + * \Sum v_i * w_i + * V = -------------- + * \Sum w_i + * + * However, since v_i is u64, and the multiplcation could easily overflow + * transform it into a relative form that uses smaller quantities: + * + * Substitute: v_i == (v_i - v) + v + * + * \Sum ((v_i - v) + v) * w_i \Sum (v_i - v) * w_i + * V = -------------------------- = -------------------- + v + * \Sum w_i \Sum w_i + * + * min_vruntime = v + * avg_vruntime = \Sum (v_i - v) * w_i + * cfs_rq->load = \Sum w_i + * + * Since min_vruntime is a monotonic increasing variable that closely tracks + * the per-task service, these deltas: (v_i - v), will be in the order of the + * maximal (virtual) lag induced in the system due to quantisation. + */ +static void +avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 vruntime = cfs_rq->min_vruntime; + s64 key = entity_key(cfs_rq, se); + cfs_rq->avg_vruntime += key * se->load.weight; + cfs_rq->avg_load += se->load.weight; +} - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; +static void +avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 key = entity_key(cfs_rq, se); + cfs_rq->avg_vruntime -= key * se->load.weight; + cfs_rq->avg_load -= se->load.weight; +} + +static inline +void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) +{ + /* + * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load + */ + cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; +} - if (cfs_rq->rb_leftmost) { - struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, - struct sched_entity, - run_node); +static u64 avg_vruntime(struct cfs_rq *cfs_rq) +{ + struct sched_entity *se = cfs_rq->curr; + s64 lag = cfs_rq->avg_vruntime; + long load = cfs_rq->avg_load; - if (!cfs_rq->curr) - vruntime = se->vruntime; - else - vruntime = min_vruntime(vruntime, se->vruntime); + if (se) { + lag += entity_key(cfs_rq, se) * se->load.weight; + load += se->load.weight; } - cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); + if (load) + lag = div_s64(lag, load); + + return cfs_rq->min_vruntime + lag; +} + +static void __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +{ + /* + * open coded max_vruntime() to allow updating avg_vruntime + */ + s64 delta = (s64)(vruntime - cfs_rq->min_vruntime); + if (delta > 0) { + avg_vruntime_update(cfs_rq, delta); + cfs_rq->min_vruntime = vruntime; + } } /* @@ -378,6 +443,8 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) s64 key = entity_key(cfs_rq, se); int leftmost = 1; + avg_vruntime_add(cfs_rq, se); + /* * Find the right place in the rbtree: */ @@ -417,9 +484,10 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) } rb_erase(&se->run_node, &cfs_rq->tasks_timeline); + avg_vruntime_sub(cfs_rq, se); } -static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; @@ -542,6 +610,33 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta); +static void update_min_vruntime(struct cfs_rq *cfs_rq, unsigned long delta_exec) +{ + struct sched_entity *left = __pick_first_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + u64 vruntime; + + if (left && curr) + vruntime = min_vruntime(left->vruntime, curr->vruntime); + else if (left) + vruntime = left->vruntime; + else if (curr) + vruntime = curr->vruntime; + else + return; + + if (sched_feat(DYN_MIN_VRUNTIME)) { + u64 new_vruntime = cfs_rq->min_vruntime; + if (delta_exec) { + new_vruntime += calc_delta_mine(delta_exec, + NICE_0_LOAD, &cfs_rq->load); + } + vruntime = max_vruntime(new_vruntime, vruntime); + } + + __update_min_vruntime(cfs_rq, vruntime); +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -560,7 +655,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; - update_min_vruntime(cfs_rq); + update_min_vruntime(cfs_rq, delta_exec); #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED cfs_rq->load_unacc_exec_time += delta_exec; @@ -895,16 +990,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { - u64 vruntime = cfs_rq->min_vruntime; - - /* - * The 'current' period is already promised to the current tasks, - * however the extra weight of the new task will slow them down a - * little, place the new task so that it fits in the slot that - * stays open at the end. - */ - if (initial && sched_feat(START_DEBIT)) - vruntime += sched_vslice(cfs_rq, se); + u64 vruntime = avg_vruntime(cfs_rq); /* sleeps up to a single latency don't count. */ if (!initial) { @@ -934,7 +1020,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * through callig update_curr(). */ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) - se->vruntime += cfs_rq->min_vruntime; + se->vruntime += avg_vruntime(cfs_rq); /* * Update run-time statistics of the 'current'. @@ -1003,7 +1089,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) se->on_rq = 0; update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); - update_min_vruntime(cfs_rq); + update_min_vruntime(cfs_rq, 0); update_cfs_shares(cfs_rq, 0); /* @@ -1012,7 +1098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * movement in our normalized position. */ if (!(flags & DEQUEUE_SLEEP)) - se->vruntime -= cfs_rq->min_vruntime; + se->vruntime -= avg_vruntime(cfs_rq); } /* @@ -1090,7 +1176,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { - struct sched_entity *se = __pick_next_entity(cfs_rq); + struct sched_entity *se = __pick_first_entity(cfs_rq); struct sched_entity *left = se; if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) @@ -1326,7 +1412,7 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); - se->vruntime -= cfs_rq->min_vruntime; + se->vruntime -= avg_vruntime(cfs_rq); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -4025,7 +4111,7 @@ static void task_fork_fair(struct task_struct *p) resched_task(rq->curr); } - se->vruntime -= cfs_rq->min_vruntime; + se->vruntime -= avg_vruntime(cfs_rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -4096,10 +4182,10 @@ static void task_move_group_fair(struct task_struct *p, int on_rq) * fair sleeper stuff for the first placement, but who cares. */ if (!on_rq) - p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + p->se.vruntime -= avg_vruntime(cfs_rq_of(&p->se)); set_task_rq(p, task_cpu(p)); if (!on_rq) - p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; + p->se.vruntime += avg_vruntime(cfs_rq_of(&p->se)); } #endif @@ -4118,6 +4204,31 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task return rr_interval; } +static void yield_to_fair(struct task_stuct *p) +{ + struct sched_entity *se = ¤t->se; + struct sched_entity *p_se = &p->se; + u64 lag0, p_lag0; + s64 lag, p_lag; + + lag0 = avg_vruntime(cfs_rq_of(se)); + p_lag0 = avg_vruntime(cfs_rq_of(p_se)); + + lag = se->vruntime - avg_vruntime(cfs_rq); + p_lag = p_se->vruntime - avg_vruntime(p_cfs_rq); + + if (p_lag > lag) { /* if P is owed less service */ + se->vruntime = lag0 + p_lag; + p_se->vruntime = p_lag + lag; + } + + /* + * XXX try something smarter here + */ + resched_task(p); + resched_task(current); +} + /* * All the scheduling class methods: */ @@ -4150,6 +4261,8 @@ static const struct sched_class fair_sched_class = { .get_rr_interval = get_rr_interval_fair, + .yield_to = yield_to_fair, + #ifdef CONFIG_FAIR_GROUP_SCHED .task_move_group = task_move_group_fair, #endif diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 68e69ac..feda9b8 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -6,12 +6,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) /* - * Place new tasks ahead so that they do not starve already running - * tasks - */ -SCHED_FEAT(START_DEBIT, 1) - -/* * Should wakeups try to preempt running tasks. */ SCHED_FEAT(WAKEUP_PREEMPT, 1) @@ -53,6 +47,8 @@ SCHED_FEAT(HRTICK, 0) SCHED_FEAT(DOUBLE_TICK, 0) SCHED_FEAT(LB_BIAS, 1) +SCHED_FEAT(DYN_MIN_VRUNTIME, 0) + /* * Spin-wait on mutex acquisition when the mutex owner is running on * another cpu -- assumes that when the owner is running, it will soon -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html