On Mon, 2010-12-13 at 22:46 -0500, Rik van Riel wrote:

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 2c79e92..408326f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1086,6 +1086,8 @@ struct sched_class {
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>  	void (*task_move_group) (struct task_struct *p, int on_rq);
>  #endif
> +
> +	void (*yield_to) (struct rq *rq, struct task_struct *p);
>  };
>  
>  struct load_weight {

> @@ -1947,6 +1949,7 @@ extern void set_user_nice(struct task_struct *p, long nice);
>  extern int task_prio(const struct task_struct *p);
>  extern int task_nice(const struct task_struct *p);
>  extern int can_nice(const struct task_struct *p, const int nice);
> +extern void requeue_task(struct rq *rq, struct task_struct *p);

That definitely doesn't want to be a globally visible symbol.

>  extern int task_curr(const struct task_struct *p);
>  extern int idle_cpu(int cpu);
>  extern int sched_setscheduler(struct task_struct *, int, struct sched_param *);

> @@ -2020,6 +2023,10 @@ extern int wake_up_state(struct task_struct *tsk, unsigned int state);
>  extern int wake_up_process(struct task_struct *tsk);
>  extern void wake_up_new_task(struct task_struct *tsk,
>  				unsigned long clone_flags);
> +
> +extern u64 slice_remain(struct task_struct *);

idem.

> +void yield_to(struct task_struct *p)
> +{
> +	unsigned long flags;
> +	struct rq *rq, *p_rq;
> +
> +	local_irq_save(flags);
> +	rq = this_rq();
> +again:
> +	p_rq = task_rq(p);
> +	double_rq_lock(rq, p_rq);
> +	if (p_rq != task_rq(p)) {
> +		double_rq_unlock(rq, p_rq);
> +		goto again;
> +	}
> +
> +	/* We can't yield to a process that doesn't want to run. */
> +	if (!p->se.on_rq)
> +		goto out;
> +
> +	/*
> +	 * We can only yield to a runnable task, in the same schedule class
> +	 * as the current task, if the schedule class implements yield_to_task.
> +	 */
> +	if (!task_running(rq, p) && current->sched_class == p->sched_class &&
> +	    current->sched_class->yield_to)
> +		current->sched_class->yield_to(rq, p);

rq and p don't match, see below.

> +
> +out:
> +	double_rq_unlock(rq, p_rq);
> +	local_irq_restore(flags);
> +	yield();

That wants to be plain: schedule(), possibly conditional on having called
sched_class::yield_to.

> +}
> +EXPORT_SYMBOL_GPL(yield_to);

> +u64 slice_remain(struct task_struct *p)
> +{
> +	unsigned long flags;
> +	struct sched_entity *se = &p->se;
> +	struct cfs_rq *cfs_rq;
> +	struct rq *rq;
> +	u64 slice, ran;
> +	s64 delta;
> +
> +	rq = task_rq_lock(p, &flags);

You're calling this from
yield_to()->sched_class::yield_to()->yield_to_fair()->slice_remain();
yield_to() already holds p's rq lock.

> +	cfs_rq = cfs_rq_of(se);
> +	slice = sched_slice(cfs_rq, se);
> +	ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
> +	delta = slice - ran;
> +	task_rq_unlock(rq, &flags);
> +
> +	return max(delta, 0LL);
> +}

Like Mike said, the returned figure doesn't really mean anything, it's
definitely not the remaining time of a slice. It might qualify for a weak
random number generator though.. :-)

> +static void yield_to_fair(struct rq *rq, struct task_struct *p)
> +{
> +	struct sched_entity *se = &p->se;
> +	struct cfs_rq *cfs_rq = cfs_rq_of(se);
> +	u64 remain = slice_remain(current);
> +
> +	dequeue_task(rq, p, 0);

Here you assume @p lives on @rq, but you passed:

  +	current->sched_class->yield_to(rq, p);

and rq = this_rq(), so this will go splat.

> +	se->vruntime -= remain;

You cannot simply subtract wall-time from virtual time, see the usage of
calc_delta_fair() in the proposal below.

> +	if (se->vruntime < cfs_rq->min_vruntime)
> +		se->vruntime = cfs_rq->min_vruntime;

Then clipping it to min_vruntime doesn't make any sense at all.

> +	enqueue_task(rq, p, 0);
> +	check_preempt_curr(rq, p, 0);
> +}

Also, modifying the vruntime of one task without also modifying the vruntime
of the other task breaks stuff. You're injecting time into p without taking
time out of current.

Maybe something like:

static void yield_to_fair(struct rq *p_rq, struct task_struct *p)
{
	struct rq *rq = this_rq();
	struct sched_entity *se = &current->se;
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct sched_entity *pse = &p->se;
	struct cfs_rq *p_cfs_rq = cfs_rq_of(pse);
	/*
	 * Transfer wakeup_gran worth of time from current to @p,
	 * this should ensure current is no longer eligible to run.
	 */
	unsigned long wakeup_gran = ACCESS_ONCE(sysctl_sched_wakeup_granularity);

	update_rq_clock(rq);
	update_curr(cfs_rq);

	if (pse != p_cfs_rq->curr) {
		__dequeue_entity(p_cfs_rq, pse);
	} else {
		update_rq_clock(p_rq);
		update_curr(p_cfs_rq);
	}

	se->vruntime += calc_delta_fair(wakeup_gran, se);
	pse->vruntime -= calc_delta_fair(wakeup_gran, pse);

	clear_buddies(cfs_rq, se);

	if (pse != p_cfs_rq->curr) {
		__enqueue_entity(p_cfs_rq, pse);
		check_preempt_curr(p_rq, p, 0);
	}
}

This isn't strictly correct for the group scheduling case though; that wants
a for_each_sched_entity() loop for both se and pse, but I'd have to actually
think about that ;-)

A quick hack might be simply disallowing yield_to between different groups,
by adding something like the below to the above function:

#ifdef CONFIG_FAIR_GROUP_SCHED
	if (cfs_rq->tg != p_cfs_rq->tg)
		return;
#endif
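
To make the wall-time vs. virtual-time point concrete, here is a quick
userspace model (not kernel code; the function name is made up, and the
weights are the nice -5, 0 and +5 entries from prio_to_weight[]) of the
scaling calc_delta_fair() applies:

#include <stdio.h>

#define NICE_0_LOAD	1024ULL

/* Model of calc_delta_fair(): vruntime delta = wall delta * NICE_0_LOAD / weight. */
static unsigned long long model_calc_delta_fair(unsigned long long delta_ns,
						unsigned long weight)
{
	return delta_ns * NICE_0_LOAD / weight;
}

int main(void)
{
	/* 1ms of wall-clock execution time, in nanoseconds. */
	unsigned long long delta_ns = 1000000ULL;
	/* prio_to_weight[] entries for nice -5, 0 and +5. */
	unsigned long weights[] = { 3121, 1024, 335 };
	unsigned int i;

	for (i = 0; i < sizeof(weights) / sizeof(weights[0]); i++)
		printf("weight %4lu: 1ms of wall time advances vruntime by %llu ns\n",
		       weights[i], model_calc_delta_fair(delta_ns, weights[i]));

	return 0;
}

Only a nice 0 task's vruntime moves at wall-clock rate; a boosted task's
moves more slowly and a deprioritised task's faster, which is why subtracting
the raw slice_remain() figure from se->vruntime credits the wrong amount for
anything that isn't nice 0.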
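
And putting the two comments on yield_to() itself together (hand the class
hook @p's runqueue so it matches the proposed yield_to_fair() signature, and
only reschedule when the hook actually ran), the function could end up
looking something like the untested sketch below; the yielded flag is new,
not from the posted patch:

void yield_to(struct task_struct *p)
{
	bool yielded = false;
	unsigned long flags;
	struct rq *rq, *p_rq;

	local_irq_save(flags);
	rq = this_rq();
again:
	p_rq = task_rq(p);
	double_rq_lock(rq, p_rq);
	if (p_rq != task_rq(p)) {
		double_rq_unlock(rq, p_rq);
		goto again;
	}

	/* We can't yield to a process that doesn't want to run. */
	if (!p->se.on_rq)
		goto out;

	if (!task_running(rq, p) && current->sched_class == p->sched_class &&
	    current->sched_class->yield_to) {
		/* Pass @p's rq, which is what the fair-class code wants to poke at. */
		current->sched_class->yield_to(p_rq, p);
		yielded = true;
	}

out:
	double_rq_unlock(rq, p_rq);
	local_irq_restore(flags);

	/* Reschedule only if we actually gave our slice away. */
	if (yielded)
		schedule();
}
EXPORT_SYMBOL_GPL(yield_to);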