On Thu, 25 Oct 2007, Gregory Haskins wrote:

> Some RT tasks (particularly kthreads) are bound to one specific CPU.
> It is fairly common for one or more bound tasks to get queued up at the
> same time. Consider, for instance, softirq_timer and softirq_sched. A
> timer goes off in an ISR which schedules softirq_timer to run at RT50.
> Then during the handling of the timer, the system determines that it's
> time to smp-rebalance the system so it schedules softirq_sched to run
> from within the softirq_timer kthread context. So we are in a situation
> where we have two RT50 tasks queued, and the system will go into
> rt-overload condition to request other CPUs for help.
>
> The problem is that these tasks cannot ever be pulled away since they
> are already running on their one and only valid RQ. However, the other
> CPUs cannot determine that the tasks are unpullable without going
> through expensive checks/locking. Therefore the helping CPUs
> experience unnecessary overhead/latencies regardless as they
> ineffectively try to process the overload condition.
>
> This patch tries to optimize the situation by utilizing the hamming
> weight of the task->cpus_allowed mask. A weight of 1 indicates that
> the task cannot be migrated, which may be utilized by the overload
> handling code to eliminate unnecessary rebalance attempts. We also
> introduce a per-rq variable to count the number of migratable tasks
> that are currently running. We only go into overload if we have more
> than one rt task, AND at least one of them is migratable.
>
> Calculating the weight is probably relatively expensive, so it is only
> done when the cpus_allowed mask is updated (which should be relatively
> infrequent, especially compared to scheduling frequency) and cached in
> the task_struct.
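
For illustration only, the overload rule described above boils down to
something like the sketch below. The helper name is made up here; the
field names follow the patch.

/*
 * Illustrative sketch (not part of the patch): go into RT overload
 * only if more than one RT task is queued on this runqueue AND at
 * least one of them can actually be migrated somewhere else.
 */
static inline int rt_rq_is_overloaded(struct rt_rq *rt_rq)
{
        if (rt_rq->rt_nr_running < 2)
                return 0;

        return rt_rq->rt_nr_migratory > 0;
}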
>
> Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
> ---
>
>  include/linux/sched.h |    2 +
>  kernel/fork.c         |    1
>  kernel/sched.c        |    9 +++-
>  kernel/sched_rt.c     |  116 +++++++++++++++++++++++++++++++++++++++++--------
>  4 files changed, 107 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 7a3829f..829de6f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1048,6 +1048,7 @@ struct sched_class {
>          void (*set_curr_task) (struct rq *rq);
>          void (*task_tick) (struct rq *rq, struct task_struct *p);
>          void (*task_new) (struct rq *rq, struct task_struct *p);
> +        void (*set_cpus_allowed)(struct task_struct *p, cpumask_t newmask);
>  };
>
>  struct load_weight {
> @@ -1144,6 +1145,7 @@ struct task_struct {
>
>          unsigned int policy;
>          cpumask_t cpus_allowed;
> +        int nr_cpus_allowed;
>          unsigned int time_slice;
>
>  #ifdef CONFIG_PREEMPT_RCU
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 5f11f23..f808e18 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1257,6 +1257,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>           */
>          preempt_disable();
>          p->cpus_allowed = current->cpus_allowed;
> +        p->nr_cpus_allowed = current->nr_cpus_allowed;
>          if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
>                          !cpu_online(task_cpu(p))))
>                  set_task_cpu(p, smp_processor_id());
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 30fa531..6c90093 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -262,6 +262,7 @@ struct rt_rq {
>          int rt_load_balance_idx;
>          struct list_head *rt_load_balance_head, *rt_load_balance_curr;
>          unsigned long rt_nr_running;
> +        unsigned long rt_nr_migratory;
>          unsigned long rt_nr_uninterruptible;
>          /* highest queued rt task prio */
>          int highest_prio;
> @@ -5371,7 +5372,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
>                  goto out;
>          }
>
> -        p->cpus_allowed = new_mask;
> +        if (p->sched_class->set_cpus_allowed)

Not sure we need this optimization (not doing the nr_cpus_allowed
calculation in this path). Since, due to priority boosting, we will
need to calculate it then anyway, calculating it here is better.

> +                p->sched_class->set_cpus_allowed(p, new_mask);
> +        else {
> +                p->cpus_allowed = new_mask;
> +                p->nr_cpus_allowed = cpus_weight(new_mask);
> +        }
> +
>          /* Can the task run on the task's current CPU? If so, we're done */
>          if (cpu_isset(task_cpu(p), new_mask))
>                  goto out;
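
A minimal sketch of the unconditional calculation suggested in the
comment above; this is not the patch, just the shape of the idea, and
the helper name is invented. (The RT class hook may then redo the same
assignments, which is harmless here.)

/*
 * Sketch only: always cache the cpumask weight in the core
 * set_cpus_allowed() path, for every task and every class, and let
 * the scheduling class hook do any class-specific bookkeeping.
 */
static void set_cpus_allowed_cache_weight(struct task_struct *p, cpumask_t new_mask)
{
        p->cpus_allowed = new_mask;
        p->nr_cpus_allowed = cpus_weight(new_mask);

        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
}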
> diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
> index b59dc20..ad35c89 100644
> --- a/kernel/sched_rt.c
> +++ b/kernel/sched_rt.c
> @@ -50,6 +50,24 @@ static inline void update_curr_rt(struct rq *rq)
>          curr->se.sum_exec_runtime += delta_exec;
>          curr->se.exec_start = rq->clock;
>  }
> +#ifdef CONFIG_SMP
> +static void inc_rt_migration(struct task_struct *p, struct rq *rq)
> +{
> +        rq->rt.rt_nr_migratory++;
> +
> +        if (rq->rt.rt_nr_running > 1)
> +                rt_set_overload(p, rq->cpu);
> +}
> +
> +static void dec_rt_migration(struct task_struct *p, struct rq *rq)
> +{
> +        WARN_ON(!rq->rt.rt_nr_migratory);
> +        rq->rt.rt_nr_migratory--;
> +
> +        if ((rq->rt.rt_nr_running < 2) || !rq->rt.rt_nr_migratory)
> +                rt_clear_overload(p, rq->cpu);
> +}
> +#endif
>
>  static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
>  {
> @@ -58,8 +76,8 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
>  #ifdef CONFIG_SMP
>          if (p->prio < rq->rt.highest_prio)
>                  rq->rt.highest_prio = p->prio;
> -        if (rq->rt.rt_nr_running > 1)
> -                rt_set_overload(p, rq->cpu);
> +        if (p->nr_cpus_allowed > 1)
> +                inc_rt_migration(p, rq);
>  #endif /* CONFIG_SMP */
>  }
>
> @@ -81,8 +99,8 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
>                  } /* otherwise leave rq->highest prio alone */
>          } else
>                  rq->rt.highest_prio = MAX_RT_PRIO;
> -        if (rq->rt.rt_nr_running < 2)
> -                rt_clear_overload(p, rq->cpu);
> +        if (p->nr_cpus_allowed > 1)
> +                dec_rt_migration(p, rq);
>  #endif /* CONFIG_SMP */
>  }
>
> @@ -286,6 +304,27 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
>          return next;
>  }
>
> +static int lock_migration_target(struct task_struct *p, struct rq *target_rq)
> +{
> +        struct rq *this_rq = task_rq(p);
> +
> +        if (double_lock_balance(this_rq, target_rq)) {
> +                /*
> +                 * We had to unlock the run queue. In
> +                 * the mean time, task could have
> +                 * migrated already or had its affinity changed.
> +                 */
> +                if (unlikely(task_running(this_rq, p) ||
> +                             task_rq(p) != this_rq ||
> +                             !cpu_isset(target_rq->cpu, p->cpus_allowed))) {
> +                        spin_unlock(&target_rq->lock);
> +                        return 0;
> +                }
> +        }
> +
> +        return 1;
> +}
> +
>  /* Will lock the rq it finds */
>  static struct rq *find_lock_lowest_rq(struct task_struct *task,
>                                        struct rq *this_rq)
> @@ -295,6 +334,28 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
>          int cpu;
>          int tries;
>
> +        /*
> +         * We can optimize if the hamming weight of the cpus_allowed mask
> +         * is 1 because the task has nowhere to go but one CPU. So don't
> +         * waste the time trying to find the lowest RQ in this case.
> +         */

This code should be in pick_next_highest_task_rt() and not here.

> +        if (task->nr_cpus_allowed == 1) {
> +                /* If the task is already on the RQ, we are done */
> +                if (cpu_isset(this_rq->cpu, task->cpus_allowed))
> +                        return NULL;
> +
> +                cpu = first_cpu(task->cpus_allowed);
> +
> +                lowest_rq = cpu_rq(cpu);
> +                BUG_ON(this_rq == lowest_rq);
> +
> +                /* Otherwise, we can simply grab the new RQ */
> +                if (lock_migration_target(task, lowest_rq))
> +                        return lowest_rq;
> +                else
> +                        return NULL;
> +        }
> +
>          cpus_and(cpu_mask, cpu_online_map, task->cpus_allowed);
>
>          for (tries = 0; tries < RT_MAX_TRIES; tries++) {
> @@ -324,22 +385,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
>                          break;
>
>                  /* if the prio of this runqueue changed, try again */
> -                if (double_lock_balance(this_rq, lowest_rq)) {
> -                        /*
> -                         * We had to unlock the run queue. In
> -                         * the mean time, task could have
> -                         * migrated already or had its affinity changed.
> -                         * Also make sure that it wasn't scheduled on its rq.
> -                         */
> -                        if (unlikely(task_rq(task) != this_rq ||
> -                                     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
> -                                     task_running(this_rq, task) ||
> -                                     !task->se.on_rq)) {
> -                                spin_unlock(&lowest_rq->lock);
> -                                lowest_rq = NULL;
> -                                break;
> -                        }
> -                }
> +                if (!lock_migration_target(task, lowest_rq))
> +                        return NULL;

I don't like this encapsulating of the double_lock_balance(). There's a
reason I kept it out like this. Mainly because this is the source of
most (if not all) of the race conditions I fought. So this is a very
sensitive area to touch.

In fact, I see a few races already introduced by this patch. One is
that you took out the "!task->se.on_rq" test. Which means that a task
could have run and deactivated itself (TASK_UNINTERRUPTIBLE) and you
just added it to a run queue. (I hit that race ;-)

So keep that code out in the open where it belongs. It's very
sensitive.
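
For reference, if the helper were kept at all, the recheck after
double_lock_balance() would at least need that test back, roughly as in
the sketch below (this only mirrors the removed lines, it adds nothing
new):

static int lock_migration_target(struct task_struct *p, struct rq *target_rq)
{
        struct rq *this_rq = task_rq(p);

        if (double_lock_balance(this_rq, target_rq)) {
                /*
                 * The lock was dropped: the task may have migrated,
                 * changed affinity, started running, or run and
                 * deactivated itself (dropped off the runqueue).
                 */
                if (unlikely(task_running(this_rq, p) ||
                             task_rq(p) != this_rq ||
                             !cpu_isset(target_rq->cpu, p->cpus_allowed) ||
                             !p->se.on_rq)) {
                        spin_unlock(&target_rq->lock);
                        return 0;
                }
        }

        return 1;
}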
>
>                  /* If this rq is still suitable use it. */
>                  if (lowest_rq->rt.highest_prio > task->prio)
> @@ -658,6 +705,31 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
>          }
>  }
>
> +#ifdef CONFIG_SMP
> +static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t new_mask)
> +{
> +        int weight = cpus_weight(new_mask);
> +
> +        BUG_ON(!rt_task(p));
> +
> +        /*
> +         * Update the migration status of the RQ if we have an RT task
> +         * which is running AND changing its weight value.
> +         */
> +        if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
> +                struct rq *rq = task_rq(p);
> +
> +                if ((p->nr_cpus_allowed <= 1) && (weight > 1))
> +                        inc_rt_migration(p, rq);
> +                else if ((p->nr_cpus_allowed > 1) && (weight <= 1))
> +                        dec_rt_migration(p, rq);
> +        }
> +
> +        p->cpus_allowed = new_mask;
> +        p->nr_cpus_allowed = weight;
> +}
> +#endif
> +
>  static struct sched_class rt_sched_class __read_mostly = {
>          .enqueue_task           = enqueue_task_rt,
>          .dequeue_task           = dequeue_task_rt,
> @@ -671,4 +743,8 @@ static struct sched_class rt_sched_class __read_mostly = {
>          .load_balance           = load_balance_rt,
>
>          .task_tick              = task_tick_rt,
> +
> +#ifdef CONFIG_SMP
> +        .set_cpus_allowed       = set_cpus_allowed_rt,
> +#endif

Hmm, I must have missed where you update the mask at time of boosting.
Anyway, we shouldn't. The set_cpus_allowed should be done for all
tasks. Now you can have a handler to call for when a task changes
class and changes weight.

-- Steve
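
To illustrate the last point above, such a class-change handler could
look roughly like the sketch below. The hook name and its wiring are
hypothetical, not something in the posted patch: the idea is only that
the core set_cpus_allowed() caches the weight for every task, and the
RT class fixes up its migration accounting when a task enters the
class (e.g. via priority boosting).

/*
 * Hypothetical sketch: invoked when a task switches into the RT class.
 * Uses the weight already cached by set_cpus_allowed() to update the
 * runqueue's migration accounting.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
        if (p->se.on_rq && p->nr_cpus_allowed > 1)
                inc_rt_migration(p, rq);
}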