On Thu, 25 Oct 2007, Gregory Haskins wrote:

> Some RT tasks (particularly kthreads) are bound to one specific CPU.
> It is fairly common for one or more bound tasks to get queued up at the
> same time. Consider, for instance, softirq_timer and softirq_sched. A
> timer goes off in an ISR which schedules softirq_timer to run at RT50.
> Then during the handling of the timer, the system determines that it's
> time to smp-rebalance the system so it schedules softirq_sched to run
> from within the softirq_timer kthread context. So we are in a situation
> where we have two RT50 tasks queued, and the system will go into
> rt-overload condition to request other CPUs for help.
>
> The problem is that these tasks cannot ever be pulled away since they
> are already running on their one and only valid RQ. However, the other
> CPUs cannot determine that the tasks are unpullable without going
> through expensive checks/locking. Therefore the helping CPUs
> experience unnecessary overhead/latencies regardless as they
> ineffectively try to process the overload condition.
>
> This patch tries to optimize the situation by utilizing the hamming
> weight of the task->cpus_allowed mask. A weight of 1 indicates that
> the task cannot be migrated, which may be utilized by the overload
> handling code to eliminate unnecessary rebalance attempts. We also
> introduce a per-rq variable to count the number of migratable tasks
> that are currently running. We only go into overload if we have more
> than one rt task, AND at least one of them is migratable.
>
> Calculating the weight is probably relatively expensive, so it is only
> done when the cpus_allowed mask is updated (which should be relatively
> infrequent, especially compared to scheduling frequency) and cached in
> the task_struct.
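
For illustration only, the overload rule described above boils down to
something like the sketch below. The helper name is made up here; the
field names follow the patch.

/*
 * Illustrative sketch (not part of the patch): go into RT overload
 * only if more than one RT task is queued on this runqueue AND at
 * least one of them can actually be migrated somewhere else.
 */
static inline int rt_rq_is_overloaded(struct rt_rq *rt_rq)
{
        if (rt_rq->rt_nr_running < 2)
                return 0;

        return rt_rq->rt_nr_migratory > 0;
}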
>
> Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
> ---
>
>  include/linux/sched.h |    2 +
>  kernel/fork.c         |    1
>  kernel/sched.c        |    9 +++-
>  kernel/sched_rt.c     |  116 +++++++++++++++++++++++++++++++++++++++++--------
>  4 files changed, 107 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 7a3829f..829de6f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1048,6 +1048,7 @@ struct sched_class {
>          void (*set_curr_task) (struct rq *rq);
>          void (*task_tick) (struct rq *rq, struct task_struct *p);
>          void (*task_new) (struct rq *rq, struct task_struct *p);
> +        void (*set_cpus_allowed)(struct task_struct *p, cpumask_t newmask);
>  };
>
>  struct load_weight {
> @@ -1144,6 +1145,7 @@ struct task_struct {
>
>          unsigned int policy;
>          cpumask_t cpus_allowed;
> +        int nr_cpus_allowed;
>          unsigned int time_slice;
>
>  #ifdef CONFIG_PREEMPT_RCU
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 5f11f23..f808e18 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1257,6 +1257,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>           */
>          preempt_disable();
>          p->cpus_allowed = current->cpus_allowed;
> +        p->nr_cpus_allowed = current->nr_cpus_allowed;
>          if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
>                          !cpu_online(task_cpu(p))))
>                  set_task_cpu(p, smp_processor_id());
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 30fa531..6c90093 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -262,6 +262,7 @@ struct rt_rq {
>          int rt_load_balance_idx;
>          struct list_head *rt_load_balance_head, *rt_load_balance_curr;
>          unsigned long rt_nr_running;
> +        unsigned long rt_nr_migratory;
>          unsigned long rt_nr_uninterruptible;
>          /* highest queued rt task prio */
>          int highest_prio;
> @@ -5371,7 +5372,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
>                  goto out;
>          }
>
> -        p->cpus_allowed = new_mask;
> +        if (p->sched_class->set_cpus_allowed)

Not sure we need this optimization (not doing the nr_cpus_allowed
calculation in this path). Since, due to priority boosting, we will
need to calculate it then anyway, calculating it here is better.

> +                p->sched_class->set_cpus_allowed(p, new_mask);
> +        else {
> +                p->cpus_allowed = new_mask;
> +                p->nr_cpus_allowed = cpus_weight(new_mask);
> +        }
> +
>          /* Can the task run on the task's current CPU? If so, we're done */
>          if (cpu_isset(task_cpu(p), new_mask))
>                  goto out;
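
A minimal sketch of the unconditional calculation suggested in the
comment above; this is not the patch, just the shape of the idea, and
the helper name is invented. (The RT class hook may then redo the same
assignments, which is harmless here.)

/*
 * Sketch only: always cache the cpumask weight in the core
 * set_cpus_allowed() path, for every task and every class, and let
 * the scheduling class hook do any class-specific bookkeeping.
 */
static void set_cpus_allowed_cache_weight(struct task_struct *p, cpumask_t new_mask)
{
        p->cpus_allowed = new_mask;
        p->nr_cpus_allowed = cpus_weight(new_mask);

        if (p->sched_class->set_cpus_allowed)
                p->sched_class->set_cpus_allowed(p, new_mask);
}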
> diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
> index b59dc20..ad35c89 100644
> --- a/kernel/sched_rt.c
> +++ b/kernel/sched_rt.c
> @@ -50,6 +50,24 @@ static inline void update_curr_rt(struct rq *rq)
>          curr->se.sum_exec_runtime += delta_exec;
>          curr->se.exec_start = rq->clock;
>  }
> +#ifdef CONFIG_SMP
> +static void inc_rt_migration(struct task_struct *p, struct rq *rq)
> +{
> +        rq->rt.rt_nr_migratory++;
> +
> +        if (rq->rt.rt_nr_running > 1)
> +                rt_set_overload(p, rq->cpu);
> +}
> +
> +static void dec_rt_migration(struct task_struct *p, struct rq *rq)
> +{
> +        WARN_ON(!rq->rt.rt_nr_migratory);
> +        rq->rt.rt_nr_migratory--;
> +
> +        if ((rq->rt.rt_nr_running < 2) || !rq->rt.rt_nr_migratory)
> +                rt_clear_overload(p, rq->cpu);
> +}
> +#endif
>
>  static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
>  {
> @@ -58,8 +76,8 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
>  #ifdef CONFIG_SMP
>          if (p->prio < rq->rt.highest_prio)
>                  rq->rt.highest_prio = p->prio;
> -        if (rq->rt.rt_nr_running > 1)
> -                rt_set_overload(p, rq->cpu);
> +        if (p->nr_cpus_allowed > 1)
> +                inc_rt_migration(p, rq);
>  #endif /* CONFIG_SMP */
>  }
>
> @@ -81,8 +99,8 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
>                  } /* otherwise leave rq->highest prio alone */
>          } else
>                  rq->rt.highest_prio = MAX_RT_PRIO;
> -        if (rq->rt.rt_nr_running < 2)
> -                rt_clear_overload(p, rq->cpu);
> +        if (p->nr_cpus_allowed > 1)
> +                dec_rt_migration(p, rq);
>  #endif /* CONFIG_SMP */
>  }
>
> @@ -286,6 +304,27 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
>          return next;
>  }
>
> +static int lock_migration_target(struct task_struct *p, struct rq *target_rq)
> +{
> +        struct rq *this_rq = task_rq(p);
> +
> +        if (double_lock_balance(this_rq, target_rq)) {
> +                /*
> +                 * We had to unlock the run queue. In
> +                 * the mean time, task could have
> +                 * migrated already or had its affinity changed.
> +                 */
> +                if (unlikely(task_running(this_rq, p) ||
> +                             task_rq(p) != this_rq ||
> +                             !cpu_isset(target_rq->cpu, p->cpus_allowed))) {
> +                        spin_unlock(&target_rq->lock);
> +                        return 0;
> +                }
> +        }
> +
> +        return 1;
> +}
> +
>  /* Will lock the rq it finds */
>  static struct rq *find_lock_lowest_rq(struct task_struct *task,
>                                        struct rq *this_rq)
> @@ -295,6 +334,28 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
>          int cpu;
>          int tries;
>
> +        /*
> +         * We can optimize if the hamming weight of the cpus_allowed mask
> +         * is 1 because the task has nowhere to go but one CPU. So don't
> +         * waste the time trying to find the lowest RQ in this case.
> +         */

This code should be in pick_next_highest_task_rt() and not here.

> +        if (task->nr_cpus_allowed == 1) {
> +                /* If the task is already on the RQ, we are done */
> +                if (cpu_isset(this_rq->cpu, task->cpus_allowed))
> +                        return NULL;
> +
> +                cpu = first_cpu(task->cpus_allowed);
> +
> +                lowest_rq = cpu_rq(cpu);
> +                BUG_ON(this_rq == lowest_rq);
> +
> +                /* Otherwise, we can simply grab the new RQ */
> +                if (lock_migration_target(task, lowest_rq))
> +                        return lowest_rq;
> +                else
> +                        return NULL;
> +        }
> +
>          cpus_and(cpu_mask, cpu_online_map, task->cpus_allowed);
>
>          for (tries = 0; tries < RT_MAX_TRIES; tries++) {
> @@ -324,22 +385,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
>                          break;
>
>                  /* if the prio of this runqueue changed, try again */
> -                if (double_lock_balance(this_rq, lowest_rq)) {
> -                        /*
> -                         * We had to unlock the run queue. In
> -                         * the mean time, task could have
> -                         * migrated already or had its affinity changed.
> -                         * Also make sure that it wasn't scheduled on its rq.
> -                         */
> -                        if (unlikely(task_rq(task) != this_rq ||
> -                                     !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
> -                                     task_running(this_rq, task) ||
> -                                     !task->se.on_rq)) {
> -                                spin_unlock(&lowest_rq->lock);
> -                                lowest_rq = NULL;
> -                                break;
> -                        }
> -                }
> +                if (!lock_migration_target(task, lowest_rq))
> +                        return NULL;

I don't like this encapsulating of the double_lock_balance(). There's a
reason I kept it out like this. Mainly because this is the source of
most (if not all) of the race conditions I fought. So this is a very
sensitive area to touch.

In fact, I see a few races already introduced by this patch. One is
that you took out the "!task->se.on_rq" test. Which means that a task
could have run and deactivated itself (TASK_UNINTERRUPTIBLE) and you
just added it to a run queue. (I hit that race ;-)

So keep that code out in the open where it belongs. It's very
sensitive.
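
For reference, if the helper were kept at all, the recheck after
double_lock_balance() would at least need that test back, roughly as in
the sketch below (this only mirrors the removed lines, it adds nothing
new):

static int lock_migration_target(struct task_struct *p, struct rq *target_rq)
{
        struct rq *this_rq = task_rq(p);

        if (double_lock_balance(this_rq, target_rq)) {
                /*
                 * The lock was dropped: the task may have migrated,
                 * changed affinity, started running, or run and
                 * deactivated itself (dropped off the runqueue).
                 */
                if (unlikely(task_running(this_rq, p) ||
                             task_rq(p) != this_rq ||
                             !cpu_isset(target_rq->cpu, p->cpus_allowed) ||
                             !p->se.on_rq)) {
                        spin_unlock(&target_rq->lock);
                        return 0;
                }
        }

        return 1;
}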
>
>                  /* If this rq is still suitable use it. */
>                  if (lowest_rq->rt.highest_prio > task->prio)
> @@ -658,6 +705,31 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
>          }
>  }
>
> +#ifdef CONFIG_SMP
> +static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t new_mask)
> +{
> +        int weight = cpus_weight(new_mask);
> +
> +        BUG_ON(!rt_task(p));
> +
> +        /*
> +         * Update the migration status of the RQ if we have an RT task
> +         * which is running AND changing its weight value.
> +         */
> +        if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
> +                struct rq *rq = task_rq(p);
> +
> +                if ((p->nr_cpus_allowed <= 1) && (weight > 1))
> +                        inc_rt_migration(p, rq);
> +                else if ((p->nr_cpus_allowed > 1) && (weight <= 1))
> +                        dec_rt_migration(p, rq);
> +        }
> +
> +        p->cpus_allowed = new_mask;
> +        p->nr_cpus_allowed = weight;
> +}
> +#endif
> +
>  static struct sched_class rt_sched_class __read_mostly = {
>          .enqueue_task           = enqueue_task_rt,
>          .dequeue_task           = dequeue_task_rt,
> @@ -671,4 +743,8 @@ static struct sched_class rt_sched_class __read_mostly = {
>          .load_balance           = load_balance_rt,
>
>          .task_tick              = task_tick_rt,
> +
> +#ifdef CONFIG_SMP
> +        .set_cpus_allowed       = set_cpus_allowed_rt,
> +#endif

Hmm, I must have missed where you update the mask at time of boosting.
Anyway, we shouldn't. The set_cpus_allowed should be done for all
tasks. Now you can have a handler to call for when a task changes
class and changes weight.

-- Steve
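
To illustrate the last point above, such a class-change handler could
look roughly like the sketch below. The hook name and its wiring are
hypothetical, not something in the posted patch: the idea is only that
the core set_cpus_allowed() caches the weight for every task, and the
RT class fixes up its migration accounting when a task enters the
class (e.g. via priority boosting).

/*
 * Hypothetical sketch: invoked when a task switches into the RT class.
 * Uses the weight already cached by set_cpus_allowed() to update the
 * runqueue's migration accounting.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
        if (p->se.on_rq && p->nr_cpus_allowed > 1)
                inc_rt_migration(p, rq);
}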