On Thu, 4 Sep 2008, Gregory Haskins wrote:
> 
> However, the current implementation suffers from a limitation in the
> push logic.  Once overloaded, all tasks (other than current) on the
> RQ are analyzed on every push operation, even if they were previously
> found unpushable (due to affinity, etc.).  What's more, the operation
> stops at the first unpushable task and will not look at items lower
> in the queue.  This causes two problems:
> 
> 1) We can have the same tasks analyzed over and over again during each
>    push, which extends the fast path in the scheduler for no gain.
>    Consider an RQ that has dozens of tasks bound to a core: each one
>    of those tasks will be encountered and skipped on every push
>    operation while they are queued.
> 
> 2) There may be lower-priority tasks under the unpushable task that
>    could have been successfully pushed, but they will never be
>    considered until either the unpushable task is cleared or a pull
>    operation succeeds.  The net result is a potential latency source
>    for mid-priority tasks.

Yep, we knew of these limitations when we created it. It was a big TODO.
Thanks for actually doing it. It's been what, a year already? ;-)

> 
> This patch aims to rectify these two conditions by introducing a new
> priority-sorted list: "pushable_tasks".  A task is added to the list
> each time it is activated or preempted, and removed from the list any
> time it is deactivated, made current, or fails to push.
> 
> This works because a push only needs to be attempted once per task.
> After an initial failure to push, the other cpus will eventually try
> to pull the task when the conditions are proper.  This also solves the
> problem that we don't completely analyze all tasks due to encountering
> an unpushable task; now every task will have a push attempted (when
> appropriate).
> 
> This reduces latency both by shortening the critical section of the
> rq->lock for certain workloads, and by making sure the algorithm
> considers all eligible tasks in the system.
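[ Editor's aside: to make the mechanism concrete, here is a minimal,
  self-contained userspace sketch of the idea -- a priority-sorted
  "pushable" list where the push path always takes the head, and a
  task that fails to push is simply dropped from the list instead of
  being rescanned.  struct task, pushable_enqueue(), pushable_dequeue()
  and try_push() below are illustrative stand-ins, not the kernel API;
  the real patch uses the plist helpers on rq->rt.pushable_tasks. ]

/*
 * Illustrative userspace sketch only -- not kernel code.
 */
#include <stdio.h>

struct task {
        int prio;               /* lower value = higher priority */
        struct task *next;
};

/* Insert in priority order, analogous to plist_add() keyed on p->prio. */
static void pushable_enqueue(struct task **head, struct task *p)
{
        while (*head && (*head)->prio <= p->prio)
                head = &(*head)->next;
        p->next = *head;
        *head = p;
}

/* Pop the highest-priority entry, analogous to pick_next_pushable_task(). */
static struct task *pushable_dequeue(struct task **head)
{
        struct task *p = *head;

        if (p)
                *head = p->next;
        return p;
}

/* Stand-in for a push attempt; pretend odd priorities cannot migrate. */
static int try_push(struct task *p)
{
        return (p->prio % 2) == 0;
}

int main(void)
{
        struct task tasks[] = { {3, NULL}, {1, NULL}, {2, NULL}, {5, NULL} };
        struct task *head = NULL, *p;
        size_t i;

        for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
                pushable_enqueue(&head, &tasks[i]);

        /*
         * Each task gets exactly one push attempt, highest priority
         * first.  A failed push leaves the task off the list (a later
         * pull will pick it up), so the same unpushable task is never
         * rescanned -- the property the changelog above relies on.
         */
        while ((p = pushable_dequeue(&head)) != NULL)
                printf("prio %d: %s\n", p->prio,
                       try_push(p) ? "pushed" : "left for pull");

        return 0;
}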
> 
> Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
> CC: Steven Rostedt <srostedt@xxxxxxxxxx>
> ---
> 
>  include/linux/init_task.h |    1 
>  include/linux/sched.h     |    1 
>  kernel/sched.c            |    4 ++
>  kernel/sched_rt.c         |  110 +++++++++++++++++++++++++++++++++++++++++----
>  4 files changed, 106 insertions(+), 10 deletions(-)
> 
> diff --git a/include/linux/init_task.h b/include/linux/init_task.h
> index 021d8e7..7f910f4 100644
> --- a/include/linux/init_task.h
> +++ b/include/linux/init_task.h
> @@ -140,6 +140,7 @@ extern struct group_info init_groups;
>          .nr_cpus_allowed = NR_CPUS,                             \
>      },                                                          \
>      .tasks          = LIST_HEAD_INIT(tsk.tasks),                \
> +    .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
>      .ptraced        = LIST_HEAD_INIT(tsk.ptraced),              \
>      .ptrace_entry   = LIST_HEAD_INIT(tsk.ptrace_entry),         \
>      .real_parent    = &tsk,                                     \
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index cf8cd8c..3480c8a 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1078,6 +1078,7 @@ struct task_struct {
>  #endif
> 
>      struct list_head tasks;
> +    struct plist_node pushable_tasks;
> 
>      struct mm_struct *mm, *active_mm;
> 
> diff --git a/kernel/sched.c b/kernel/sched.c
> index ddc3877..dbb9e8c 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -447,6 +447,7 @@ struct rt_rq {
>  #ifdef CONFIG_SMP
>      unsigned long rt_nr_migratory;
>      int overloaded;
> +    struct plist_head pushable_tasks;
>  #endif
>      int rt_throttled;
>      u64 rt_time;
> @@ -2383,6 +2384,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
>      /* Want to start with kernel preemption disabled. */
>      task_thread_info(p)->preempt_count = 1;
>  #endif
> +    plist_node_init(&p->pushable_tasks, MAX_PRIO);
> +
>      put_cpu();
>  }
> 
> @@ -8009,6 +8012,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
>  #ifdef CONFIG_SMP
>      rt_rq->rt_nr_migratory = 0;
>      rt_rq->overloaded = 0;
> +    plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
>  #endif
> 
>      rt_rq->rt_time = 0;
> diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
> index 277ccd2..b22d18a 100644
> --- a/kernel/sched_rt.c
> +++ b/kernel/sched_rt.c
> @@ -49,6 +49,24 @@ static void update_rt_migration(struct rq *rq)
>          rq->rt.overloaded = 0;
>      }
>  }
> +
> +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
> +{
> +    plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
> +    plist_node_init(&p->pushable_tasks, p->prio);
> +    plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
> +}
> +
> +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
> +{
> +    plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
> +}
> +
> +#else
> +
> +#define enqueue_pushable_task(rq, p) do { } while (0)
> +#define dequeue_pushable_task(rq, p) do { } while (0)
> +
>  #endif /* CONFIG_SMP */
> 
>  static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
> @@ -669,6 +687,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
> 
>      enqueue_rt_entity(rt_se);
> 
> +    if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
> +        enqueue_pushable_task(rq, p);
> +
>      inc_cpu_load(rq, p->se.load.weight);
>  }
> 
> @@ -679,6 +700,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
>      update_curr_rt(rq);
>      dequeue_rt_entity(rt_se);
> 
> +    dequeue_pushable_task(rq, p);
> +
>      dec_cpu_load(rq, p->se.load.weight);
>  }
> 
> @@ -824,7 +847,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
>      return next;
>  }
> 
> -static struct task_struct *pick_next_task_rt(struct rq *rq)
> +static struct task_struct *_pick_next_task_rt(struct rq *rq)
>  {
>      struct sched_rt_entity *rt_se;
>      struct task_struct *p;
> @@ -846,6 +869,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
> 
>      p = rt_task_of(rt_se);
>      p->se.exec_start = rq->clock;
> +
> +    return p;
> +}
> +
> +static struct task_struct *pick_next_task_rt(struct rq *rq)
> +{
> +    struct task_struct *p = _pick_next_task_rt(rq);
> +
> +    /* The running task is never eligible for pushing */
> +    if (p)
> +        dequeue_pushable_task(rq, p);
> +
>      return p;
>  }
> 
> @@ -853,6 +888,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
>  {
>      update_curr_rt(rq);
>      p->se.exec_start = 0;
> +
> +    /*
> +     * The previous task needs to be made eligible for pushing
> +     * if it is still active
> +     */
> +    if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
> +        enqueue_pushable_task(rq, p);
>  }
> 
>  #ifdef CONFIG_SMP
> @@ -1031,6 +1073,28 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
>      return lowest_rq;
>  }
> 
> +static inline int has_pushable_tasks(struct rq *rq)
> +{
> +    return !plist_head_empty(&rq->rt.pushable_tasks);
> +}
> +
> +static struct task_struct *pick_next_pushable_task(struct rq *rq)
> +{
> +    struct task_struct *p;
> +
> +    if (!has_pushable_tasks(rq))
> +        return NULL;
> +
> +    p = plist_first_entry(&rq->rt.pushable_tasks,
> +                          struct task_struct, pushable_tasks);
> +
> +    BUG_ON(rq->cpu != task_cpu(p));
> +    BUG_ON(task_current(rq, p));
> +    BUG_ON(p->rt.nr_cpus_allowed <= 1);

I have two new BUG_ONs for you. (This isn't a fast path)

    BUG_ON(!p->se.on_rq);
    BUG_ON(!rt_task(p));

> +
> +    return p;
> +}
> +
>  /*
>   * If the current CPU has more than one RT task, see if the non
>   * running task can migrate over to a CPU that is running a task
> @@ -1040,13 +1104,12 @@ static int push_rt_task(struct rq *rq)
>  {
>      struct task_struct *next_task;
>      struct rq *lowest_rq;
> -    int ret = 0;
>      int paranoid = RT_MAX_TRIES;
> 
>      if (!rq->rt.overloaded)
>          return 0;
> 
> -    next_task = pick_next_highest_task_rt(rq, -1);
> +    next_task = pick_next_pushable_task(rq);
>      if (!next_task)
>          return 0;
> 
> @@ -1078,12 +1141,19 @@ static int push_rt_task(struct rq *rq)
>           * so it is possible that next_task has changed.
>           * If it has, then try again.
>           */
> -        task = pick_next_highest_task_rt(rq, -1);
> +        task = pick_next_pushable_task(rq);
>          if (unlikely(task != next_task) && task && paranoid--) {
>              put_task_struct(next_task);
>              next_task = task;
>              goto retry;
>          }
> +
> +        /*
> +         * Once we have failed to push this task, we will not
> +         * try again, since the other cpus will pull from us
> +         * when they are ready
> +         */
> +        dequeue_pushable_task(rq, next_task);
>          goto out;
>      }
> 
> @@ -1095,11 +1165,10 @@ static int push_rt_task(struct rq *rq)
> 
>      double_unlock_balance(rq, lowest_rq);
> 
> -    ret = 1;
>  out:
>      put_task_struct(next_task);
> 
> -    return ret;
> +    return 1;
>  }
> 
>  /*
> @@ -1128,7 +1197,7 @@ static int pull_rt_task(struct rq *this_rq)
>      if (likely(!rt_overloaded(this_rq)))
>          return 0;
> 
> -    next = pick_next_task_rt(this_rq);
> +    next = _pick_next_task_rt(this_rq);
> 
>      for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
>          if (this_cpu == cpu)
> @@ -1145,7 +1214,7 @@ static int pull_rt_task(struct rq *this_rq)
>          if (double_lock_balance(this_rq, src_rq)) {
>              struct task_struct *old_next = next;
> 
> -            next = pick_next_task_rt(this_rq);
> +            next = _pick_next_task_rt(this_rq);
>              if (next != old_next)
>                  ret = 1;
>          }
> @@ -1217,7 +1286,7 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
>   */
>  static int needs_post_schedule_rt(struct rq *rq)
>  {
> -    return rq->rt.overloaded ? 1 : 0;
> +    return has_pushable_tasks(rq);
>  }
> 
>  static void post_schedule_rt(struct rq *rq)
> @@ -1240,7 +1309,7 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
>  {
>      if (!task_running(rq, p) &&
>          !test_tsk_need_resched(rq->curr) &&
> -        rq->rt.overloaded &&
> +        has_pushable_tasks(rq) &&
>          p->rt.nr_cpus_allowed > 1)
>          push_rt_tasks(rq);
>  }
> @@ -1277,6 +1346,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
>      if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
>          struct rq *rq = task_rq(p);
> 
> +        if (!task_current(rq, p)) {
> +            /*
> +             * Make sure we dequeue this task from the pushable list
> +             * before going further.  It will either remain off of
> +             * the list because we are no longer pushable, or it
> +             * will be requeued.
> +             */
> +            if (p->rt.nr_cpus_allowed > 1)
> +                dequeue_pushable_task(rq, p);
> +
> +            /*
> +             * Requeue if our weight is changing and still > 1
> +             */
> +            if (weight > 1)
> +                enqueue_pushable_task(rq, p);
> +
> +        }
> +
>          if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
>              rq->rt.rt_nr_migratory++;
>          } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
> @@ -1453,6 +1540,9 @@ static void set_curr_task_rt(struct rq *rq)
>      struct task_struct *p = rq->curr;
> 
>      p->se.exec_start = rq->clock;
> +
> +    /* The running task is never eligible for pushing */
> +    dequeue_pushable_task(rq, p);
>  }
> 
>  static const struct sched_class rt_sched_class = {
> 

OK, I like the patch, but I'm not 100% sure that it doesn't have bugs.
I'll ACK it, but it definitely needs to be heavily tested before going
mainline. But for linux-tip, -mm, or linux-next, I think it is, on the
surface, good to go (with the added BUG_ONs).

I'll pull it into -rt as well.

Acked-by: Steven Rostedt <srostedt@xxxxxxxxxx>

Thanks,

-- Steve
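
[ Editor's note: for reference, this is roughly what pick_next_pushable_task()
  would look like with the two extra assertions suggested in the review folded
  in.  It is simply the hunk from the patch above plus the additional BUG_ONs,
  assembled here for readability; it is not a separately tested change. ]

static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
    struct task_struct *p;

    if (!has_pushable_tasks(rq))
        return NULL;

    p = plist_first_entry(&rq->rt.pushable_tasks,
                          struct task_struct, pushable_tasks);

    /* Sanity checks already present in the patch ... */
    BUG_ON(rq->cpu != task_cpu(p));
    BUG_ON(task_current(rq, p));
    BUG_ON(p->rt.nr_cpus_allowed <= 1);

    /* ... plus the two extra ones suggested in the review. */
    BUG_ON(!p->se.on_rq);
    BUG_ON(!rt_task(p));

    return p;
}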