BTW, if I were a reader, I'd be asking "where are the cold hard numbers showing that this is a good thing to do?"  They're missing because I don't have any.

This is fallout from turning spinlocks back into 100% spinning locks for rt explorations.  That _can_ improve semop throughput up to 7-fold when combined with an "OK boss, how long may I spin before I have to check for other runnable tasks at my prio?" knob, but for one thing it adds a little overhead to the general case (making it a FAIL), and for another it will chew huge amounts of CPU in the heavily contended case just to close off most priority inversions, should a high priority task block briefly while a low priority task is runnable.  Preemptible spinning locks are pure evil.

Doing this wake to head _should_ speed up lock turnaround, but I have zero hard evidence that it really makes any difference in practice, so a yawn/NAK is perfectly understandable :)  These are only the leftover bits that I think might be worth a ponder or two after declaring preemptible spinning locks too evil to live.
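In case the intent isn't obvious from the diff below, here's a rough user-space sketch of what "wake to head" is supposed to buy.  This is not kernel code, every name in it is invented purely for illustration: the waiter that was running (i.e. at the head of its priority list) when it hit the lock goes back to the head on wakeup instead of the tail, so it's next in line for the CPU and can turn the lock over right away.

/*
 * Toy model (user space, not kernel code) of "wake to head".
 * Build and run with e.g.: gcc -Wall toy_wake_to_head.c && ./a.out
 */
#include <stdio.h>
#include <string.h>

#define MAX_TASKS 8

/* Trivial stand-in for one priority's FIFO list of runnable tasks. */
struct toy_rq {
        const char *task[MAX_TASKS];
        int nr;
};

/* Plain wakeup: go to the back of the line. */
static void enqueue_tail(struct toy_rq *rq, const char *name)
{
        rq->task[rq->nr++] = name;
}

/* "Wake to head": the lock sleeper goes back where it was, at the head. */
static void enqueue_head(struct toy_rq *rq, const char *name)
{
        memmove(&rq->task[1], &rq->task[0], rq->nr * sizeof(rq->task[0]));
        rq->task[0] = name;
        rq->nr++;
}

static void print_rq(const char *what, const struct toy_rq *rq)
{
        int i;

        printf("%s:", what);
        for (i = 0; i < rq->nr; i++)
                printf(" %s", rq->task[i]);
        printf("\n");
}

int main(void)
{
        struct toy_rq tail_rq = { .nr = 0 }, head_rq = { .nr = 0 };

        /* B and C are already runnable at the waiter's priority. */
        enqueue_tail(&tail_rq, "B");
        enqueue_tail(&tail_rq, "C");
        enqueue_tail(&head_rq, "B");
        enqueue_tail(&head_rq, "C");

        /*
         * A was running (head of the list) when it blocked on the lock.
         * A plain wakeup puts it behind B and C; wake to head puts it
         * back where it was, so it gets the CPU (and the lock) next.
         */
        enqueue_tail(&tail_rq, "A");
        enqueue_head(&head_rq, "A");

        print_rq("wake to tail", &tail_rq);     /* B C A */
        print_rq("wake to head", &head_rq);     /* A B C */
        return 0;
}

That's the entire argument: with tail insertion the waiter sits behind every other runnable task at its prio before it can take the lock, with head insertion it runs next.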
On Sun, 2013-04-14 at 15:34 +0200, Mike Galbraith wrote:
> If a task blocks on a spinlock, give the CPU back as soon as possible so
> we can turn over the lock as quickly as possible. The task was at HEAD
> when it blocked, put it back, and tell everyone else to get the hell out
> of the way.
>
> Signed-off-by: Mike Galbraith <bitbucket@xxxxxxxxx>
> ---
>  include/linux/sched.h |    1 +
>  kernel/rtmutex.c      |   13 +++++++++++--
>  kernel/sched/core.c   |    9 +++++++--
>  kernel/sched/fair.c   |    4 ++++
>  kernel/sched/rt.c     |   29 +++++++++++++++++++++++++++--
>  5 files changed, 50 insertions(+), 6 deletions(-)
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1064,6 +1064,7 @@ struct sched_domain;
>  #define WF_FORK          0x02    /* child wakeup after fork */
>  #define WF_MIGRATED      0x04    /* internal use, task got migrated */
>  #define WF_LOCK_SLEEPER  0x08    /* wakeup spinlock "sleeper" */
> +#define WF_REQUEUE       0x10    /* requeue spinlock "sleeper" */
>
>  #define ENQUEUE_WAKEUP   1
>  #define ENQUEUE_HEAD     2
> --- a/kernel/rtmutex.c
> +++ b/kernel/rtmutex.c
> @@ -722,7 +722,7 @@ static void noinline __sched rt_spin_lo
>  {
>          struct task_struct *lock_owner, *self = current;
>          struct rt_mutex_waiter waiter, *top_waiter;
> -        int ret;
> +        int ret, wait, cpu = raw_smp_processor_id();
>
>          rt_mutex_init_waiter(&waiter, true);
>
> @@ -757,12 +757,21 @@ static void noinline __sched rt_spin_lo
>
>          top_waiter = rt_mutex_top_waiter(lock);
>          lock_owner = rt_mutex_owner(lock);
> +        wait = top_waiter != &waiter;
> +
> +        /*
> +         * If we would preempt the lock owner, just preempt ourselves:
> +         * the now boosted lock owner is queued to the queue head, so
> +         * when we release the wait lock, the lock owner runs.
> +         */
> +        if (!wait && task_cpu(lock_owner) == cpu)
> +                set_tsk_need_resched(self);
>
>          raw_spin_unlock(&lock->wait_lock);
>
>          debug_rt_mutex_print_deadlock(&waiter);
>
> -        if (top_waiter != &waiter || adaptive_wait(lock, lock_owner))
> +        if (wait || adaptive_wait(lock, lock_owner))
>                  schedule_rt_mutex(lock);
>
>          raw_spin_lock(&lock->wait_lock);
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1611,7 +1611,12 @@ EXPORT_SYMBOL(wake_up_process);
>   */
>  int wake_up_lock_sleeper(struct task_struct *p)
>  {
> -        return try_to_wake_up(p, TASK_ALL, WF_LOCK_SLEEPER);
> +        int flags = WF_LOCK_SLEEPER;
> +
> +        if (rt_task(p))
> +                flags |= WF_REQUEUE;
> +
> +        return try_to_wake_up(p, TASK_ALL, flags);
>  }
>
>  int wake_up_state(struct task_struct *p, unsigned int state)
> @@ -3815,7 +3820,7 @@ void rt_mutex_setprio(struct task_struct
>          if (running)
>                  p->sched_class->set_curr_task(rq);
>          if (on_rq)
> -                enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
> +                enqueue_task(rq, p, ENQUEUE_HEAD);
>
>          check_class_changed(rq, p, prev_class, oldprio);
>  out_unlock:
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3522,6 +3522,10 @@ static void check_preempt_wakeup(struct
>          if (unlikely(se == pse))
>                  return;
>
> +        /* Preempting SCHED_OTHER lock holders harms throughput for no good reason */
> +        if (__migrate_disabled(curr))
> +                return;
> +
>          /*
>           * This is possible from callers such as move_task(), in which we
>           * unconditionally check_preempt_curr() after an enqueue (which may have
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1180,6 +1180,10 @@ enqueue_task_rt(struct rq *rq, struct ta
>          if (flags & ENQUEUE_WAKEUP)
>                  rt_se->timeout = 0;
>
> +        /* The wakee is a FIFO lock sleeper */
> +        if (flags & WF_REQUEUE)
> +                flags |= ENQUEUE_HEAD;
> +
>          enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
>
>          if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
> @@ -1295,8 +1299,29 @@ select_task_rq_rt(struct task_struct *p,
>          return cpu;
>  }
>
> -static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
> +static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p, int wake_flags)
>  {
> +#ifdef CONFIG_PREEMPT_RT_BASE
> +        if (wake_flags & WF_REQUEUE) {
> +                if (!p->on_cpu)
> +                        requeue_task_rt(rq, p, 1);
> +
> +                /*
> +                 * The lock owner was here first, top waiter
> +                 * must follow.  If the owner was PI boosted,
> +                 * it's gone RSN.  All others need to get off
> +                 * this CPU ASAP, this waiter had it first.
> +                 */
> +                if (rq == this_rq())
> +                        requeue_task_rt(rq, rq->curr, 1);
> +                else if (__migrate_disabled(rq->curr))
> +                        set_tsk_need_resched(rq->curr);
> +                else
> +                        resched_task(rq->curr);
> +
> +                return;
> +        }
> +#endif
>          if (rq->curr->nr_cpus_allowed == 1)
>                  return;
>
> @@ -1342,7 +1367,7 @@ static void check_preempt_curr_rt(struct
>           * task.
>           */
>          if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
> -                check_preempt_equal_prio(rq, p);
> +                check_preempt_equal_prio(rq, p, flags);
>  #endif
>  }
>
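For anyone skimming rather than reading the diff: below is another rough standalone sketch (again user space, not kernel code; only the flag values are lifted from the patch, the helpers are invented for illustration) of how WF_REQUEUE is meant to travel.  wake_up_lock_sleeper() tags RT lock "sleepers" with it, and the enqueue_task_rt() hunk turns that tag into an enqueue at the head of the priority list; SCHED_OTHER wakees are left alone.

/* Toy model of the WF_REQUEUE plumbing, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define WF_LOCK_SLEEPER 0x08    /* wakeup spinlock "sleeper" */
#define WF_REQUEUE      0x10    /* requeue spinlock "sleeper" */
#define ENQUEUE_WAKEUP  1
#define ENQUEUE_HEAD    2

struct toy_task {
        bool rt;                /* stand-in for rt_task(p) */
};

/* Stand-in for the wake_up_lock_sleeper() hunk: only RT tasks get WF_REQUEUE. */
static int lock_sleeper_wake_flags(const struct toy_task *p)
{
        int flags = WF_LOCK_SLEEPER;

        if (p->rt)
                flags |= WF_REQUEUE;
        return flags;
}

/* Stand-in for the enqueue_task_rt() hunk: WF_REQUEUE => ENQUEUE_HEAD. */
static int rt_enqueue_flags(int wake_flags)
{
        int flags = ENQUEUE_WAKEUP;

        if (wake_flags & WF_REQUEUE)
                flags |= ENQUEUE_HEAD;
        return flags;
}

int main(void)
{
        struct toy_task fifo = { .rt = true }, other = { .rt = false };

        printf("RT lock sleeper enqueued at head: %s\n",
               rt_enqueue_flags(lock_sleeper_wake_flags(&fifo)) & ENQUEUE_HEAD ?
               "yes" : "no");
        printf("SCHED_OTHER lock sleeper enqueued at head: %s\n",
               rt_enqueue_flags(lock_sleeper_wake_flags(&other)) & ENQUEUE_HEAD ?
               "yes" : "no");
        return 0;
}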