Re: rt: rtmutex experiment doubled tbench throughput

On Fri, 2013-02-22 at 17:36 +0100, Mike Galbraith wrote:
> Greetings,
> 
> A user reported surprise at seeing a low priority rt task, awakened in
> the same semop as a high priority task, run before the high priority
> task could finish what it was supposed to do and put itself to sleep.
> The reason for that happening is that it, and a few of its brothers
> (highly synchronized task schedulers), enter semop to diddle the same
> array at the same time, meet at a spinlock (rtmutex) and block.
> 
> Now you could say "Waking task foo and depending on its priority being
> lower than task bar's to keep it off the CPU is your bad if you do so
> without a no-block guarantee in hand for everything task bar does."
> which I did, that and "The semop syscall is not atomic per POSIX, it's
> the array operations that are atomic.  Blocking on a contended lock is
> fine".
> 
> Anyway, I looked at rtmutex.c and started pondering, wondering if I
> could un-surprise the user without breaking the world, and maybe even
> make non-rt stuff perform a little better.
> 
> Numerous deadlocks and cool explosions later...
> 
> This seems to work, no harm to 60 core jitter testcase detected, and it
> doubled tbench throughput.  But be advised, your breakfast may emigrate,
> or a POSIX swat team may kick in your door if you even _look_ at this. 
> 
> 64 core DL980G2, 3.0.61-rt85
> 
> vogelweide:/:[0]# tbench.sh 128 10
> waiting for connections
> dbench version 3.04 - Copyright Andrew Tridgell 1999-2004
> 
> Running for 10 seconds with load '/usr/share/dbench/client.txt' and minimum warmup 2 secs
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^128 clients started
>  128      5881  6239.48 MB/sec  warmup   1 sec   
>  128     17978  4169.33 MB/sec  execute   1 sec   
>  128     24051  4173.54 MB/sec  execute   2 sec   
>  128     30131  4185.35 MB/sec  execute   3 sec   
>  128     36145  4173.59 MB/sec  execute   4 sec   
>  128     42233  4182.69 MB/sec  execute   5 sec   
>  128     48293  4181.18 MB/sec  execute   6 sec   
>  128     54407  4185.10 MB/sec  execute   7 sec   
>  128     60445  4183.09 MB/sec  execute   8 sec   
>  128     66482  4179.41 MB/sec  execute   9 sec   
>  128     72543  4179.37 MB/sec  cleanup  10 sec   
>  128     72543  4174.07 MB/sec  cleanup  10 sec   
> 
> Throughput 4179.49 MB/sec 128 procs
> 924536 packets/sec
> /root/bin/tbench.sh: line 33: 11292 Killed                  tbench_srv
> 
> 
> vogelweide:/:[0]# echo 1 >  /proc/sys/kernel/sched_rt_spin_yield
> vogelweide:/:[0]# tbench.sh 128 10
> waiting for connections
> dbench version 3.04 - Copyright Andrew Tridgell 1999-2004
> 
> Running for 10 seconds with load '/usr/share/dbench/client.txt' and minimum warmup 2 secs
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^128 clients started
>  128     14360  12090.05 MB/sec  warmup   1 sec   
>  128     42573  9740.67 MB/sec  execute   1 sec   
>  128     56661  9736.85 MB/sec  execute   2 sec   
>  128     70752  9723.79 MB/sec  execute   3 sec   
>  128     84850  9724.82 MB/sec  execute   4 sec   
>  128     98936  9720.49 MB/sec  execute   5 sec   
>  128    113021  9721.15 MB/sec  execute   6 sec   
>  128    127111  9723.26 MB/sec  execute   7 sec   
>  128    141203  9722.81 MB/sec  execute   8 sec   
>  128    155300  9722.90 MB/sec  execute   9 sec   
>  128    169392  9727.48 MB/sec  cleanup  10 sec   
>  128    169392  9712.52 MB/sec  cleanup  10 sec   
> 

Wow, that's some result!

> Throughput 9727.56 MB/sec 128 procs
> 2150132 packets/sec
> /root/bin/tbench.sh: line 34: 11568 Killed                  tbench_srv
> 
> ---
>  include/linux/sched.h   |    7 +++++--
>  kernel/rtmutex.c        |   48 +++++++++++++++++++++++++++++++++++++++++-------
>  kernel/rtmutex_common.h |   42 +++++++++++++++++++++++++++++++++++++++---
>  kernel/sched.c          |   31 ++++++++++++++++++++++++++++---
>  kernel/sched_fair.c     |    5 +++++
>  kernel/sched_rt.c       |    3 +++
>  kernel/sysctl.c         |   11 +++++++++++
>  7 files changed, 132 insertions(+), 15 deletions(-)
> 
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2083,6 +2083,9 @@ extern unsigned int sysctl_sched_latency
>  extern unsigned int sysctl_sched_min_granularity;
>  extern unsigned int sysctl_sched_wakeup_granularity;
>  extern unsigned int sysctl_sched_child_runs_first;
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +extern unsigned int sysctl_sched_rt_spin_yield;
> +#endif

I'm not sure we want this. I think it makes sense to either always do
it, or never do it.

>  
>  enum sched_tunable_scaling {
>  	SCHED_TUNABLESCALING_NONE,
> @@ -2146,12 +2149,12 @@ extern unsigned int sysctl_sched_cfs_ban
>  #endif
>  
>  #ifdef CONFIG_RT_MUTEXES
> -extern void task_setprio(struct task_struct *p, int prio);
> +extern void task_setprio(struct task_struct *p, int prio, int yield);
>  extern int rt_mutex_getprio(struct task_struct *p);
>  extern int rt_mutex_check_prio(struct task_struct *task, int newprio);
>  static inline void rt_mutex_setprio(struct task_struct *p, int prio)
>  {
> -	task_setprio(p, prio);
> +	task_setprio(p, prio, 0);
>  }
>  extern void rt_mutex_adjust_pi(struct task_struct *p);
>  static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
> --- a/kernel/rtmutex.c
> +++ b/kernel/rtmutex.c
> @@ -689,7 +689,7 @@ static inline void rt_spin_lock_fastunlo
>   * on rcu_read_lock() and the check against the lock owner.
>   */
>  static int adaptive_wait(struct rt_mutex *lock,
> -			 struct task_struct *owner)
> +			 struct task_struct *owner, int spin)
>  {
>  	int res = 0;
>  
> @@ -702,8 +702,8 @@ static int adaptive_wait(struct rt_mutex
>  		 * checking the above to be valid.
>  		 */
>  		barrier();
> -		if (!owner->on_cpu) {
> -			res = 1;
> +		if (!owner || !owner->on_cpu) {
> +			res = !spin;
>  			break;

Hmm, if spin is set, this function *always* returns zero. Ugh, don't do
this; there's a better solution below (where adaptive_wait() is called).

>  		}
>  		cpu_relax();
> @@ -713,7 +713,7 @@ static int adaptive_wait(struct rt_mutex
>  }
>  #else
>  static int adaptive_wait(struct rt_mutex *lock,
> -			 struct task_struct *orig_owner)
> +			 struct task_struct *orig_owner, int spin)
>  {
>  	return 1;
>  }
> @@ -733,7 +733,7 @@ static void  noinline __sched rt_spin_lo
>  {
>  	struct task_struct *lock_owner, *self = current;
>  	struct rt_mutex_waiter waiter, *top_waiter;
> -	int ret;
> +	int ret, wait, rt_spin = 0, other_spin = 0, cpu;
>  
>  	rt_mutex_init_waiter(&waiter, true);
>  
> @@ -761,6 +761,17 @@ static void  noinline __sched rt_spin_lo
>  	ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0);
>  	BUG_ON(ret);
>  
> +	/* basic spin/yield sanity checks */
> +	if (rt_spin_yield_enabled()) {
> +		rt_spin = !self->saved_state;
> +		/* Here there be dragons */
> +		rt_spin &= !(self->flags & PF_EXITING);
> +		other_spin = rt_spin;
> +		rt_spin &= rt_task(self);
> +		other_spin &= !rt_spin;

The multiple assignments are very ugly.

	int self_state = !self->saved_state && !(self->flags & PF_EXITING);

	rt_spin = self_state && rt_task(self);
	other_spin = self_state && !rt_task(self);

Much easier to read.


> +	}
> +	cpu = raw_smp_processor_id();
> +
>  	for (;;) {
>  		/* Try to acquire the lock again. */
>  		if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
> @@ -769,12 +780,25 @@ static void  noinline __sched rt_spin_lo
>  		top_waiter = rt_mutex_top_waiter(lock);
>  		lock_owner = rt_mutex_owner(lock);
>  
> +		if (rt_spin)
> +			wait = 1;
> +		else
> +			wait = top_waiter != &waiter;
> +
> +		/* SCHED_OTHER can laterally steal, let them try */
> +		if (other_spin) {
> +			wait &= task_cpu(top_waiter->task) == cpu;
> +			wait |= top_waiter->task->prio < self->prio;
> +			wait |= lock_owner && !lock_owner->on_cpu;
> +			wait |= lock_owner && !lock_owner->prio < self->prio;

Again very ugly and hard to read.

Let's see. If other_spin is set, this also means that rt_spin is not, so
wait is only set if top_waiter != &waiter (current is not top_waiter).
OK, if current is top waiter, we don't want to wait. That makes sense.

Now, if top_waiter->task cpu != current cpu, we don't wait. Ah, even if
we are not the top waiter, we can still spin if the top_waiter is on
another CPU. But if top_waiter is higher priority, then we still wait.
Or if the lock owner is not on a CPU, or if the lock owner is not higher
priority than current???

I'm confused as to why you did this. This requires a hell of a lot of
explanation.
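
Is this roughly the intent, spelled out as one test? Untested sketch,
and note that in the last line as posted the ! binds to lock_owner->prio;
I assume you meant !(lock_owner->prio < self->prio):

	if (other_spin) {
		/* Spin unless the top waiter is a peer on this CPU... */
		wait = top_waiter != &waiter &&
		       task_cpu(top_waiter->task) == cpu;
		/* ...but still wait if the top waiter outranks us, */
		wait |= top_waiter->task->prio < self->prio;
		/* or the owner is off CPU, so spinning buys nothing, */
		wait |= lock_owner && !lock_owner->on_cpu;
		/* or the owner does not outrank us (your intent?). */
		wait |= lock_owner && !(lock_owner->prio < self->prio);
	}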


> +		}
> +
>  		raw_spin_unlock(&lock->wait_lock);
>  
>  		debug_rt_mutex_print_deadlock(&waiter);
>  
> -		if (top_waiter != &waiter || adaptive_wait(lock, lock_owner))
> -			schedule_rt_mutex(lock);
> +		if (wait || adaptive_wait(lock, lock_owner, rt_spin))
> +			schedule_rt_spinlock(lock, rt_spin);

I'm also confused: if rt_spin is set, we have wait = 1. wait can only be
cleared if other_spin is set, but other_spin can't be set if rt_spin is
set.

But even if I missed something, there's no reason to pass rt_spin to
adaptive_wait(), as adaptive_wait() returns zero whenever it's set. You
can just do:

	if (wait || (adaptive_wait(lock, lock_owner) && !rt_spin))

But I'm not sure I ever see adaptive_wait() get called with rt_spin set
at all.
>  
>  		raw_spin_lock(&lock->wait_lock);
>  
> @@ -826,6 +850,16 @@ static void  noinline __sched rt_spin_lo
>  		return;
>  	}
>  
> +	if (rt_spin_yield_enabled() && rt_task(current)) {
> +		struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
> +		struct task_struct *next = top_waiter->task;
> +
> +		/* Move next in line to head of its queue */
> +		pi_lock(&next->pi_lock);
> +		task_setprio(next, next->prio, 1);
> +		pi_unlock(&next->pi_lock);

You need to explain why this is done.

> +	}
> +
>  	wakeup_next_waiter(lock);
>  
>  	raw_spin_unlock(&lock->wait_lock);
> --- a/kernel/rtmutex_common.h
> +++ b/kernel/rtmutex_common.h
> @@ -32,9 +32,45 @@ extern void schedule_rt_mutex_test(struc
>  		schedule_rt_mutex_test(_lock);			\
>    } while (0)
>  
> -#else
> -# define schedule_rt_mutex(_lock)			schedule()
> -#endif
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +#define rt_spin_yield_enabled()					\
> +	(sysctl_sched_rt_spin_yield && system_state == SYSTEM_RUNNING)
> +
> +#define schedule_rt_spinlock(_lock, _spin)			\
> +  do {								\
> +	if (!(current->flags & PF_MUTEX_TESTER)) {		\
> +		if (!_spin)					\
> +			schedule();				\
> +		else						\
> +			yield();				\
> +	} else							\
> +		schedule_rt_mutex_test(_lock);			\

I'm wondering if this is the only thing you needed to get your
performance boost.

> +  } while (0)
> +#else /* !CONFIG_PREEMPT_RT_FULL */
> +#define rt_spin_yield_enabled()	(0)
> +#define schedule_rt_spinlock(_lock, _spin) schedule_rt_mutex(_lock)
> +#endif /* CONFIG_PREEMPT_RT_FULL */
> +
> +#else /* !CONFIG_RT_MUTEX_TESTER */
> +
> +#define schedule_rt_mutex(_lock)			schedule()
> +
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +#define rt_spin_yield_enabled()					\
> +	(sysctl_sched_rt_spin_yield && system_state == SYSTEM_RUNNING)
> +
> +#define schedule_rt_spinlock(_lock, _spin)			\
> +  do {								\
> +	if (!_spin)						\
> +		schedule();					\
> +	else							\
> +		yield();					\
> +  } while (0)
> +#else /* !CONFIG_PREEMPT_RT_FULL */
> +#define rt_spin_yield_enabled()	(0)
> +#define schedule_rt_spinlock(_lock, _spin) schedule_rt_mutex(_lock)
> +#endif /* CONFIG_PREEMPT_RT_FULL */
> +#endif /* CONFIG_RT_MUTEX_TESTER */
>  
>  /*
>   * This is the control structure for tasks blocked on a rt_mutex,
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -943,6 +943,11 @@ late_initcall(sched_init_debug);
>  const_debug unsigned int sysctl_sched_nr_migrate = 32;
>  #else
>  const_debug unsigned int sysctl_sched_nr_migrate = 8;
> +
> +/*
> + * rt spinlock waiters spin and yield() if necessary vs blocking
> + */
> +unsigned int sysctl_sched_rt_spin_yield __read_mostly;
>  #endif
>  
>  /*
> @@ -5292,6 +5297,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
>   * task_setprio - set the current priority of a task
>   * @p: task
>   * @prio: prio value (kernel-internal form)
> + * @requeue: requeue an rt_spin_lock_slowlock() top waiter and preempt
>   *
>   * This function changes the 'effective' priority of a task. It does
>   * not touch ->normal_prio like __setscheduler().
> @@ -5299,7 +5305,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
>   * Used by the rt_mutex code to implement priority inheritance
>   * logic. Call site only calls if the priority of the task changed.
>   */
> -void task_setprio(struct task_struct *p, int prio)
> +void task_setprio(struct task_struct *p, int prio, int requeue)
>  {
>  	int oldprio, on_rq, running;
>  	struct rq *rq;
> @@ -5332,6 +5338,8 @@ void task_setprio(struct task_struct *p,
>  	prev_class = p->sched_class;
>  	on_rq = p->on_rq;
>  	running = task_current(rq, p);
> +	if (requeue && (running || !on_rq || !rt_prio(oldprio)))
> +		goto out_unlock;
>  	if (on_rq)
>  		dequeue_task(rq, p, 0);
>  	if (running)
> @@ -5346,8 +5354,25 @@ void task_setprio(struct task_struct *p,
>  
>  	if (running)
>  		p->sched_class->set_curr_task(rq);
> -	if (on_rq)
> -		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
> +	if (on_rq) {
> +		if (!sysctl_sched_rt_spin_yield) {
> +			enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
> +		} else {
> +			enqueue_task(rq, p, ENQUEUE_HEAD);
> +
> +			/*
> +			 * If we're requeueing a spinlock waiter, preempt any
> +			 * peer in the way, waiter involuntarily blocked, so
> +			 * has the right to use this CPU before its peers.
> +			 */
> +			requeue &= p->prio <= rq->curr->prio;
> +			requeue &= rq->curr->state == TASK_RUNNING;
> +			requeue &= rq->curr != current;
> +
> +			if (requeue)
> +				resched_task(rq->curr);

This too is interesting.
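
Though as with the chain of &= assignments above, this reads easier as
a single test, something like (untested):

	/*
	 * Only preempt if the requeued waiter is at least as high
	 * priority as the current task, the current task is actually
	 * running, and it isn't us.
	 */
	if (requeue && p->prio <= rq->curr->prio &&
	    rq->curr->state == TASK_RUNNING && rq->curr != current)
		resched_task(rq->curr);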

> +		}
> +	}
>  
>  	check_class_changed(rq, p, prev_class, oldprio);
>  out_unlock:
> --- a/kernel/sched_fair.c
> +++ b/kernel/sched_fair.c
> @@ -2510,6 +2510,11 @@ static void check_preempt_wakeup(struct
>  	if (unlikely(se == pse))
>  		return;
>  
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +	if (sysctl_sched_rt_spin_yield && curr->migrate_disable)
> +		return;
> +#endif
> +
>  	/*
>  	 * This is possible from callers such as pull_task(), in which we
>  	 * unconditionally check_prempt_curr() after an enqueue (which may have
> --- a/kernel/sched_rt.c
> +++ b/kernel/sched_rt.c
> @@ -991,6 +991,9 @@ enqueue_task_rt(struct rq *rq, struct ta
>  	if (flags & ENQUEUE_WAKEUP)
>  		rt_se->timeout = 0;
>  
> +	if (sysctl_sched_rt_spin_yield && (flags & WF_LOCK_SLEEPER))
> +		flags |= ENQUEUE_HEAD;
> +
>  	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
>  
>  	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -368,6 +368,17 @@ static struct ctl_table kern_table[] = {
>  		.mode		= 0644,
>  		.proc_handler	= sched_rt_handler,
>  	},
> +#ifdef CONFIG_PREEMPT_RT_FULL
> +	{
> +		.procname	= "sched_rt_spin_yield",
> +		.data		= &sysctl_sched_rt_spin_yield,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= &zero,
> +		.extra2		= &one,
> +	},
> +#endif
>  	{
>  		.procname	= "sched_compat_yield",
>  		.data		= &sysctl_sched_compat_yield,
> 
> 

I would be interested in seeing what happens if we just made rt tasks
spin instead of sleep (call yield). Actually, I wonder whether making
real-time tasks always spin (never sleep), just yield, would give us
much better performance.
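
Something like this at the rt_spin_lock_slowlock() call site is what I
have in mind (a completely untested sketch, dropping the sysctl and the
requeue games and keeping only the yield):

	if (top_waiter != &waiter || adaptive_wait(lock, lock_owner)) {
		/*
		 * rt tasks never sleep on a contended spinlock, they
		 * just yield the CPU; everything else blocks as before.
		 */
		if (rt_task(self) && !self->saved_state)
			yield();
		else
			schedule_rt_mutex(lock);
	}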

-- Steve



