Re: rt: rtmutex experiment doubled tbench throughput

Patch below, with some old refuse hauled away, if anybody wants to play with it.

I can't find anything that it breaks, FWIW.
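
For anyone poking at it: the knob defaults to off, and given the kern_table
entry it should show up as /proc/sys/kernel/sched_rt_spin_yield on a
PREEMPT_RT_FULL kernel. Roughly (untested sketch, not part of the patch):

  # turn the spin/yield experiment on
  echo 1 > /proc/sys/kernel/sched_rt_spin_yield
  # run tbench or whatever, then switch back to stock behaviour
  echo 0 > /proc/sys/kernel/sched_rt_spin_yield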

---
 include/linux/sched.h   |   27 +++++++++++++++++++++++++--
 kernel/rtmutex.c        |   44 +++++++++++++++++++++++++++++++++++++++-----
 kernel/rtmutex_common.h |   34 +++++++++++++++++++++++++++++++---
 kernel/sched.c          |   32 +++++++++++++++++++++++++++++---
 kernel/sched_fair.c     |    4 ++++
 kernel/sched_rt.c       |    3 +++
 kernel/sys.c            |    1 +
 kernel/sysctl.c         |   11 +++++++++++
 8 files changed, 143 insertions(+), 13 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2146,12 +2146,35 @@ extern unsigned int sysctl_sched_cfs_ban
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
-extern void task_setprio(struct task_struct *p, int prio);
+#ifdef CONFIG_PREEMPT_RT_FULL
+extern unsigned int sysctl_sched_rt_spin_yield;
+
+static inline bool rt_spin_yield_enabled(void)
+{
+	return sysctl_sched_rt_spin_yield;
+}
+
+static inline void rt_spin_yield_disable(void)
+{
+	sysctl_sched_rt_spin_yield = 0;
+}
+#else
+static inline bool rt_spin_yield_enabled(void)
+{
+	return 0;
+}
+static inline void rt_spin_yield_disable(void) { }
+#endif
+extern void task_setprio(struct task_struct *p, int prio, int requeue);
 extern int rt_mutex_getprio(struct task_struct *p);
 extern int rt_mutex_check_prio(struct task_struct *task, int newprio);
 static inline void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	task_setprio(p, prio);
+	task_setprio(p, prio, 0);
+}
+static inline void rt_mutex_requeue(struct task_struct *p, int prio)
+{
+	task_setprio(p, prio, 1);
 }
 extern void rt_mutex_adjust_pi(struct task_struct *p);
 static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -702,8 +702,8 @@ static int adaptive_wait(struct rt_mutex
 		 * checking the above to be valid.
 		 */
 		barrier();
-		if (!owner->on_cpu) {
-			res = 1;
+		if (!owner || !owner->on_cpu) {
+			res = owner && !owner->on_cpu;
 			break;
 		}
 		cpu_relax();
@@ -733,7 +733,7 @@ static void  noinline __sched rt_spin_lo
 {
 	struct task_struct *lock_owner, *self = current;
 	struct rt_mutex_waiter waiter, *top_waiter;
-	int ret;
+	int ret, wait, rt_spin = 0, other_spin = 0, cpu;
 
 	rt_mutex_init_waiter(&waiter, true);
 
@@ -761,6 +761,17 @@ static void  noinline __sched rt_spin_lo
 	ret = task_blocks_on_rt_mutex(lock, &waiter, self, 0);
 	BUG_ON(ret);
 
+	/* basic spin/yield sanity checks */
+	if (rt_spin_yield_enabled()) {
+		rt_spin = !self->saved_state;
+		/* Here there be dragons */
+		rt_spin &= !(self->flags & PF_EXITING);
+		other_spin = rt_spin;
+		rt_spin &= rt_task(self);
+		other_spin &= !rt_spin;
+	}
+	cpu = raw_smp_processor_id();
+
 	for (;;) {
 		/* Try to acquire the lock again. */
 		if (__try_to_take_rt_mutex(lock, self, &waiter, STEAL_LATERAL))
@@ -769,12 +780,25 @@ static void  noinline __sched rt_spin_lo
 		top_waiter = rt_mutex_top_waiter(lock);
 		lock_owner = rt_mutex_owner(lock);
 
+		if (rt_spin)
+			wait = 1;
+		else
+			wait = top_waiter != &waiter;
+
+		/* SCHED_OTHER can laterally steal, let them try */
+		if (other_spin) {
+			wait &= task_cpu(top_waiter->task) == cpu;
+			wait |= top_waiter->task->prio < self->prio;
+			wait |= lock_owner && !lock_owner->on_cpu;
+			wait |= lock_owner && !(lock_owner->prio < self->prio);
+		}
+
 		raw_spin_unlock(&lock->wait_lock);
 
 		debug_rt_mutex_print_deadlock(&waiter);
 
-		if (top_waiter != &waiter || adaptive_wait(lock, lock_owner))
-			schedule_rt_mutex(lock);
+		if (wait || adaptive_wait(lock, lock_owner))
+			schedule_rt_spinlock(lock, rt_spin);
 
 		raw_spin_lock(&lock->wait_lock);
 
@@ -826,6 +850,16 @@ static void  noinline __sched rt_spin_lo
 		return;
 	}
 
+	if (rt_spin_yield_enabled() && rt_task(current)) {
+		struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock);
+		struct task_struct *next = top_waiter->task;
+
+		/* Move next in line to head of its queue */
+		pi_lock(&next->pi_lock);
+		rt_mutex_requeue(next, next->prio);
+		pi_unlock(&next->pi_lock);
+	}
+
 	wakeup_next_waiter(lock);
 
 	raw_spin_unlock(&lock->wait_lock);
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -32,9 +32,37 @@ extern void schedule_rt_mutex_test(struc
 		schedule_rt_mutex_test(_lock);			\
   } while (0)
 
-#else
-# define schedule_rt_mutex(_lock)			schedule()
-#endif
+#ifdef CONFIG_PREEMPT_RT_FULL
+#define schedule_rt_spinlock(_lock, _spin)			\
+  do {								\
+	if (!(current->flags & PF_MUTEX_TESTER)) {		\
+		if (!_spin)					\
+			schedule();				\
+		else						\
+			yield();				\
+	} else							\
+		schedule_rt_mutex_test(_lock);			\
+  } while (0)
+#else /* !CONFIG_PREEMPT_RT_FULL */
+#define schedule_rt_spinlock(_lock, _spin) schedule_rt_mutex(_lock)
+#endif /* CONFIG_PREEMPT_RT_FULL */
+
+#else /* !CONFIG_RT_MUTEX_TESTER */
+
+#define schedule_rt_mutex(_lock)			schedule()
+
+#ifdef CONFIG_PREEMPT_RT_FULL
+#define schedule_rt_spinlock(_lock, _spin)			\
+  do {								\
+	if (!_spin)						\
+		schedule();					\
+	else							\
+		yield();					\
+  } while (0)
+#else /* !CONFIG_PREEMPT_RT_FULL */
+#define schedule_rt_spinlock(_lock, _spin) schedule_rt_mutex(_lock)
+#endif /* CONFIG_PREEMPT_RT_FULL */
+#endif /* CONFIG_RT_MUTEX_TESTER */
 
 /*
  * This is the control structure for tasks blocked on a rt_mutex,
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -943,6 +943,12 @@ late_initcall(sched_init_debug);
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 #else
 const_debug unsigned int sysctl_sched_nr_migrate = 8;
+
+/*
+ * RT spinlock waiters yield() when necessary instead of blocking.
+ * SCHED_OTHER waiters must block, but may spin when they can.
+ */
+unsigned int sysctl_sched_rt_spin_yield __read_mostly;
 #endif
 
 /*
@@ -5292,6 +5298,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
  * task_setprio - set the current priority of a task
  * @p: task
  * @prio: prio value (kernel-internal form)
+ * @requeue: requeue an rt_spin_lock_slowlock() top waiter and preempt peers
  *
  * This function changes the 'effective' priority of a task. It does
  * not touch ->normal_prio like __setscheduler().
@@ -5299,7 +5306,7 @@ EXPORT_SYMBOL(sleep_on_timeout);
  * Used by the rt_mutex code to implement priority inheritance
  * logic. Call site only calls if the priority of the task changed.
  */
-void task_setprio(struct task_struct *p, int prio)
+void task_setprio(struct task_struct *p, int prio, int requeue)
 {
 	int oldprio, on_rq, running;
 	struct rq *rq;
@@ -5332,6 +5339,8 @@ void task_setprio(struct task_struct *p,
 	prev_class = p->sched_class;
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
+	if (requeue && (running || !on_rq || !rt_prio(oldprio)))
+		goto out_unlock;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
 	if (running)
@@ -5346,8 +5355,25 @@ void task_setprio(struct task_struct *p,
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
-	if (on_rq)
-		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+	if (on_rq) {
+		if (!sysctl_sched_rt_spin_yield) {
+			enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+		} else {
+			enqueue_task(rq, p, ENQUEUE_HEAD);
+
+			/*
+			 * If we're requeueing a spinlock waiter, preempt any
+			 * peer in the way; the waiter blocked involuntarily,
+			 * so it has the right to use this CPU before its peers.
+			 */
+			requeue &= p->prio <= rq->curr->prio;
+			requeue &= rq->curr->state == TASK_RUNNING;
+			requeue &= rq->curr != current;
+
+			if (requeue)
+				resched_task(rq->curr);
+		}
+	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2510,6 +2510,10 @@ static void check_preempt_wakeup(struct
 	if (unlikely(se == pse))
 		return;
 
+	/* FIXME: might be spinning on a lock, good enough to play with */
+	if (rt_spin_yield_enabled() && curr->migrate_disable)
+		return;
+
 	/*
 	 * This is possible from callers such as pull_task(), in which we
 	 * unconditionally check_prempt_curr() after an enqueue (which may have
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -991,6 +991,9 @@ enqueue_task_rt(struct rq *rq, struct ta
 	if (flags & ENQUEUE_WAKEUP)
 		rt_se->timeout = 0;
 
+	if (rt_spin_yield_enabled() && (flags & WF_LOCK_SLEEPER))
+		flags |= ENQUEUE_HEAD;
+
 	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
 
 	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -316,6 +316,7 @@ void kernel_restart_prepare(char *cmd)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
+	rt_spin_yield_disable();
 	usermodehelper_disable();
 	device_shutdown();
 	syscore_shutdown();
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -368,6 +368,17 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= sched_rt_handler,
 	},
+#ifdef CONFIG_PREEMPT_RT_FULL
+	{
+		.procname	= "sched_rt_spin_yield",
+		.data		= &sysctl_sched_rt_spin_yield,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{
 		.procname	= "sched_compat_yield",
 		.data		= &sysctl_sched_compat_yield,

