[ANNOUNCE] v6.5-rc7-rt4

Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx> · Fri, 25 Aug 2023 19:01:24 +0200

Dear RT folks!

I'm pleased to announce the v6.5-rc7-rt4 patch set. 

Changes since v6.5-rc7-rt3:

  - The locking patches for flushed queued I/O on lock contention have
    been updated. The last version from the upstream discussion has been
    picked up and updated based on ongoing discussion.

Known issues
     None

The delta patch against v6.5-rc7-rt3 is appended below and can be found here:
 
     https://cdn.kernel.org/pub/linux/kernel/projects/rt/6.5/incr/patch-6.5-rc7-rt3-rt4.patch.xz

You can get this release via the git tree at:

    https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v6.5-rc7-rt4

The RT patch against v6.5-rc7 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/6.5/older/patch-6.5-rc7-rt4.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/6.5/older/patches-6.5-rc7-rt4.tar.xz

Sebastian

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7b235881a0ad0..b4a34e80ffea6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -304,11 +304,6 @@ extern long schedule_timeout_idle(long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
 asmlinkage void preempt_schedule_irq(void);
-
-extern void sched_submit_work(void);
-extern void sched_resume_work(void);
-extern void schedule_rtmutex(void);
-
 #ifdef CONFIG_PREEMPT_RT
  extern void schedule_rtlock(void);
 #endif
@@ -911,6 +906,9 @@ struct task_struct {
 	 * ->sched_remote_wakeup gets used, so it can be in this word.
 	 */
 	unsigned			sched_remote_wakeup:1;
+#ifdef CONFIG_RT_MUTEXES
+	unsigned			sched_rt_mutex:1;
+#endif
 
 	/* Bit to tell LSMs we're in execve(): */
 	unsigned			in_execve:1;
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 994c25640e156..b2b9e6eb96830 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_RT_MUTEXES
+extern void rt_mutex_pre_schedule(void);
+extern void rt_mutex_schedule(void);
+extern void rt_mutex_post_schedule(void);
+
 /*
  * Must hold either p->pi_lock or task_rq(p)->lock.
  */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index ce2889f123755..f8e65b27d9d6b 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <linux/slab.h>
+#include <linux/sched/rt.h>
 #include <linux/sched/task.h>
 
 #include "futex.h"
@@ -1002,6 +1003,12 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 		goto no_block;
 	}
 
+	/*
+	 * Must be done before we enqueue the waiter, here is unfortunately
+	 * under the hb lock, but that *should* work because it does nothing.
+	 */
+	rt_mutex_pre_schedule();
+
 	rt_mutex_init_waiter(&rt_waiter);
 
 	/*
@@ -1052,6 +1059,10 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
 	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
 		ret = 0;
 
+	/*
+	 * Waiter is unqueued.
+	 */
+	rt_mutex_post_schedule();
 no_block:
 	/*
 	 * Fixup the pi_state owner and possibly acquire the lock if we
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 08a4555076f73..4a10e8c16fd2b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -307,15 +307,11 @@ static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
 static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
 {
 	/*
-	 * With debug enabled rt_mutex_cmpxchg trylock() will always fail,
-	 * which will unconditionally invoke sched_submit/resume_work() in
-	 * the slow path of __rt_mutex_lock() and __ww_rt_mutex_lock() even
-	 * in the non-contended case.
+	 * With debug enabled rt_mutex_cmpxchg trylock() will always fail.
 	 *
-	 * Avoid that by using rt_mutex_slow_trylock() which is covered by
-	 * the debug code and can acquire a non-contended rtmutex. On
-	 * success the callsite avoids the sched_submit/resume_work()
-	 * dance.
+	 * Avoid unconditionally taking the slow path by using
+	 * rt_mutex_slow_trylock() which is covered by the debug code and can
+	 * acquire a non-contended rtmutex.
 	 */
 	return rt_mutex_slowtrylock(lock);
 }
@@ -1636,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
 		raw_spin_unlock_irq(&lock->wait_lock);
 
 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-			schedule_rtmutex();
+			rt_mutex_schedule();
 
 		raw_spin_lock_irq(&lock->wait_lock);
 		set_current_state(state);
@@ -1665,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
 	WARN(1, "rtmutex deadlock detected\n");
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_rtmutex();
+		rt_mutex_schedule();
 	}
 }
 
@@ -1761,10 +1757,13 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	int ret;
 
 	/*
-	 * The task is about to sleep. Invoke sched_submit_work() before
-	 * blocking as that might take locks and corrupt tsk::pi_blocked_on.
+	 * Do all pre-schedule work here, before we queue a waiter and invoke
+	 * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+	 * otherwise recurse back into task_blocks_on_rt_mutex() through
+	 * rtlock_slowlock() and will then enqueue a second waiter for this
+	 * same task and things get really confusing real fast.
 	 */
-	sched_submit_work();
+	rt_mutex_pre_schedule();
 
 	/*
 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
@@ -1777,8 +1776,8 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+	rt_mutex_post_schedule();
 
-	sched_resume_work();
 	return ret;
 }
 
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 5be92ca5afabc..b5e881250fec5 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 	int ret;
 
+	rwbase_pre_schedule();
 	raw_spin_lock_irq(&rtm->wait_lock);
 
 	/*
@@ -125,29 +126,19 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
 		rwbase_rtmutex_unlock(rtm);
 
 	trace_contention_end(rwb, ret);
+	rwbase_post_schedule();
 	return ret;
 }
 
 static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
 					    unsigned int state)
 {
-	int ret;
-
 	lockdep_assert(!current->pi_blocked_on);
 
 	if (rwbase_read_trylock(rwb))
 		return 0;
 
-	/*
-	 * The task is about to sleep. For rwsems this submits work as that
-	 * might take locks and corrupt tsk::pi_blocked_on. Must be
-	 * explicit here because __rwbase_read_lock() cannot invoke
-	 * rt_mutex_slowlock(). NOP for rwlocks.
-	 */
-	rwbase_sched_submit_work();
-	ret = __rwbase_read_lock(rwb, state);
-	rwbase_sched_resume_work();
-	return ret;
+	return __rwbase_read_lock(rwb, state);
 }
 
 static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
@@ -243,16 +234,15 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 	unsigned long flags;
 
-	/*
-	 * Take the rtmutex as a first step. For rwsem this will also
-	 * invoke sched_submit_work() to flush IO and workers.
-	 */
+	/* Take the rtmutex as a first step */
 	if (rwbase_rtmutex_lock_state(rtm, state))
 		return -EINTR;
 
 	/* Force readers into slow path */
 	atomic_sub(READER_BIAS, &rwb->readers);
 
+	rt_mutex_pre_schedule();
+
 	raw_spin_lock_irqsave(&rtm->wait_lock, flags);
 	if (__rwbase_write_trylock(rwb))
 		goto out_unlock;
@@ -264,6 +254,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 		if (rwbase_signal_pending_state(state, current)) {
 			rwbase_restore_current_state();
 			__rwbase_write_unlock(rwb, 0, flags);
+			rt_mutex_post_schedule();
 			trace_contention_end(rwb, -EINTR);
 			return -EINTR;
 		}
@@ -282,6 +273,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 
 out_unlock:
 	raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+	rt_mutex_post_schedule();
 	return 0;
 }
 
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e304db9ebfd95..2340b6d90ec6f 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1415,12 +1415,6 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_rtmutex_lock_state(rtm, state)		\
 	__rt_mutex_lock(rtm, state)
 
-#define rwbase_sched_submit_work()			\
-	sched_submit_work()
-
-#define rwbase_sched_resume_work()			\
-	sched_resume_work()
-
 #define rwbase_rtmutex_slowlock_locked(rtm, state)	\
 	__rt_mutex_slowlock_locked(rtm, NULL, state)
 
@@ -1433,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_signal_pending_state(state, current)	\
 	signal_pending_state(state, current)
 
+#define rwbase_pre_schedule()				\
+	rt_mutex_pre_schedule()
+
 #define rwbase_schedule()				\
-	schedule()
+	rt_mutex_schedule()
+
+#define rwbase_post_schedule()				\
+	rt_mutex_post_schedule()
 
 #include "rwbase_rt.c"
 
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 9fe282cd145d9..38e292454fccb 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -161,9 +161,6 @@ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
 	return 0;
 }
 
-static __always_inline void rwbase_sched_submit_work(void) { }
-static __always_inline void rwbase_sched_resume_work(void) { }
-
 static __always_inline int
 rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
 {
@@ -189,9 +186,13 @@ static __always_inline int  rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
 
 #define rwbase_signal_pending_state(state, current)	(0)
 
+#define rwbase_pre_schedule()
+
 #define rwbase_schedule()				\
 	schedule_rtlock()
 
+#define rwbase_post_schedule()
+
 #include "rwbase_rt.c"
 /*
  * The common functions which get wrapped into the rwlock API.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2276b5d882380..fe944fa1efc4d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6779,11 +6779,18 @@ void __noreturn do_task_dead(void)
 		cpu_relax();
 }
 
-void sched_submit_work(void)
+static inline void sched_submit_work(struct task_struct *tsk)
 {
-	struct task_struct *tsk = current;
-	unsigned int task_flags = tsk->flags;
+	static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
+	unsigned int task_flags;
 
+	/*
+	 * Establish LD_WAIT_CONFIG context to ensure none of the code called
+	 * will use a blocking primitive -- which would lead to recursion.
+	 */
+	lock_map_acquire_try(&sched_map);
+
+	task_flags = tsk->flags;
 	/*
 	 * If a worker goes to sleep, notify and ask workqueue whether it
 	 * wants to wake up a task to maintain concurrency.
@@ -6807,12 +6814,12 @@ void sched_submit_work(void)
 	 * make sure to submit it to avoid deadlocks.
 	 */
 	blk_flush_plug(tsk->plug, true);
+
+	lock_map_release(&sched_map);
 }
 
-void sched_resume_work(void)
+static void sched_update_worker(struct task_struct *tsk)
 {
-	struct task_struct *tsk = current;
-
 	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
 		if (tsk->flags & PF_WQ_WORKER)
 			wq_worker_running(tsk);
@@ -6821,7 +6828,7 @@ void sched_resume_work(void)
 	}
 }
 
-static void schedule_loop(unsigned int sched_mode)
+static __always_inline void __schedule_loop(unsigned int sched_mode)
 {
 	do {
 		preempt_disable();
@@ -6832,18 +6839,19 @@ static void schedule_loop(unsigned int sched_mode)
 
 asmlinkage __visible void __sched schedule(void)
 {
-	if (!task_is_running(current))
-		sched_submit_work();
-	schedule_loop(SM_NONE);
-	sched_resume_work();
+	struct task_struct *tsk = current;
+
+#ifdef CONFIG_RT_MUTEXES
+	lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+	if (!task_is_running(tsk))
+		sched_submit_work(tsk);
+	__schedule_loop(SM_NONE);
+	sched_update_worker(tsk);
 }
 EXPORT_SYMBOL(schedule);
 
-void schedule_rtmutex(void)
-{
-	schedule_loop(SM_NONE);
-}
-
 /*
  * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
  * state (have scheduled out non-voluntarily) by making sure that all
@@ -6903,7 +6911,7 @@ void __sched schedule_preempt_disabled(void)
 #ifdef CONFIG_PREEMPT_RT
 void __sched notrace schedule_rtlock(void)
 {
-	schedule_loop(SM_RTLOCK_WAIT);
+	__schedule_loop(SM_RTLOCK_WAIT);
 }
 NOKPROBE_SYMBOL(schedule_rtlock);
 #endif
@@ -7128,6 +7136,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio)
 
 #ifdef CONFIG_RT_MUTEXES
 
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+void rt_mutex_pre_schedule(void)
+{
+	lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+	sched_submit_work(current);
+}
+
+void rt_mutex_schedule(void)
+{
+	lockdep_assert(current->sched_rt_mutex);
+	__schedule_loop(SM_NONE);
+}
+
+void rt_mutex_post_schedule(void)
+{
+	sched_update_worker(current);
+	lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
 {
 	if (pi_task)
diff --git a/localversion-rt b/localversion-rt
index 1445cd65885cd..ad3da1bcab7e8 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt3
+-rt4