Dear RT folks!

I'm pleased to announce the v6.5-rc7-rt4 patch set.

Changes since v6.5-rc7-rt3:

  - The locking patches for flushing queued I/O on lock contention have
    been updated. The latest version from the upstream discussion has
    been picked up and updated based on the ongoing discussion.

Known issues
     None

The delta patch against v6.5-rc7-rt3 is appended below and can be found here:

     https://cdn.kernel.org/pub/linux/kernel/projects/rt/6.5/incr/patch-6.5-rc7-rt3-rt4.patch.xz

You can get this release via the git tree at:

    https://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v6.5-rc7-rt4

The RT patch against v6.5-rc7 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/6.5/older/patch-6.5-rc7-rt4.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/6.5/older/patches-6.5-rc7-rt4.tar.xz

Sebastian

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7b235881a0ad0..b4a34e80ffea6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -304,11 +304,6 @@ extern long schedule_timeout_idle(long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
 asmlinkage void preempt_schedule_irq(void);
-
-extern void sched_submit_work(void);
-extern void sched_resume_work(void);
-extern void schedule_rtmutex(void);
-
 #ifdef CONFIG_PREEMPT_RT
 extern void schedule_rtlock(void);
 #endif
@@ -911,6 +906,9 @@ struct task_struct {
	 * ->sched_remote_wakeup gets used, so it can be in this word.
	 */
	unsigned			sched_remote_wakeup:1;
+#ifdef CONFIG_RT_MUTEXES
+	unsigned			sched_rt_mutex:1;
+#endif
 
	/* Bit to tell LSMs we're in execve(): */
	unsigned			in_execve:1;
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 994c25640e156..b2b9e6eb96830 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -30,6 +30,10 @@ static inline bool task_is_realtime(struct task_struct *tsk)
 }
 
 #ifdef CONFIG_RT_MUTEXES
+extern void rt_mutex_pre_schedule(void);
+extern void rt_mutex_schedule(void);
+extern void rt_mutex_post_schedule(void);
+
 /*
  * Must hold either p->pi_lock or task_rq(p)->lock.
  */
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index ce2889f123755..f8e65b27d9d6b 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <linux/slab.h>
+#include <linux/sched/rt.h>
 #include <linux/sched/task.h>
 
 #include "futex.h"
@@ -1002,6 +1003,12 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
		goto no_block;
	}
 
+	/*
+	 * Must be done before we enqueue the waiter, here is unfortunately
+	 * under the hb lock, but that *should* work because it does nothing.
+	 */
+	rt_mutex_pre_schedule();
+
	rt_mutex_init_waiter(&rt_waiter);
 
	/*
@@ -1052,6 +1059,10 @@ int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int tryl
	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
		ret = 0;
 
+	/*
+	 * Waiter is unqueued.
+	 */
+	rt_mutex_post_schedule();
 no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 08a4555076f73..4a10e8c16fd2b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -307,15 +307,11 @@ static int __sched rt_mutex_slowtrylock(struct rt_mutex_base *lock);
 static __always_inline bool rt_mutex_try_acquire(struct rt_mutex_base *lock)
 {
	/*
-	 * With debug enabled rt_mutex_cmpxchg trylock() will always fail,
-	 * which will unconditionally invoke sched_submit/resume_work() in
-	 * the slow path of __rt_mutex_lock() and __ww_rt_mutex_lock() even
-	 * in the non-contended case.
+	 * With debug enabled rt_mutex_cmpxchg trylock() will always fail.
	 *
-	 * Avoid that by using rt_mutex_slow_trylock() which is covered by
-	 * the debug code and can acquire a non-contended rtmutex. On
-	 * success the callsite avoids the sched_submit/resume_work()
-	 * dance.
+	 * Avoid unconditionally taking the slow path by using
+	 * rt_mutex_slow_trylock() which is covered by the debug code and can
+	 * acquire a non-contended rtmutex.
	 */
	return rt_mutex_slowtrylock(lock);
 }
@@ -1636,7 +1632,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
		raw_spin_unlock_irq(&lock->wait_lock);
 
		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-			schedule_rtmutex();
+			rt_mutex_schedule();
 
		raw_spin_lock_irq(&lock->wait_lock);
		set_current_state(state);
@@ -1665,7 +1661,7 @@ static void __sched rt_mutex_handle_deadlock(int res, int detect_deadlock,
	WARN(1, "rtmutex deadlock detected\n");
	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_rtmutex();
+		rt_mutex_schedule();
	}
 }
 
@@ -1761,10 +1757,13 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
	int ret;
 
	/*
-	 * The task is about to sleep. Invoke sched_submit_work() before
-	 * blocking as that might take locks and corrupt tsk::pi_blocked_on.
+	 * Do all pre-schedule work here, before we queue a waiter and invoke
+	 * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+	 * otherwise recurse back into task_blocks_on_rt_mutex() through
+	 * rtlock_slowlock() and will then enqueue a second waiter for this
+	 * same task and things get really confusing real fast.
	 */
-	sched_submit_work();
+	rt_mutex_pre_schedule();
 
	/*
	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
@@ -1777,8 +1776,8 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
	raw_spin_lock_irqsave(&lock->wait_lock, flags);
	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+	rt_mutex_post_schedule();
 
-	sched_resume_work();
	return ret;
 }
 
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 5be92ca5afabc..b5e881250fec5 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -71,6 +71,7 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
	struct rt_mutex_base *rtm = &rwb->rtmutex;
	int ret;
 
+	rwbase_pre_schedule();
	raw_spin_lock_irq(&rtm->wait_lock);
 
	/*
@@ -125,29 +126,19 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
		rwbase_rtmutex_unlock(rtm);
 
	trace_contention_end(rwb, ret);
+	rwbase_post_schedule();
	return ret;
 }
 
 static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
					    unsigned int state)
 {
-	int ret;
-
	lockdep_assert(!current->pi_blocked_on);
 
	if (rwbase_read_trylock(rwb))
		return 0;
 
-	/*
-	 * The task is about to sleep. For rwsems this submits work as that
-	 * might take locks and corrupt tsk::pi_blocked_on. Must be
-	 * explicit here because __rwbase_read_lock() cannot invoke
-	 * rt_mutex_slowlock(). NOP for rwlocks.
-	 */
-	rwbase_sched_submit_work();
-	ret = __rwbase_read_lock(rwb, state);
-	rwbase_sched_resume_work();
-	return ret;
+	return __rwbase_read_lock(rwb, state);
 }
 
 static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
@@ -243,16 +234,15 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
	struct rt_mutex_base *rtm = &rwb->rtmutex;
	unsigned long flags;
 
-	/*
-	 * Take the rtmutex as a first step. For rwsem this will also
-	 * invoke sched_submit_work() to flush IO and workers.
-	 */
+	/* Take the rtmutex as a first step */
	if (rwbase_rtmutex_lock_state(rtm, state))
		return -EINTR;
 
	/* Force readers into slow path */
	atomic_sub(READER_BIAS, &rwb->readers);
 
+	rt_mutex_pre_schedule();
+
	raw_spin_lock_irqsave(&rtm->wait_lock, flags);
	if (__rwbase_write_trylock(rwb))
		goto out_unlock;
@@ -264,6 +254,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
		if (rwbase_signal_pending_state(state, current)) {
			rwbase_restore_current_state();
			__rwbase_write_unlock(rwb, 0, flags);
+			rt_mutex_post_schedule();
			trace_contention_end(rwb, -EINTR);
			return -EINTR;
		}
@@ -282,6 +273,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
 
 out_unlock:
	raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+	rt_mutex_post_schedule();
	return 0;
 }
 
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index e304db9ebfd95..2340b6d90ec6f 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1415,12 +1415,6 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_rtmutex_lock_state(rtm, state)		\
	__rt_mutex_lock(rtm, state)
 
-#define rwbase_sched_submit_work()			\
-	sched_submit_work()
-
-#define rwbase_sched_resume_work()			\
-	sched_resume_work()
-
 #define rwbase_rtmutex_slowlock_locked(rtm, state)	\
	__rt_mutex_slowlock_locked(rtm, NULL, state)
 
@@ -1433,8 +1427,14 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
 #define rwbase_signal_pending_state(state, current)	\
	signal_pending_state(state, current)
 
+#define rwbase_pre_schedule()				\
+	rt_mutex_pre_schedule()
+
 #define rwbase_schedule()				\
-	schedule()
+	rt_mutex_schedule()
+
+#define rwbase_post_schedule()				\
+	rt_mutex_post_schedule()
 
 #include "rwbase_rt.c"
 
diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c
index 9fe282cd145d9..38e292454fccb 100644
--- a/kernel/locking/spinlock_rt.c
+++ b/kernel/locking/spinlock_rt.c
@@ -161,9 +161,6 @@ rwbase_rtmutex_lock_state(struct rt_mutex_base *rtm, unsigned int state)
	return 0;
 }
 
-static __always_inline void rwbase_sched_submit_work(void) { }
-static __always_inline void rwbase_sched_resume_work(void) { }
-
 static __always_inline int
 rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
 {
@@ -189,9 +186,13 @@ static __always_inline int rwbase_rtmutex_trylock(struct rt_mutex_base *rtm)
 
 #define rwbase_signal_pending_state(state, current)	(0)
 
+#define rwbase_pre_schedule()
+
 #define rwbase_schedule()				\
	schedule_rtlock()
 
+#define rwbase_post_schedule()
+
 #include "rwbase_rt.c"
 /*
  * The common functions which get wrapped into the rwlock API.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2276b5d882380..fe944fa1efc4d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6779,11 +6779,18 @@ void __noreturn do_task_dead(void)
		cpu_relax();
 }
 
-void sched_submit_work(void)
+static inline void sched_submit_work(struct task_struct *tsk)
 {
-	struct task_struct *tsk = current;
-	unsigned int task_flags = tsk->flags;
+	static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
+	unsigned int task_flags;
 
+	/*
+	 * Establish LD_WAIT_CONFIG context to ensure none of the code called
+	 * will use a blocking primitive -- which would lead to recursion.
+	 */
+	lock_map_acquire_try(&sched_map);
+
+	task_flags = tsk->flags;
	/*
	 * If a worker goes to sleep, notify and ask workqueue whether it
	 * wants to wake up a task to maintain concurrency.
@@ -6807,12 +6814,12 @@ void sched_submit_work(void)
	 * make sure to submit it to avoid deadlocks.
	 */
	blk_flush_plug(tsk->plug, true);
+
+	lock_map_release(&sched_map);
 }
 
-void sched_resume_work(void)
+static void sched_update_worker(struct task_struct *tsk)
 {
-	struct task_struct *tsk = current;
-
	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_running(tsk);
@@ -6821,7 +6828,7 @@ void sched_resume_work(void)
	}
 }
 
-static void schedule_loop(unsigned int sched_mode)
+static __always_inline void __schedule_loop(unsigned int sched_mode)
 {
	do {
		preempt_disable();
@@ -6832,18 +6839,19 @@ static void schedule_loop(unsigned int sched_mode)
 
 asmlinkage __visible void __sched schedule(void)
 {
-	if (!task_is_running(current))
-		sched_submit_work();
-	schedule_loop(SM_NONE);
-	sched_resume_work();
+	struct task_struct *tsk = current;
+
+#ifdef CONFIG_RT_MUTEXES
+	lockdep_assert(!tsk->sched_rt_mutex);
+#endif
+
+	if (!task_is_running(tsk))
+		sched_submit_work(tsk);
+	__schedule_loop(SM_NONE);
+	sched_update_worker(tsk);
 }
 EXPORT_SYMBOL(schedule);
 
-void schedule_rtmutex(void)
-{
-	schedule_loop(SM_NONE);
-}
-
 /*
  * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
  * state (have scheduled out non-voluntarily) by making sure that all
@@ -6903,7 +6911,7 @@ void __sched schedule_preempt_disabled(void)
 #ifdef CONFIG_PREEMPT_RT
 void __sched notrace schedule_rtlock(void)
 {
-	schedule_loop(SM_RTLOCK_WAIT);
+	__schedule_loop(SM_RTLOCK_WAIT);
 }
 NOKPROBE_SYMBOL(schedule_rtlock);
 #endif
@@ -7128,6 +7136,32 @@ static void __setscheduler_prio(struct task_struct *p, int prio)
 
 #ifdef CONFIG_RT_MUTEXES
 
+/*
+ * Would be more useful with typeof()/auto_type but they don't mix with
+ * bit-fields. Since it's a local thing, use int. Keep the generic sounding
+ * name such that if someone were to implement this function we get to compare
+ * notes.
+ */
+#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })
+
+void rt_mutex_pre_schedule(void)
+{
+	lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
+	sched_submit_work(current);
+}
+
+void rt_mutex_schedule(void)
+{
+	lockdep_assert(current->sched_rt_mutex);
+	__schedule_loop(SM_NONE);
+}
+
+void rt_mutex_post_schedule(void)
+{
+	sched_update_worker(current);
+	lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
+}
+
 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
 {
	if (pi_task)
diff --git a/localversion-rt b/localversion-rt
index 1445cd65885cd..ad3da1bcab7e8 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt3
+-rt4
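
For illustration only, here is a minimal userspace sketch of the
pre/schedule/post bracketing the series above introduces. It is not kernel
code: assert() stands in for lockdep_assert(), a plain struct stands in for
task_struct, and the stubbed helpers stand in for sched_submit_work() and
sched_update_worker(). It needs GCC or Clang (statement expressions) and
must be built without -DNDEBUG so the asserts evaluate.

/*
 * Illustrative sketch only -- not kernel code.
 */
#include <assert.h>
#include <stdio.h>

/* Same trick as the patch: typeof()/__auto_type don't mix with bit-fields. */
#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })

struct task {
	unsigned int sched_rt_mutex:1;	/* models tsk->sched_rt_mutex */
};

static struct task current_task;	/* stands in for "current" */

static void sched_submit_work(void)   { puts("flush plugged I/O, notify workqueue"); }
static void sched_update_worker(void) { puts("tell workqueue the worker runs again"); }
static void schedule_loop(void)       { puts("__schedule()"); }

static void rt_mutex_pre_schedule(void)
{
	/* Opening the bracket twice is a bug. */
	assert(!fetch_and_set(current_task.sched_rt_mutex, 1));
	sched_submit_work();
}

static void rt_mutex_schedule(void)
{
	/* Blocking on an rtmutex is only legal inside the bracket. */
	assert(current_task.sched_rt_mutex);
	schedule_loop();
}

static void rt_mutex_post_schedule(void)
{
	sched_update_worker();
	/* Closing a bracket that was never opened is a bug. */
	assert(fetch_and_set(current_task.sched_rt_mutex, 0));
}

int main(void)
{
	/* The ordering rt_mutex_slowlock() now follows: */
	rt_mutex_pre_schedule();	/* before the waiter is enqueued */
	rt_mutex_schedule();		/* block until the lock can be taken */
	rt_mutex_post_schedule();	/* after the waiter is unqueued */
	return 0;
}

Calling rt_mutex_pre_schedule() twice in a row, or rt_mutex_schedule()
outside the bracket, trips the corresponding assert -- the same class of
misuse the new sched_rt_mutex bit catches under lockdep.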