Dear RT folks! I'm pleased to announce the v5.16-rc3-rt6 patch set. Changes since v5.16-rc3-rt5: - ARM64 implemented HAVE_POSIX_CPU_TIMERS_TASK_WORK and it is now possible to enable KVM and PREEMPT_RT. Patch by Nicolas Saenz Julienne - Backport two patches from upstream to avoid scheduling ksoftirqd from a workqueue while completing mmc requests. - Replace a patch from lockdep series with a suggestion from Peter Zijlstra. - Don't acquire dev_base_lock with disabled bottom halves to avoid lockdep complaint on PREEMPT_RT. Reported by Pei Zhang and Luis Claudio R. Goncalves. - Dress a networking patch regarding inet_listen_hashbucket::lock differently so it looks better. Known issues - netconsole triggers WARN. - The "Memory controller" (CONFIG_MEMCG) has been disabled. - Valentin Schneider reported a few splats on ARM64, see https://lkml.kernel.org/r/20210810134127.1394269-1-valentin.schneider@xxxxxxx The delta patch against v5.16-rc3-rt5 is appended below and can be found here: https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.16/incr/patch-5.16-rc3-rt5-rt6.patch.xz You can get this release via the git tree at: git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v5.16-rc3-rt6 The RT patch against v5.16-rc3 can be found here: https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.16/older/patch-5.16-rc3-rt6.patch.xz The split quilt queue is available at: https://cdn.kernel.org/pub/linux/kernel/projects/rt/5.16/older/patches-5.16-rc3-rt6.tar.xz Sebastian diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 6bd4acc2be02f..260866cf53c95 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -89,7 +89,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_NUMA_BALANCING - select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_SUPPORTS_RT select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT @@ -216,7 +216,6 @@ config ARM64 select PCI_DOMAINS_GENERIC if PCI select PCI_ECAM if (ACPI && PCI) select PCI_SYSCALL if PCI - select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select POWER_RESET select POWER_SUPPLY select SPARSE_IRQ diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 90e1bcd03b46c..52309b84be888 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2051,7 +2051,8 @@ static void mmc_blk_mq_dec_in_flight(struct mmc_queue *mq, struct request *req) mmc_put_card(mq->card, &mq->ctx); } -static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) +static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req, + bool can_sleep) { struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req); struct mmc_request *mrq = &mqrq->brq.mrq; @@ -2063,10 +2064,14 @@ static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req) * Block layer timeouts race with completions which means the normal * completion path cannot be used during recovery. */ - if (mq->in_recovery) + if (mq->in_recovery) { mmc_blk_mq_complete_rq(mq, req); - else if (likely(!blk_should_fake_timeout(req->q))) - blk_mq_complete_request(req); + } else if (likely(!blk_should_fake_timeout(req->q))) { + if (can_sleep) + blk_mq_complete_request_direct(req, mmc_blk_mq_complete); + else + blk_mq_complete_request(req); + } mmc_blk_mq_dec_in_flight(mq, req); } @@ -2087,7 +2092,7 @@ void mmc_blk_mq_recovery(struct mmc_queue *mq) mmc_blk_urgent_bkops(mq, mqrq); - mmc_blk_mq_post_req(mq, req); + mmc_blk_mq_post_req(mq, req, true); } static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, @@ -2106,7 +2111,7 @@ static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq, if (prev_req) *prev_req = mq->complete_req; else - mmc_blk_mq_post_req(mq, mq->complete_req); + mmc_blk_mq_post_req(mq, mq->complete_req, true); mq->complete_req = NULL; @@ -2178,7 +2183,8 @@ static void mmc_blk_mq_req_done(struct mmc_request *mrq) mq->rw_wait = false; wake_up(&mq->wait); - mmc_blk_mq_post_req(mq, req); + /* context unknown */ + mmc_blk_mq_post_req(mq, req, false); } static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err) @@ -2238,7 +2244,7 @@ static int mmc_blk_mq_issue_rw_rq(struct mmc_queue *mq, err = mmc_start_request(host, &mqrq->brq.mrq); if (prev_req) - mmc_blk_mq_post_req(mq, prev_req); + mmc_blk_mq_post_req(mq, prev_req, true); if (err) mq->rw_wait = false; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2949d9ac74849..131b45dfec674 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -752,6 +752,17 @@ static inline void blk_mq_set_request_complete(struct request *rq) WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } +/* + * Complete the request directly instead of deferring it to softirq or + * completing it another CPU. Useful in preemptible instead of an interrupt. + */ +static inline void blk_mq_complete_request_direct(struct request *rq, + void (*complete)(struct request *rq)) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + complete(rq); +} + void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 5e19fe7dda4c5..f89620852774d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1103,27 +1103,12 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, * the other will detect the deadlock and return -EDEADLOCK, * which is wrong, as the other waiter is not in a deadlock * situation. + * + * Except for ww_mutex, in that case the chain walk must already deal + * with spurious cycles, see the comments at [3] and [6]. */ - if (owner == task) { -#if defined(DEBUG_WW_MUTEXES) && defined(CONFIG_DEBUG_LOCKING_API_SELFTESTS) - /* - * The lockdep selftest for ww-mutex assumes in a few cases - * the ww_ctx->contending_lock assignment via - * __ww_mutex_check_kill() which does not happen if the rtmutex - * detects the deadlock early. - */ - if (build_ww_mutex() && ww_ctx) { - struct rt_mutex *rtm; - - /* Check whether the waiter should backout immediately */ - rtm = container_of(lock, struct rt_mutex, rtmutex); - - __ww_mutex_add_waiter(waiter, rtm, ww_ctx); - __ww_mutex_check_kill(rtm, waiter, ww_ctx); - } -#endif + if (owner == task && !(build_ww_mutex() && ww_ctx)) return -EDEADLK; - } raw_spin_lock(&task->pi_lock); waiter->task = task; diff --git a/localversion-rt b/localversion-rt index 0efe7ba1930e1..8fc605d806670 100644 --- a/localversion-rt +++ b/localversion-rt @@ -1 +1 @@ --rt5 +-rt6 diff --git a/net/core/dev.c b/net/core/dev.c index 8b5cf8ad859b5..ad3cccbfa573b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,12 +371,12 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(net); } @@ -389,11 +389,11 @@ static void unlist_netdevice(struct net_device *dev) ASSERT_RTNL(); /* Unlink dev from the device chain */ - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -1272,15 +1272,15 @@ int dev_change_name(struct net_device *dev, const char *newname) netdev_adjacent_rename_links(dev, oldname); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); synchronize_rcu(); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1a455847da54f..9599afd0862da 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -55,7 +55,7 @@ static void rfc2863_policy(struct net_device *dev) if (operstate == dev->operstate) return; - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); switch(dev->link_mode) { case IF_LINK_MODE_TESTING: @@ -74,7 +74,7 @@ static void rfc2863_policy(struct net_device *dev) dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 2af8aeeadadf0..716be2f88cd75 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -842,9 +842,9 @@ static void set_operstate(struct net_device *dev, unsigned char transition) } if (dev->operstate != operstate) { - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); dev->operstate = operstate; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); netdev_state_change(dev); } } @@ -2779,11 +2779,11 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; dev->link_mode = value; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } if (tb[IFLA_VFINFO_LIST]) { diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 737e4f17e1c6d..e57fdad9ef942 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -30,13 +30,13 @@ static bool is_slave_up(struct net_device *dev) static void __hsr_set_operstate(struct net_device *dev, int transition) { - write_lock_bh(&dev_base_lock); + write_lock(&dev_base_lock); if (dev->operstate != transition) { dev->operstate = transition; - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); netdev_state_change(dev); } else { - write_unlock_bh(&dev_base_lock); + write_unlock(&dev_base_lock); } } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e460c84b1f8e4..7bd1e10086f0a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -678,41 +678,47 @@ int inet_hash(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_hash); -void inet_unhash(struct sock *sk) +static void __inet_unhash(struct sock *sk, struct inet_listen_hashbucket *ilb) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - struct inet_listen_hashbucket *ilb = NULL; - spinlock_t *lock; - bool state_listen; - if (sk_unhashed(sk)) return; - if (sk->sk_state == TCP_LISTEN) { - state_listen = true; - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; - spin_lock(&ilb->lock); - } else { - state_listen = false; - lock = inet_ehash_lockp(hashinfo, sk->sk_hash); - spin_lock_bh(lock); - } - if (sk_unhashed(sk)) - goto unlock; - if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); if (ilb) { + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + inet_unhash2(hashinfo, sk); ilb->count--; } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); -unlock: - if (state_listen) +} + +void inet_unhash(struct sock *sk) +{ + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + + if (sk_unhashed(sk)) + return; + + if (sk->sk_state == TCP_LISTEN) { + struct inet_listen_hashbucket *ilb; + + ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + /* Don't disable bottom halves while acquiring the lock to + * avoid circular locking dependency on PREEMPT_RT. + */ + spin_lock(&ilb->lock); + __inet_unhash(sk, ilb); spin_unlock(&ilb->lock); - else + } else { + spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); + + spin_lock_bh(lock); + __inet_unhash(sk, NULL); spin_unlock_bh(lock); + } } EXPORT_SYMBOL_GPL(inet_unhash);