Patch "net/mlx5e: Prevent deadlock while disabling aRFS" has been added to the 6.6-stable tree

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is a note to let you know that I've just added the patch titled

    net/mlx5e: Prevent deadlock while disabling aRFS

to the 6.6-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     net-mlx5e-prevent-deadlock-while-disabling-arfs.patch
and it can be found in the queue-6.6 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit 3cb0b8e3e641e022a220fc9fc68c380060d421bd
Author: Carolina Jubran <cjubran@xxxxxxxxxx>
Date:   Thu Apr 11 14:54:44 2024 +0300

    net/mlx5e: Prevent deadlock while disabling aRFS
    
    [ Upstream commit fef965764cf562f28afb997b626fc7c3cec99693 ]
    
    When disabling aRFS under the `priv->state_lock`, any scheduled
    aRFS works are canceled using the `cancel_work_sync` function,
    which waits for the work to end if it has already started.
    However, while waiting for the work handler, the handler will
    try to acquire the `state_lock` which is already acquired.
    
    The worker acquires the lock to delete the rules if the state
    is down, which is not the worker's responsibility since
    disabling aRFS deletes the rules.
    
    Add an aRFS state variable, which indicates whether the aRFS is
    enabled and prevent adding rules when the aRFS is disabled.
    
    Kernel log:
    
    ======================================================
    WARNING: possible circular locking dependency detected
    6.7.0-rc4_net_next_mlx5_5483eb2 #1 Tainted: G          I
    ------------------------------------------------------
    ethtool/386089 is trying to acquire lock:
    ffff88810f21ce68 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}, at: __flush_work+0x74/0x4e0
    
    but task is already holding lock:
    ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core]
    
    which lock already depends on the new lock.
    
    the existing dependency chain (in reverse order) is:
    
    -> #1 (&priv->state_lock){+.+.}-{3:3}:
           __mutex_lock+0x80/0xc90
           arfs_handle_work+0x4b/0x3b0 [mlx5_core]
           process_one_work+0x1dc/0x4a0
           worker_thread+0x1bf/0x3c0
           kthread+0xd7/0x100
           ret_from_fork+0x2d/0x50
           ret_from_fork_asm+0x11/0x20
    
    -> #0 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}:
           __lock_acquire+0x17b4/0x2c80
           lock_acquire+0xd0/0x2b0
           __flush_work+0x7a/0x4e0
           __cancel_work_timer+0x131/0x1c0
           arfs_del_rules+0x143/0x1e0 [mlx5_core]
           mlx5e_arfs_disable+0x1b/0x30 [mlx5_core]
           mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core]
           ethnl_set_channels+0x28f/0x3b0
           ethnl_default_set_doit+0xec/0x240
           genl_family_rcv_msg_doit+0xd0/0x120
           genl_rcv_msg+0x188/0x2c0
           netlink_rcv_skb+0x54/0x100
           genl_rcv+0x24/0x40
           netlink_unicast+0x1a1/0x270
           netlink_sendmsg+0x214/0x460
           __sock_sendmsg+0x38/0x60
           __sys_sendto+0x113/0x170
           __x64_sys_sendto+0x20/0x30
           do_syscall_64+0x40/0xe0
           entry_SYSCALL_64_after_hwframe+0x46/0x4e
    
    other info that might help us debug this:
    
     Possible unsafe locking scenario:
    
           CPU0                    CPU1
           ----                    ----
      lock(&priv->state_lock);
                                   lock((work_completion)(&rule->arfs_work));
                                   lock(&priv->state_lock);
      lock((work_completion)(&rule->arfs_work));
    
     *** DEADLOCK ***
    
    3 locks held by ethtool/386089:
     #0: ffffffff82ea7210 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40
     #1: ffffffff82e94c88 (rtnl_mutex){+.+.}-{3:3}, at: ethnl_default_set_doit+0xd3/0x240
     #2: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core]
    
    stack backtrace:
    CPU: 15 PID: 386089 Comm: ethtool Tainted: G          I        6.7.0-rc4_net_next_mlx5_5483eb2 #1
    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
    Call Trace:
     <TASK>
     dump_stack_lvl+0x60/0xa0
     check_noncircular+0x144/0x160
     __lock_acquire+0x17b4/0x2c80
     lock_acquire+0xd0/0x2b0
     ? __flush_work+0x74/0x4e0
     ? save_trace+0x3e/0x360
     ? __flush_work+0x74/0x4e0
     __flush_work+0x7a/0x4e0
     ? __flush_work+0x74/0x4e0
     ? __lock_acquire+0xa78/0x2c80
     ? lock_acquire+0xd0/0x2b0
     ? mark_held_locks+0x49/0x70
     __cancel_work_timer+0x131/0x1c0
     ? mark_held_locks+0x49/0x70
     arfs_del_rules+0x143/0x1e0 [mlx5_core]
     mlx5e_arfs_disable+0x1b/0x30 [mlx5_core]
     mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core]
     ethnl_set_channels+0x28f/0x3b0
     ethnl_default_set_doit+0xec/0x240
     genl_family_rcv_msg_doit+0xd0/0x120
     genl_rcv_msg+0x188/0x2c0
     ? ethnl_ops_begin+0xb0/0xb0
     ? genl_family_rcv_msg_dumpit+0xf0/0xf0
     netlink_rcv_skb+0x54/0x100
     genl_rcv+0x24/0x40
     netlink_unicast+0x1a1/0x270
     netlink_sendmsg+0x214/0x460
     __sock_sendmsg+0x38/0x60
     __sys_sendto+0x113/0x170
     ? do_user_addr_fault+0x53f/0x8f0
     __x64_sys_sendto+0x20/0x30
     do_syscall_64+0x40/0xe0
     entry_SYSCALL_64_after_hwframe+0x46/0x4e
     </TASK>
    
    Fixes: 45bf454ae884 ("net/mlx5e: Enabling aRFS mechanism")
    Signed-off-by: Carolina Jubran <cjubran@xxxxxxxxxx>
    Signed-off-by: Tariq Toukan <tariqt@xxxxxxxxxx>
    Link: https://lore.kernel.org/r/20240411115444.374475-7-tariqt@xxxxxxxxxx
    Signed-off-by: Jakub Kicinski <kuba@xxxxxxxxxx>
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
index e66f486faafe1..415fec7763bd2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
@@ -45,6 +45,10 @@ struct arfs_table {
 	struct hlist_head	 rules_hash[ARFS_HASH_SIZE];
 };
 
+enum {
+	MLX5E_ARFS_STATE_ENABLED,
+};
+
 enum arfs_type {
 	ARFS_IPV4_TCP,
 	ARFS_IPV6_TCP,
@@ -59,6 +63,7 @@ struct mlx5e_arfs_tables {
 	spinlock_t                     arfs_lock;
 	int                            last_filter_id;
 	struct workqueue_struct        *wq;
+	unsigned long                  state;
 };
 
 struct arfs_tuple {
@@ -169,6 +174,8 @@ int mlx5e_arfs_enable(struct mlx5e_flow_steering *fs)
 			return err;
 		}
 	}
+	set_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state);
+
 	return 0;
 }
 
@@ -454,6 +461,8 @@ static void arfs_del_rules(struct mlx5e_flow_steering *fs)
 	int i;
 	int j;
 
+	clear_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state);
+
 	spin_lock_bh(&arfs->arfs_lock);
 	mlx5e_for_each_arfs_rule(rule, htmp, arfs->arfs_tables, i, j) {
 		hlist_del_init(&rule->hlist);
@@ -626,17 +635,8 @@ static void arfs_handle_work(struct work_struct *work)
 	struct mlx5_flow_handle *rule;
 
 	arfs = mlx5e_fs_get_arfs(priv->fs);
-	mutex_lock(&priv->state_lock);
-	if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
-		spin_lock_bh(&arfs->arfs_lock);
-		hlist_del(&arfs_rule->hlist);
-		spin_unlock_bh(&arfs->arfs_lock);
-
-		mutex_unlock(&priv->state_lock);
-		kfree(arfs_rule);
-		goto out;
-	}
-	mutex_unlock(&priv->state_lock);
+	if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state))
+		return;
 
 	if (!arfs_rule->rule) {
 		rule = arfs_add_rule(priv, arfs_rule);
@@ -752,6 +752,11 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
 		return -EPROTONOSUPPORT;
 
 	spin_lock_bh(&arfs->arfs_lock);
+	if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) {
+		spin_unlock_bh(&arfs->arfs_lock);
+		return -EPERM;
+	}
+
 	arfs_rule = arfs_find_rule(arfs_t, &fk);
 	if (arfs_rule) {
 		if (arfs_rule->rxq == rxq_index || work_busy(&arfs_rule->arfs_work)) {




[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux