From: Parav Pandit <parav@xxxxxxxxxxxx> When rdmacm module is not loaded, and when netlink message is received to get char device info, it results into a deadlock due to recursive locking rdma_nl_mutex in below call sequence. [..] rdma_nl_rcv() mutex_lock() rdma_nl_rcv_msg() ib_get_client_nl_info() request_module() iw_cm_init() rdma_nl_register() mutex_lock(); <- Deadlock, acquiring mutex again Due to above call sequence, following call trace and deadlock is observed. kernel: __mutex_lock+0x35e/0x860 kernel: ? __mutex_lock+0x129/0x860 kernel: ? rdma_nl_register+0x1a/0x90 [ib_core] kernel: rdma_nl_register+0x1a/0x90 [ib_core] kernel: ? 0xffffffffc029b000 kernel: iw_cm_init+0x34/0x1000 [iw_cm] kernel: do_one_initcall+0x67/0x2d4 kernel: ? kmem_cache_alloc_trace+0x1ec/0x2a0 kernel: do_init_module+0x5a/0x223 kernel: load_module+0x1998/0x1e10 kernel: ? __symbol_put+0x60/0x60 kernel: __do_sys_finit_module+0x94/0xe0 kernel: do_syscall_64+0x5a/0x270 kernel: entry_SYSCALL_64_after_hwframe+0x49/0xbe process stack trace: [<0>] __request_module+0x1c9/0x460 [<0>] ib_get_client_nl_info+0x5e/0xb0 [ib_core] [<0>] nldev_get_chardev+0x1ac/0x320 [ib_core] [<0>] rdma_nl_rcv_msg+0xeb/0x1d0 [ib_core] [<0>] rdma_nl_rcv+0xcd/0x120 [ib_core] [<0>] netlink_unicast+0x179/0x220 [<0>] netlink_sendmsg+0x2f6/0x3f0 [<0>] sock_sendmsg+0x30/0x40 [<0>] ___sys_sendmsg+0x27a/0x290 [<0>] __sys_sendmsg+0x58/0xa0 [<0>] do_syscall_64+0x5a/0x270 [<0>] entry_SYSCALL_64_after_hwframe+0x49/0xbe To overcome this deadlock and to allow multiple netlink messages to progress in parallel, following scheme is implemented. 1. Block netlink table unregistration of a client until all the callers finish executing callback for a given client. 2. Netlink clients shouldn't register table multiple times for a given index. Such basic requirement from two non IB core module eliminates mutex usage for table registratio, Fixes: 0e2d00eb6fd45 ("RDMA: Add NLDEV_GET_CHARDEV to allow char dev discovery and autoload") Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx> Signed-off-by: Leon Romanovsky <leonro@xxxxxxxxxxxx> --- drivers/infiniband/core/core_priv.h | 1 + drivers/infiniband/core/device.c | 2 + drivers/infiniband/core/netlink.c | 93 ++++++++++++++++------------- 3 files changed, 54 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 3a8b0911c3bc..9d07378b5b42 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -199,6 +199,7 @@ void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); +void rdma_nl_init(void); void rdma_nl_exit(void); int ib_nl_handle_resolve_resp(struct sk_buff *skb, diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 2e53aa25f0c7..2f89c4d64b73 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -2716,6 +2716,8 @@ static int __init ib_core_init(void) goto err_comp_unbound; } + rdma_nl_init(); + ret = addr_init(); if (ret) { pr_warn("Could't init IB address resolution\n"); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 81dbd5f41bed..a3507b8be569 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -42,9 +42,12 @@ #include <linux/module.h> #include "core_priv.h" -static DEFINE_MUTEX(rdma_nl_mutex); static struct { - const struct rdma_nl_cbs *cb_table; + const struct rdma_nl_cbs __rcu *cb_table; + /* Synchronizes between ongoing netlink commands and netlink client + * unregistration. + */ + struct srcu_struct unreg_srcu; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; bool rdma_nl_chk_listeners(unsigned int group) @@ -78,8 +81,6 @@ static bool is_nl_msg_valid(unsigned int type, unsigned int op) static bool is_nl_valid(const struct sk_buff *skb, unsigned int type, unsigned int op) { - const struct rdma_nl_cbs *cb_table; - if (!is_nl_msg_valid(type, op)) return false; @@ -90,23 +91,12 @@ is_nl_valid(const struct sk_buff *skb, unsigned int type, unsigned int op) if (sock_net(skb->sk) != &init_net && type != RDMA_NL_NLDEV) return false; - if (!rdma_nl_types[type].cb_table) { - mutex_unlock(&rdma_nl_mutex); - request_module("rdma-netlink-subsys-%d", type); - mutex_lock(&rdma_nl_mutex); - } - - cb_table = rdma_nl_types[type].cb_table; - - if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) - return false; return true; } void rdma_nl_register(unsigned int index, const struct rdma_nl_cbs cb_table[]) { - mutex_lock(&rdma_nl_mutex); if (!is_nl_msg_valid(index, 0)) { /* * All clients are not interesting in success/failure of @@ -114,31 +104,21 @@ void rdma_nl_register(unsigned int index, * continue their initialization. Print warning for them, * because it is programmer's error to be here. */ - mutex_unlock(&rdma_nl_mutex); WARN(true, "The not-valid %u index was supplied to RDMA netlink\n", index); return; } - if (rdma_nl_types[index].cb_table) { - mutex_unlock(&rdma_nl_mutex); - WARN(true, - "The %u index is already registered in RDMA netlink\n", - index); - return; - } - - rdma_nl_types[index].cb_table = cb_table; - mutex_unlock(&rdma_nl_mutex); + /* Publish now that this table entry can be accessed */ + rcu_assign_pointer(rdma_nl_types[index].cb_table, cb_table); } EXPORT_SYMBOL(rdma_nl_register); void rdma_nl_unregister(unsigned int index) { - mutex_lock(&rdma_nl_mutex); - rdma_nl_types[index].cb_table = NULL; - mutex_unlock(&rdma_nl_mutex); + rcu_assign_pointer(rdma_nl_types[index].cb_table, NULL); + synchronize_srcu(&rdma_nl_types[index].unreg_srcu); } EXPORT_SYMBOL(rdma_nl_unregister); @@ -170,15 +150,35 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int index = RDMA_NL_GET_CLIENT(type); unsigned int op = RDMA_NL_GET_OP(type); const struct rdma_nl_cbs *cb_table; + int srcu_idx; + int err = -EINVAL; if (!is_nl_valid(skb, index, op)) return -EINVAL; - cb_table = rdma_nl_types[index].cb_table; + srcu_idx = srcu_read_lock(&rdma_nl_types[index].unreg_srcu); + cb_table = srcu_dereference(rdma_nl_types[index].cb_table, + &rdma_nl_types[index].unreg_srcu); + if (!cb_table) { + /* Didn't get valid reference of the table; + * attempt module load once. + */ + srcu_read_unlock(&rdma_nl_types[index].unreg_srcu, srcu_idx); + + request_module("rdma-netlink-subsys-%d", index); + + srcu_idx = srcu_read_lock(&rdma_nl_types[index].unreg_srcu); + cb_table = srcu_dereference(rdma_nl_types[index].cb_table, + &rdma_nl_types[index].unreg_srcu); + } + if (!cb_table || (!cb_table[op].dump && !cb_table[op].doit)) + goto done; if ((cb_table[op].flags & RDMA_NL_ADMIN_PERM) && - !netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; + !netlink_capable(skb, CAP_NET_ADMIN)) { + err = -EPERM; + goto done; + } /* * LS responses overload the 0x100 (NLM_F_ROOT) flag. Don't @@ -186,8 +186,8 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, */ if (index == RDMA_NL_LS) { if (cb_table[op].doit) - return cb_table[op].doit(skb, nlh, extack); - return -EINVAL; + err = cb_table[op].doit(skb, nlh, extack); + goto done; } /* FIXME: Convert IWCM to properly handle doit callbacks */ if ((nlh->nlmsg_flags & NLM_F_DUMP) || index == RDMA_NL_IWCM) { @@ -195,14 +195,15 @@ static int rdma_nl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, .dump = cb_table[op].dump, }; if (c.dump) - return netlink_dump_start(skb->sk, skb, nlh, &c); - return -EINVAL; + err = netlink_dump_start(skb->sk, skb, nlh, &c); + goto done; } if (cb_table[op].doit) - return cb_table[op].doit(skb, nlh, extack); - - return 0; + err = cb_table[op].doit(skb, nlh, extack); +done: + srcu_read_unlock(&rdma_nl_types[index].unreg_srcu, srcu_idx); + return err; } /* @@ -263,9 +264,7 @@ static int rdma_nl_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, static void rdma_nl_rcv(struct sk_buff *skb) { - mutex_lock(&rdma_nl_mutex); rdma_nl_rcv_skb(skb, &rdma_nl_rcv_msg); - mutex_unlock(&rdma_nl_mutex); } int rdma_nl_unicast(struct net *net, struct sk_buff *skb, u32 pid) @@ -297,14 +296,24 @@ int rdma_nl_multicast(struct net *net, struct sk_buff *skb, } EXPORT_SYMBOL(rdma_nl_multicast); -void rdma_nl_exit(void) +void rdma_nl_init(void) { int idx; for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) + init_srcu_struct(&rdma_nl_types[idx].unreg_srcu); +} + +void rdma_nl_exit(void) +{ + int idx; + + for (idx = 0; idx < RDMA_NL_NUM_CLIENTS; idx++) { + cleanup_srcu_struct(&rdma_nl_types[idx].unreg_srcu); WARN(rdma_nl_types[idx].cb_table, "Netlink client %d wasn't released prior to unloading %s\n", idx, KBUILD_MODNAME); + } } int rdma_nl_net_init(struct rdma_dev_net *rnet) -- 2.20.1