I made a simplified reproducer, using https://syzkaller.appspot.com/text?tag=ReproC&x=1193ae64f00000 as a base with main() rewritten like below. ---------- int main(void) { unshare(CLONE_NEWNET); initialize_netdevices(); const int fd1 = socket(AF_NETLINK, SOCK_RAW, NETLINK_RDMA); struct iovec iov = { .iov_base = "\x38\x00\x00\x00\x03\x14\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x09" "\x00\x02\x00\x73\x79\x6a\x31\x00\x00\x00\x00\x08\x00\x41\x00\x73\x69" "\x77\x00\x14\x00\x33\x00\x76\x6c\x61\x6e\x30", .iov_len = 0x38 }; struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 }; sendmsg(fd1, &msg, 0); if (0) { /* Enable this block if you want to observe hung without exit(0). */ const int fd2 = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); struct /* vlan_ioctl_args */ { int cmd; char device1[24]; union { char device2[24]; int VID; unsigned int skb_priority; unsigned int name_type; unsigned int bind_type; unsigned int flag; } u; short vlan_qos; } args = { .cmd = 1 /* DEL_VLAN_CMD */, .device1 = "vlan0", }; ioctl(fd2, 0x8982 /* SIOCGIFVLAN */, &args); } return 0; } ---------- Using this reproducer, I did several tests. This problem is reproduced with below diff (which pretends as if e.g. cma_add_one(), ib_uverbs_add_one(), srp_add_one() failed with -ENOMEM error because they do GFP_KERNEL allocation). ---------- diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a666847bd714..f8f593bd81e5 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -710,23 +710,23 @@ static int add_client_context(struct ib_device *device, */ if (xa_get_mark(&device->client_data, client->client_id, CLIENT_DATA_REGISTERED)) goto out; ret = xa_err(xa_store(&device->client_data, client->client_id, NULL, GFP_KERNEL)); if (ret) goto out; downgrade_write(&device->client_data_rwsem); if (client->add) { - if (client->add(device)) { + if (1) { /* * If a client fails to add then the error code is * ignored, but we won't call any more ops on this * client. */ xa_erase(&device->client_data, client->client_id); up_read(&device->client_data_rwsem); ib_device_put(device); ib_client_put(client); return 0; } ---------- Since client->add() can fail with -ENOMEM, pretending as if client->add() always fails with -ENOMEM hides this problem. ---------- diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a666847bd714..4c986f62514f 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -695,6 +695,9 @@ static int add_client_context(struct ib_device *device, if (!device->kverbs_provider && !client->no_kverbs_req) return 0; + if (client->add) + return -ENOMEM; + down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device ---------- However, changing to return 0 instead of -ENOMEM again shows this problem. ---------- diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a666847bd714..9dcf712e542d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -695,6 +695,9 @@ static int add_client_context(struct ib_device *device, if (!device->kverbs_provider && !client->no_kverbs_req) return 0; + if (client->add) + return 0; + down_write(&device->client_data_rwsem); /* * So long as the client is registered hold both the client and device ---------- These results suggest that the "If a client fails to add then the error code is ignored" part is wrong. We need to teach the caller about the failure, even if we don't call any more ops on this client. Please re-evaluate commit 11a0ae4c4bff ("RDMA: Allow ib_client's to fail when add() is called"). I don't know why that commit ignores failures. There could be some error code which should be ignored, but I feel that there are error codes which should not be ignored. What do you think? >From 3e1cd08cb206afa827d196e4730ba9c3670a7fe7 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> Date: Sun, 26 Mar 2023 00:34:05 +0900 Subject: [PATCH] RDMA: don't ignore client->add() failures syzbot is reporting refcount leak when client->add() from add_client_context() fails, for add_client_context() is ignoring error from client->add(). We need to return an error in order to indicate that client could not be added, or the caller will get confused. Reported-by: syzbot <syzbot+5e70d01ee8985ae62a3b@xxxxxxxxxxxxxxxxxxxxxxxxx> Link: https://syzkaller.appspot.com/bug?extid=5e70d01ee8985ae62a3b Signed-off-by: Tetsuo Handa <penguin-kernel@xxxxxxxxxxxxxxxxxxx> --- drivers/infiniband/core/device.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a666847bd714..c72f810ceae1 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -718,17 +718,17 @@ static int add_client_context(struct ib_device *device, goto out; downgrade_write(&device->client_data_rwsem); if (client->add) { - if (client->add(device)) { + ret = client->add(device); + if (ret) { /* - * If a client fails to add then the error code is - * ignored, but we won't call any more ops on this - * client. + * If a client fails to add, we won't call any more + * ops on this client. */ xa_erase(&device->client_data, client->client_id); up_read(&device->client_data_rwsem); ib_device_put(device); ib_client_put(client); - return 0; + return ret; } } -- 2.34.1