-----Original Message----- From: Bob Pearson <rpearsonhpe@xxxxxxxxx> Sent: Tuesday, January 19, 2021 3:50 PM To: jgg@xxxxxxxx; zyjzyj2000@xxxxxxxxx; linux-rdma@xxxxxxxxxxxxxx Cc: Pearson, Robert B <robert.pearson2@xxxxxxx>; syzbot+ec2fd72374785d0e558e@xxxxxxxxxxxxxxxxxxxxxxxxx Subject: [PATCH for-next] RDMA/rxe: Fix bug in rxe_alloc A recent patch which added an 'unlocked' version of rxe_alloc introduced a bug causing kzalloc(..., GFP_KERNEL) to be called while holding a spin lock. This patch corrects that error. rxe_alloc_nl() should always be called while holding the pool->pool_lock so the argument to kzalloc should be GFP_ATOMIC. rxe_alloc() prior to the change only locked the code around checking that pool->state is RXE_POOL_STATE_VALID to avoid races between working threads and a thread shutting down the rxe driver. This patch reverts rxe_alloc() to this behavior so the lock is not held when kzalloc() is called. Reported-by: syzbot+ec2fd72374785d0e558e@xxxxxxxxxxxxxxxxxxxxxxxxx Fixes: 3853c35e243d ("RDMA/rxe: Add unlocked versions of pool APIs") Signed-off-by: Bob Pearson <rpearson@xxxxxxx> --- drivers/infiniband/sw/rxe/rxe_pool.c | 46 ++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index d26730eec720..dd02da007ae2 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -84,6 +84,7 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = { .name = "rxe-mc_elem", .size = sizeof(struct rxe_mc_elem), .elem_offset = offsetof(struct rxe_mc_elem, pelem), + /* is created at interrupt level so avoid sleeps */ .flags = RXE_POOL_ATOMIC, }, }; @@ -337,14 +338,13 @@ void __rxe_drop_index(struct rxe_pool_entry *elem) write_unlock_irqrestore(&pool->pool_lock, flags); } +/* only called while holding pool->pool_lock so must use GFP_ATOMIC */ void *rxe_alloc_nl(struct rxe_pool *pool) { struct rxe_type_info *info = &rxe_type_info[pool->type]; struct rxe_pool_entry *elem; u8 *obj; - might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); - if (pool->state != RXE_POOL_STATE_VALID) return NULL; @@ -356,8 +356,7 @@ void *rxe_alloc_nl(struct rxe_pool *pool) if (atomic_inc_return(&pool->num_elem) > pool->max_elem) goto out_cnt; - obj = kzalloc(info->size, (pool->flags & RXE_POOL_ATOMIC) ? - GFP_ATOMIC : GFP_KERNEL); + obj = kzalloc(info->size, GFP_ATOMIC); if (!obj) goto out_cnt; @@ -376,16 +375,51 @@ void *rxe_alloc_nl(struct rxe_pool *pool) return NULL; } +/* objects which can be created at interrupt level should have + * have RXE_POOL_ATOMIC flag set + */ void *rxe_alloc(struct rxe_pool *pool) { - u8 *obj; unsigned long flags; + struct rxe_type_info *info = &rxe_type_info[pool->type]; + struct rxe_pool_entry *elem; + u8 *obj; + + might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); read_lock_irqsave(&pool->pool_lock, flags); - obj = rxe_alloc_nl(pool); + if (pool->state != RXE_POOL_STATE_VALID) { + read_unlock_irqrestore(&pool->pool_lock, flags); + return NULL; + } + + kref_get(&pool->ref_cnt); read_unlock_irqrestore(&pool->pool_lock, flags); + if (!ib_device_try_get(&pool->rxe->ib_dev)) + goto out_put_pool; + + if (atomic_inc_return(&pool->num_elem) > pool->max_elem) + goto out_cnt; + + obj = kzalloc(info->size, (pool->flags & RXE_POOL_ATOMIC) ? + GFP_ATOMIC : GFP_KERNEL); + if (!obj) + goto out_cnt; + + elem = (struct rxe_pool_entry *)(obj + info->elem_offset); + + elem->pool = pool; + kref_init(&elem->ref_cnt); + return obj; + +out_cnt: + atomic_dec(&pool->num_elem); + ib_device_put(&pool->rxe->ib_dev); +out_put_pool: + rxe_pool_put(pool); + return NULL; } int __rxe_add_to_pool(struct rxe_pool *pool, struct rxe_pool_entry *elem) -- 2.27.0 Hillf, Not sure if you saw this. Your note also seems to be a patch to address the syzkaller bug report. It has some problems though. The only reason for adding the _nl versions of the pool APIs was to let the caller hold the pool->pool_lock which won't work since you take that lock in rxe_alloc_nl(). There was a race I saw in testing where two verbs API users were simultaneously trying to join a multicast group and ended up creating two copies of the mcast group object which broke the mcast code. The fix was to lock a sequence of checking to see if the group already exists followed by allocating and filling in a new one if it doesn't. There are some other situations where the same issue will come up. As far as I can tell the RXE_POOL_ATOMIC flag is never needed and can be deleted. New objects are only created from verbs API calls and never from interrupt level. bob