On 20.02.25 18:56, Jacob Moroni wrote:
> In workloads where there are many processes establishing
> connections using RDMA CM in parallel (large scale MPI),
> there can be heavy contention for mad_agent_lock in
> cm_alloc_msg.
>
> This contention can occur while inside of a spin_lock_irq
> region, leading to interrupts being disabled for extended
> durations on many cores. Furthermore, it leads to the
> serialization of rdma_create_ah calls, which has negative
> performance impacts for NICs which are capable of processing
> multiple address handle creations in parallel.
In the link:
https://www.cs.columbia.edu/~jae/4118-LAST/L12-interrupt-spinlock.html
"
...
spin_lock() / spin_unlock()
must not lose CPU while holding a spin lock, other threads will wait for
the lock for a long time
spin_lock() prevents kernel preemption by ++preempt_count in
uniprocessor, that’s all spin_lock() does
must NOT call any function that can potentially sleep
ex) kmalloc, copy_from_user
hardware interrupt is ok unless the interrupt handler may try to lock
this spin lock
spin lock not recursive: same thread locking twice will deadlock
keep the critical section as small as possible
...
"
And from the source code, it seems that spin_lock/spin_unlock by
themselves are not related to interrupts.
I wonder how spin_lock/spin_unlock can lead to "interrupts being
disabled for extended durations on many cores"?
I am not against this commit. I am just curious why
spin_lock/spin_unlock are related to "interrupts being disabled".
Thanks a lot.
Zhu Yanjun
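
A note on the question above, in case it helps: spin_lock() indeed does
not disable interrupts by itself. The commit message's point appears to
be that cm_alloc_msg() is reached from send paths that already hold
cm_id_priv->lock taken with spin_lock_irq(), so any time spent spinning
on the contended mad_agent_lock, plus the rdma_create_ah() call made
under it, happens with local interrupts off on that CPU. A rough sketch
of that nesting; this is simplified, not literal cm.c code, and the
caller name cm_send_example() is made up:

static void cm_send_example(struct cm_id_private *cm_id_priv)
{
        struct ib_mad_send_buf *msg;

        spin_lock_irq(&cm_id_priv->lock);       /* local interrupts disabled here */

        /*
         * cm_alloc_msg() spins on mad_agent_lock and calls
         * rdma_create_ah() while interrupts are still off on this
         * CPU, so heavy contention on mad_agent_lock is served
         * entirely inside an IRQs-off region, which is what produces
         * the hung task / TX timeout symptoms described below.
         */
        msg = cm_alloc_msg(cm_id_priv);

        /* ... format and post msg, then drop the lock (error handling omitted) ... */

        spin_unlock_irq(&cm_id_priv->lock);
}
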
> The end result is the machine becoming unresponsive, hung
> task warnings, netdev TX timeouts, etc.
>
> Since the lock appears to be only for protection from
> cm_remove_one, it can be changed to a rwlock to resolve
> these issues.
>
> Reproducer:
>
> Server:
> for i in $(seq 1 512); do
> ucmatose -c 32 -p $((i + 5000)) &
> done
>
> Client:
> for i in $(seq 1 512); do
> ucmatose -c 32 -p $((i + 5000)) -s 10.2.0.52 &
> done
>
> Fixes: 76039ac9095f5ee5 ("IB/cm: Protect cm_dev, cm_ports and mad_agent with kref and lock")
> Signed-off-by: Jacob Moroni <jmoroni@xxxxxxxxxx>
> ---
>  drivers/infiniband/core/cm.c | 16 ++++++++--------
>  1 file changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
> index 142170473e75..effa53dd6800 100644
> --- a/drivers/infiniband/core/cm.c
> +++ b/drivers/infiniband/core/cm.c
> @@ -167,7 +167,7 @@ struct cm_port {
>  struct cm_device {
>          struct kref kref;
>          struct list_head list;
> -        spinlock_t mad_agent_lock;
> +        rwlock_t mad_agent_lock;
>          struct ib_device *ib_device;
>          u8 ack_delay;
>          int going_down;
> @@ -285,7 +285,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
>          if (!cm_id_priv->av.port)
>                  return ERR_PTR(-EINVAL);
>
> -        spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
> +        read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
>          mad_agent = cm_id_priv->av.port->mad_agent;
>          if (!mad_agent) {
>                  m = ERR_PTR(-EINVAL);
> @@ -311,7 +311,7 @@ static struct ib_mad_send_buf *cm_alloc_msg(struct cm_id_private *cm_id_priv)
>          m->ah = ah;
>
>  out:
> -        spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
> +        read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
>          return m;
>  }
>
> @@ -1297,10 +1297,10 @@ static __be64 cm_form_tid(struct cm_id_private *cm_id_priv)
>          if (!cm_id_priv->av.port)
>                  return cpu_to_be64(low_tid);
>
> -        spin_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
> +        read_lock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
>          if (cm_id_priv->av.port->mad_agent)
>                  hi_tid = ((u64)cm_id_priv->av.port->mad_agent->hi_tid) << 32;
> -        spin_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
> +        read_unlock(&cm_id_priv->av.port->cm_dev->mad_agent_lock);
>          return cpu_to_be64(hi_tid | low_tid);
>  }
>
> @@ -4378,7 +4378,7 @@ static int cm_add_one(struct ib_device *ib_device)
>                  return -ENOMEM;
>
>          kref_init(&cm_dev->kref);
> -        spin_lock_init(&cm_dev->mad_agent_lock);
> +        rwlock_init(&cm_dev->mad_agent_lock);
>          cm_dev->ib_device = ib_device;
>          cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
>          cm_dev->going_down = 0;
> @@ -4494,9 +4494,9 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
>                   * The above ensures no call paths from the work are running,
>                   * the remaining paths all take the mad_agent_lock.
>                   */
> -                spin_lock(&cm_dev->mad_agent_lock);
> +                write_lock(&cm_dev->mad_agent_lock);
>                  port->mad_agent = NULL;
> -                spin_unlock(&cm_dev->mad_agent_lock);
> +                write_unlock(&cm_dev->mad_agent_lock);
>                  ib_unregister_mad_agent(mad_agent);
>                  ib_port_unregister_client_groups(ib_device, i,
>                                                   cm_counter_groups);
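
On why a rwlock resolves this: cm_alloc_msg() and cm_form_tid() only
read port->mad_agent and need it to stay valid, they never modify it,
so they can all hold mad_agent_lock in read mode concurrently; only
cm_remove_one() needs exclusive access when it clears the pointer. A
minimal sketch of the resulting locking model (pseudocode, not the
literal cm.c paths):

        /* read side: many connection setups may run this concurrently */
        read_lock(&cm_dev->mad_agent_lock);
        mad_agent = port->mad_agent;    /* may be NULL after removal */
        if (mad_agent) {
                /* ib_create_send_mad() / rdma_create_ah() now run in parallel */
        }
        read_unlock(&cm_dev->mad_agent_lock);

        /* write side: only cm_remove_one(), which fences out all readers */
        write_lock(&cm_dev->mad_agent_lock);
        port->mad_agent = NULL;
        write_unlock(&cm_dev->mad_agent_lock);

Since the write side only runs on device removal, a reader-heavy rwlock
matches this usage well.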