Re: [PATCH RFC] RDMA/cm: add timeout to cm_destroy_id wait

Leon Romanovsky <leon@xxxxxxxxxx> · Thu, 7 Mar 2024 11:46:07 +0200

On Tue, Mar 05, 2024 at 02:59:11PM -0800, Manjunath Patil wrote:
> 
> 
> On 3/3/24 1:58 AM, Leon Romanovsky wrote:
> > On Tue, Feb 27, 2024 at 12:00:17PM -0800, Manjunath Patil wrote:
> > > Add timeout to cm_destroy_id, so that userspace can trigger any data
> > > collection that would help in analyzing the cause of delay in destroying
> > > the cm_id.
> > 
> > Why doesn't rdmatool resource cm_id dump help to see stalled cm_ids?
> Wouldn't this require us to know cm_id before hand?
> 
> I am unfamiliar with rdmatool. Can you explain how I use it to detect a stalled connection?

Please see it if it can help:
https://www.man7.org/linux/man-pages/man8/rdma-resource.8.html
rdma resource show cm_id ...

> I wouldn't know cm_id before hand to track it to see if that is stalled.
> 
> My intention is to have a script monitor for stalled connections[Ex: one of my connections was stuck in destroying it's cm_id] and trigger things like firmware dumps, enable more logging in related modules, crash node if this takes longer than few minutes etc.
> 
> The current logic is to, have this timeout trigger a function(which is traceable with ebpf/dtrace) in error path, if more than expected time is spent is destroying the cm_id.

I'm not against the idea to warn about stalled destroy_id, I'm against
adding new knob to control this timeout.

Thanks

> 
> -Thank you,
> Manjunath
> > 
> > Thanks
> > 
> > > 
> > > New noinline function helps dtrace/ebpf programs to hook on to it.
> > > Existing functionality isn't changed except triggering a probe-able new
> > > function at every timeout interval.
> > > 
> > > We have seen cases where CM messages stuck with MAD layer (either due to
> > > software bug or faulty HCA), leading to cm_id getting stuck in the
> > > following call stack. This patch helps in resolving such issues faster.
> > > 
> > > kernel: ... INFO: task XXXX:56778 blocked for more than 120 seconds.
> > > ...
> > > 	Call Trace:
> > > 	__schedule+0x2bc/0x895
> > > 	schedule+0x36/0x7c
> > > 	schedule_timeout+0x1f6/0x31f
> > >   	? __slab_free+0x19c/0x2ba
> > > 	wait_for_completion+0x12b/0x18a
> > > 	? wake_up_q+0x80/0x73
> > > 	cm_destroy_id+0x345/0x610 [ib_cm]
> > > 	ib_destroy_cm_id+0x10/0x20 [ib_cm]
> > > 	rdma_destroy_id+0xa8/0x300 [rdma_cm]
> > > 	ucma_destroy_id+0x13e/0x190 [rdma_ucm]
> > > 	ucma_write+0xe0/0x160 [rdma_ucm]
> > > 	__vfs_write+0x3a/0x16d
> > > 	vfs_write+0xb2/0x1a1
> > > 	? syscall_trace_enter+0x1ce/0x2b8
> > > 	SyS_write+0x5c/0xd3
> > > 	do_syscall_64+0x79/0x1b9
> > > 	entry_SYSCALL_64_after_hwframe+0x16d/0x0
> > > 
> > > Signed-off-by: Manjunath Patil <manjunath.b.patil@xxxxxxxxxx>
> > > ---
> > >   drivers/infiniband/core/cm.c | 38 +++++++++++++++++++++++++++++++++++-
> > >   1 file changed, 37 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
> > > index ff58058aeadc..03f7b80efa77 100644
> > > --- a/drivers/infiniband/core/cm.c
> > > +++ b/drivers/infiniband/core/cm.c
> > > @@ -34,6 +34,20 @@ MODULE_AUTHOR("Sean Hefty");
> > >   MODULE_DESCRIPTION("InfiniBand CM");
> > >   MODULE_LICENSE("Dual BSD/GPL");
> > > +static unsigned long cm_destroy_id_wait_timeout_sec = 10;
> > > +
> > > +static struct ctl_table_header *cm_ctl_table_header;
> > > +static struct ctl_table cm_ctl_table[] = {
> > > +	{
> > > +		.procname	= "destroy_id_wait_timeout_sec",
> > > +		.data		= &cm_destroy_id_wait_timeout_sec,
> > > +		.maxlen		= sizeof(cm_destroy_id_wait_timeout_sec),
> > > +		.mode		= 0644,
> > > +		.proc_handler	= proc_doulongvec_minmax,
> > > +	},
> > > +	{ }
> > > +};
> > > +
> > >   static const char * const ibcm_rej_reason_strs[] = {
> > >   	[IB_CM_REJ_NO_QP]			= "no QP",
> > >   	[IB_CM_REJ_NO_EEC]			= "no EEC",
> > > @@ -1025,10 +1039,20 @@ static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
> > >   	}
> > >   }
> > > +static noinline void cm_destroy_id_wait_timeout(struct ib_cm_id *cm_id)
> > > +{
> > > +	struct cm_id_private *cm_id_priv;
> > > +
> > > +	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
> > > +	pr_err("%s: cm_id=%p timed out. state=%d refcnt=%d\n", __func__,
> > > +	       cm_id, cm_id->state, refcount_read(&cm_id_priv->refcount));
> > > +}
> > > +
> > >   static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
> > >   {
> > >   	struct cm_id_private *cm_id_priv;
> > >   	struct cm_work *work;
> > > +	int ret;
> > >   	cm_id_priv = container_of(cm_id, struct cm_id_private, id);
> > >   	spin_lock_irq(&cm_id_priv->lock);
> > > @@ -1135,7 +1159,14 @@ static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
> > >   	xa_erase(&cm.local_id_table, cm_local_id(cm_id->local_id));
> > >   	cm_deref_id(cm_id_priv);
> > > -	wait_for_completion(&cm_id_priv->comp);
> > > +	do {
> > > +		ret = wait_for_completion_timeout(&cm_id_priv->comp,
> > > +						  msecs_to_jiffies(
> > > +				cm_destroy_id_wait_timeout_sec * 1000));
> > > +		if (!ret) /* timeout happened */
> > > +			cm_destroy_id_wait_timeout(cm_id);
> > > +	} while (!ret);
> > > +
> > >   	while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
> > >   		cm_free_work(work);
> > > @@ -4505,6 +4536,10 @@ static int __init ib_cm_init(void)
> > >   	ret = ib_register_client(&cm_client);
> > >   	if (ret)
> > >   		goto error3;
> > > +	cm_ctl_table_header = register_net_sysctl(&init_net,
> > > +						  "net/ib_cm", cm_ctl_table);
> > > +	if (!cm_ctl_table_header)
> > > +		pr_warn("ib_cm: couldn't register sysctl path, using default values\n");
> > >   	return 0;
> > >   error3:
> > > @@ -4522,6 +4557,7 @@ static void __exit ib_cm_cleanup(void)
> > >   		cancel_delayed_work(&timewait_info->work.work);
> > >   	spin_unlock_irq(&cm.lock);
> > > +	unregister_net_sysctl_table(cm_ctl_table_header);
> > >   	ib_unregister_client(&cm_client);
> > >   	destroy_workqueue(cm.wq);
> > > -- 
> > > 2.31.1
> > > 
> > >