> -----Original Message-----
> From: Stefan Metzmacher <metze@xxxxxxxxx>
> Sent: Wednesday, 15 June 2022 17:27
> To: Bernard Metzler <BMT@xxxxxxxxxxxxxx>; linux-rdma@xxxxxxxxxxxxxxx
> Cc: Stefan Metzmacher <metze@xxxxxxxxx>
> Subject: [EXTERNAL] [PATCH v2 14/14] rdma/siw: implement non-blocking connect.
>
> This is very important in order to prevent deadlocks.
>
> The RDMA application layer expects rdma_connect() to be non-blocking
> as the completion is handled via RDMA_CM_EVENT_ESTABLISHED and
> other async events. It's not unlikely to hold a lock during
> the rdma_connect() call.
>
> Without this, a connection attempt to a non-existing/unreachable
> server blocks until the very long TCP timeout hits.
> The application layer has no chance to use its own timeout handler,
> as that would just deadlock with the already blocking rdma_connect().
>
> First, rdma_connect() holds id_priv->handler_mutex and deadlocks
> rdma_destroy_id().
>
> And iw_cm_connect(), called from within rdma_connect(), sets
> IWCM_F_CONNECT_WAIT during the call to cm_id->device->ops.iw_connect(),
> siw_connect() in this case. This means that iw_cm_disconnect()
> and iw_destroy_cm_id() will both deadlock waiting for
> IWCM_F_CONNECT_WAIT to be cleared.
> 
> Fixes: 6c52fdc244b5 ("rdma/siw: connection management")
> Signed-off-by: Stefan Metzmacher <metze@xxxxxxxxx>
> Cc: Bernard Metzler <bmt@xxxxxxxxxxxxxx>
> Cc: linux-rdma@xxxxxxxxxxxxxxx
> ---
>  drivers/infiniband/sw/siw/siw_cm.c | 125 ++++++++++++++++++++++-------
>  drivers/infiniband/sw/siw/siw_cm.h |   1 +
>  2 files changed, 96 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c
> index 74ed2a5a8f47..5b051aaa3fec 100644
> --- a/drivers/infiniband/sw/siw/siw_cm.c
> +++ b/drivers/infiniband/sw/siw/siw_cm.c
> @@ -37,6 +37,7 @@ static void siw_cm_llp_write_space(struct sock *s);
>  static void siw_cm_llp_error_report(struct sock *s);
>  static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
>  			 int status);
> +static void siw_connected(struct siw_cep *cep);
>  
>  static void siw_sk_assign_cm_upcalls(struct sock *sk)
>  {
> @@ -1074,6 +1075,10 @@ static void siw_cm_work_handler(struct work_struct *w)
>  		siw_accept_newconn(cep);
>  		break;
>  
> +	case SIW_CM_WORK_CONNECTED:
> +		siw_connected(cep);
> +		break;
> +
>  	case SIW_CM_WORK_READ_MPAHDR:
>  		if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
>  			if (cep->listen_cep) {
> @@ -1239,6 +1244,7 @@ static void siw_cm_llp_data_ready(struct sock *sk)
>  	switch (cep->state) {
>  	case SIW_EPSTATE_RDMA_MODE:
>  	case SIW_EPSTATE_LISTENING:
> +	case SIW_EPSTATE_CONNECTING:
>  		break;
>  
>  	case SIW_EPSTATE_AWAIT_MPAREQ:
> @@ -1292,12 +1298,26 @@ static void siw_cm_llp_state_change(struct sock *sk)
>  
>  	switch (sk->sk_state) {
>  	case TCP_ESTABLISHED:
> -		/*
> -		 * handle accepting socket as special case where only
> -		 * new connection is possible
> -		 */
> -		siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
> -		break;
> +		if (cep->state == SIW_EPSTATE_CONNECTING) {
> +			/*
> +			 * handle accepting socket as special case where only
> +			 * new connection is possible
> +			 */
> +			siw_cm_queue_work(cep, SIW_CM_WORK_CONNECTED);
> +			break;
> +
> +		} else if (cep->state == SIW_EPSTATE_LISTENING) {
> +			/*
> +			 * handle accepting socket as special case where only
> +			 * new connection is possible
> +			 */
> +			siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
> +			break;
> +		}
> +		siw_dbg_cep(cep,
> +			    "unexpected socket state %d with cep state %d\n",
> +			    sk->sk_state, cep->state);
> +		fallthrough;
>  
>  	case TCP_CLOSE:
>  	case TCP_CLOSE_WAIT:
> @@ -1316,7 +1336,7 @@ static void siw_cm_llp_state_change(struct sock *sk)
>  static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
>  			      struct sockaddr *raddr, bool afonly)
>  {
> -	int rv, flags = 0;
> +	int rv;
>  	size_t size = laddr->sa_family == AF_INET ?
>  		sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
>  
> @@ -1335,7 +1355,18 @@ static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
>  	if (rv < 0)
>  		return rv;
>  
> -	rv = kernel_connect(s, raddr, size, flags);
> +	/*
> +	 * Yes, this is really O_NONBLOCK instead of
> +	 * SOCK_NONBLOCK.
> +	 *
> +	 * __sys_connect_file() passes
> +	 * sock->file->f_flags | file_flags to
> +	 * sock->ops->connect().
> +	 *
> +	 * Also io_connect() from io_uring forces
> +	 * file_flags=O_NONBLOCK to __sys_connect_file().
> +	 */

No need for this comment. We have many cases in kernel code where
kernel_connect() gets called with O_NONBLOCK. I also do not see why we
should discuss io_uring details here. Btw, afaics, SOCK_NONBLOCK is a
#define to O_NONBLOCK (linux/net.h).

> +	rv = kernel_connect(s, raddr, size, O_NONBLOCK);
>  
>  	return rv < 0 ? rv : 0;
>  }
> @@ -1482,36 +1513,27 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
>  		goto error;
>  	}
>  
> -	/*
> -	 * NOTE: For simplification, connect() is called in blocking
> -	 * mode. Might be reconsidered for async connection setup at
> -	 * TCP level.
> -	 */
>  	rv = kernel_bindconnect(s, laddr, raddr, id->afonly);
> +	if (rv == -EINPROGRESS) {
> +		siw_dbg_qp(qp, "kernel_bindconnect: EINPROGRESS\n");
> +		rv = 0;
> +	}
>  	if (rv != 0) {
>  		siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
>  		goto error;
>  	}
> -	if (siw_tcp_nagle == false)
> -		tcp_sock_set_nodelay(s->sk);
> -
> -	cep->state = SIW_EPSTATE_AWAIT_MPAREP;
>  
> -	rv = siw_send_mpareqrep(cep, cep->mpa.pdata,
> -				cep->mpa.hdr.params.pd_len);
>  	/*
> -	 * Reset private data.

Here, and in some more places: let's keep comments short and concise.
I'd prefer something like:

/*
 * Completion of TCP connection establishment will be
 * handled asynchronously via SIW_CM_WORK_CONNECTED.
 */

> +	 * The rest will be done by siw_connected()
> +	 *
> +	 * siw_cm_llp_state_change() will detect
> +	 * TCP_ESTABLISHED and schedules SIW_CM_WORK_CONNECTED,
> +	 * which will finally call siw_connected().
> +	 *
> +	 * As siw_cm_llp_state_change() handles everything
> +	 * siw_cm_llp_data_ready() can be a noop for
> +	 * SIW_EPSTATE_CONNECTING.
>  	 */
> -	if (cep->mpa.hdr.params.pd_len) {
> -		cep->mpa.hdr.params.pd_len = 0;
> -		kfree(cep->mpa.pdata);
> -		cep->mpa.pdata = NULL;
> -	}
> -
> -	if (rv < 0) {
> -		goto error;
> -	}
> -
>  	siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
>  	siw_cep_set_free(cep);
>  	return 0;
> @@ -1549,6 +1571,49 @@ int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
>  	return rv;
>  }
>  
> +static void siw_connected(struct siw_cep *cep)
> +{
> +	struct siw_qp *qp = cep->qp;
> +	struct socket *s = cep->sock;
> +	int rv = -ECONNABORTED;
> +
> +	/*
> +	 * already called with
> +	 * siw_cep_set_inuse(cep);
> +	 */
> +
> +	if (cep->state != SIW_EPSTATE_CONNECTING)
> +		goto error;
> +
> +	if (siw_tcp_nagle == false)
> +		tcp_sock_set_nodelay(s->sk);
> +
> +	cep->state = SIW_EPSTATE_AWAIT_MPAREP;
> +
> +	rv = siw_send_mpareqrep(cep, cep->mpa.pdata,
> +				cep->mpa.hdr.params.pd_len);
> +	/*
> +	 * Reset private data.
> +	 */
> +	if (cep->mpa.hdr.params.pd_len) {
> +		cep->mpa.hdr.params.pd_len = 0;
> +		kfree(cep->mpa.pdata);
> +		cep->mpa.pdata = NULL;
> +	}
> +
> +	if (rv < 0) {
> +		goto error;
> +	}
> +
> +	siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
> +	return;
> +
> +error:
> +	siw_dbg_cep(cep, "[QP %u]: exit, error %d\n", qp_id(qp), rv);
> +	siw_qp_cm_drop(qp, 1);
> +	return;
> +}
> +
>  /*
>   * siw_accept - Let SoftiWARP accept an RDMA connection request
>   *
> diff --git a/drivers/infiniband/sw/siw/siw_cm.h b/drivers/infiniband/sw/siw/siw_cm.h
> index 8c59cb3e2868..c01bdc8e64ee 100644
> --- a/drivers/infiniband/sw/siw/siw_cm.h
> +++ b/drivers/infiniband/sw/siw/siw_cm.h
> @@ -76,6 +76,7 @@ struct siw_cep {
>  
>  enum siw_work_type {
>  	SIW_CM_WORK_ACCEPT = 1,
> +	SIW_CM_WORK_CONNECTED,
>  	SIW_CM_WORK_READ_MPAHDR,
>  	SIW_CM_WORK_CLOSE_LLP,		/* close socket */
>  	SIW_CM_WORK_PEER_CLOSE,		/* socket indicated peer close */
> --
> 2.34.1

Actually, the changes are larger than I would have expected, since the
patch also reorganizes some code. It requires me to do some in-depth
testing to make sure we did not break the (already over-complicated)
siw CM code; I remember how long it took to get it stable in large
deployments (4k nodes and beyond).

I admit it may really help to understand the reasoning behind some code
decisions better if we look at incremental patches in a set. But it
makes it hard to comment on and to get the overall picture.

I'd suggest sending just two patches: one introducing kernel_bind() and
friends (your 08/14), which is unrelated to the overall aim of the set
and can be accepted immediately, and one with the async connect logic.

Thank you,
Bernard.