Re: [PATCH v1] svcrdma: Remove max_sge check at connect time

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Jan 25, 2019 at 04:54:54PM -0500, Chuck Lever wrote:
> Two and a half years ago, the client was changed to use gathered
> Send for larger inline messages, in commit 655fec6987b ("xprtrdma:
> Use gathered Send for large inline messages"). Several fixes were
> required because there are a few in-kernel device drivers whose
> max_sge is 3, and these were broken by the change.
> 
> Apparently my memory is going, because some time later, I submitted
> commit 25fd86eca11c ("svcrdma: Don't overrun the SGE array in
> svc_rdma_send_ctxt"), and after that, commit f3c1fd0ee294 ("svcrdma:
> Reduce max_send_sges"). These too incorrectly assumed in-kernel
> device drivers would have more than a few Send SGEs available.
> 
> The fix for the server side is not the same. This is because the
> fundamental problem on the server is that, whether or not the client
> has provisioned a chunk for the RPC reply, the server must squeeze
> even the most complex RPC replies into a single RDMA Send. Failing
> in the send path because of Send SGE exhaustion should never be an
> option.
> 
> Therefore, instead of failing when the send path runs out of SGEs,
> switch to using a bounce buffer mechanism to handle RPC replies that
> are too complex for the device to send directly. That allows us to
> remove the max_sge check to enable drivers with small max_sge to
> work again.
> 
> Reported-by: Don Dutile <ddutile@xxxxxxxxxx>
> Fixes: 25fd86eca11c ("svcrdma: Don't overrun the SGE array in ...")

Thanks!  Do you think this is suitable for 5.0 and stable, or should I
save it up for 5.1?

--b.

> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
> ---
>  net/sunrpc/xprtrdma/svc_rdma_sendto.c    |  105 ++++++++++++++++++++++++++++--
>  net/sunrpc/xprtrdma/svc_rdma_transport.c |    9 +--
>  2 files changed, 102 insertions(+), 12 deletions(-)
> 
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
> index 1aab305fbbb6..8235ca2159d1 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
> @@ -531,6 +531,99 @@ void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
>  				      DMA_TO_DEVICE);
>  }
>  
> +/* If the xdr_buf has more elements than the device can
> + * transmit in a single RDMA Send, then the reply will
> + * have to be copied into a bounce buffer.
> + */
> +static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
> +				    struct xdr_buf *xdr,
> +				    __be32 *wr_lst)
> +{
> +	int elements;
> +
> +	/* xdr->head */
> +	elements = 1;
> +
> +	/* xdr->pages */
> +	if (!wr_lst) {
> +		unsigned int remaining;
> +		unsigned long pageoff;
> +
> +		pageoff = xdr->page_base & ~PAGE_MASK;
> +		remaining = xdr->page_len;
> +		while (remaining) {
> +			++elements;
> +			remaining -= min_t(u32, PAGE_SIZE - pageoff,
> +					   remaining);
> +			pageoff = 0;
> +		}
> +	}
> +
> +	/* xdr->tail */
> +	if (xdr->tail[0].iov_len)
> +		++elements;
> +
> +	/* assume 1 SGE is needed for the transport header */
> +	return elements >= rdma->sc_max_send_sges;
> +}
> +
> +/* The device is not capable of sending the reply directly.
> + * Assemble the elements of @xdr into the transport header
> + * buffer.
> + */
> +static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
> +				      struct svc_rdma_send_ctxt *ctxt,
> +				      struct xdr_buf *xdr, __be32 *wr_lst)
> +{
> +	unsigned char *dst, *tailbase;
> +	unsigned int taillen;
> +
> +	dst = ctxt->sc_xprt_buf;
> +	dst += ctxt->sc_sges[0].length;
> +
> +	memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
> +	dst += xdr->head[0].iov_len;
> +
> +	tailbase = xdr->tail[0].iov_base;
> +	taillen = xdr->tail[0].iov_len;
> +	if (wr_lst) {
> +		u32 xdrpad;
> +
> +		xdrpad = xdr_padsize(xdr->page_len);
> +		if (taillen && xdrpad) {
> +			tailbase += xdrpad;
> +			taillen -= xdrpad;
> +		}
> +	} else {
> +		unsigned int len, remaining;
> +		unsigned long pageoff;
> +		struct page **ppages;
> +
> +		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
> +		pageoff = xdr->page_base & ~PAGE_MASK;
> +		remaining = xdr->page_len;
> +		while (remaining) {
> +			len = min_t(u32, PAGE_SIZE - pageoff, remaining);
> +
> +			memcpy(dst, page_address(*ppages), len);
> +			remaining -= len;
> +			dst += len;
> +			pageoff = 0;
> +		}
> +	}
> +
> +	if (taillen)
> +		memcpy(dst, tailbase, taillen);
> +
> +	ctxt->sc_sges[0].length += xdr->len;
> +	ib_dma_sync_single_for_device(rdma->sc_pd->device,
> +				      ctxt->sc_sges[0].addr,
> +				      ctxt->sc_sges[0].length,
> +				      DMA_TO_DEVICE);
> +
> +	return 0;
> +}
> +
>  /* svc_rdma_map_reply_msg - Map the buffer holding RPC message
>   * @rdma: controlling transport
>   * @ctxt: send_ctxt for the Send WR
> @@ -553,8 +646,10 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>  	u32 xdr_pad;
>  	int ret;
>  
> -	if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
> -		return -EIO;
> +	if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
> +		return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
> +
> +	++ctxt->sc_cur_sge_no;
>  	ret = svc_rdma_dma_map_buf(rdma, ctxt,
>  				   xdr->head[0].iov_base,
>  				   xdr->head[0].iov_len);
> @@ -585,8 +680,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>  	while (remaining) {
>  		len = min_t(u32, PAGE_SIZE - page_off, remaining);
>  
> -		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
> -			return -EIO;
> +		++ctxt->sc_cur_sge_no;
>  		ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
>  					    page_off, len);
>  		if (ret < 0)
> @@ -600,8 +694,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>  	len = xdr->tail[0].iov_len;
>  tail:
>  	if (len) {
> -		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
> -			return -EIO;
> +		++ctxt->sc_cur_sge_no;
>  		ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
>  		if (ret < 0)
>  			return ret;
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> index 6f9f4654338c..027a3b07d329 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> @@ -419,12 +419,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>  	/* Transport header, head iovec, tail iovec */
>  	newxprt->sc_max_send_sges = 3;
>  	/* Add one SGE per page list entry */
> -	newxprt->sc_max_send_sges += svcrdma_max_req_size / PAGE_SIZE;
> -	if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) {
> -		pr_err("svcrdma: too few Send SGEs available (%d needed)\n",
> -		       newxprt->sc_max_send_sges);
> -		goto errout;
> -	}
> +	newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1;
> +	if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
> +		newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
>  	newxprt->sc_max_req_size = svcrdma_max_req_size;
>  	newxprt->sc_max_requests = svcrdma_max_requests;
>  	newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;



[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux