> On Mar 31, 2021, at 2:34 PM, Tom Talpey <tom@xxxxxxxxxx> wrote:
>
> On 3/29/2021 10:40 AM, Chuck Lever wrote:
>> Currently the generic RPC server layer calls svc_rdma_recvfrom()
>> twice to retrieve an RPC message that uses Read chunks. I'm not
>> exactly sure why this design was chosen originally.
>
> I'm not either, but remember the design was written over a decade
> ago. I vaguely recall there was some bounce buffering for strange
> memreg corner cases. The RDMA stack has improved greatly.
>
>> Instead, let's wait for the Read chunk completion inline in the
>> first call to svc_rdma_recvfrom().
>>
>> The goal is to eliminate some page allocator churn.
>> rdma_read_complete() replaces pages in the second svc_rqst by
>> calling put_page() repeatedly while the upper layer waits for
>> the request to be constructed, which adds unnecessary round-
>> trip latency.
>
> Local API round-trip, right? Same wire traffic either way. In fact,
> I don't see any Verbs changes either.

The rdma_read_complete() latency is incurred during the second call
to svc_rdma_recvfrom(). That holds up the construction of each message
that has a Read chunk, since the page release operations need to
complete before the second svc_rdma_recvfrom() call can return.

It also holds up the next message in the queue, because XPT_BUSY is
held while the Read chunk and the page allocator churn are being dealt
with. Therefore it lengthens NFS WRITE round-trip latency by inserting
page release operations (put_page()) in a hot path.

> Some comments/questions below.
>
>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> ---
>>  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c |   10 +--
>>  net/sunrpc/xprtrdma/svc_rdma_rw.c       |   96 +++++++++++--------------------
>>  2 files changed, 39 insertions(+), 67 deletions(-)
>>
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> index 9cb5a09c4a01..b857a6805e95 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> @@ -853,6 +853,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
>>  	spin_unlock(&rdma_xprt->sc_rq_dto_lock);
>>  	percpu_counter_inc(&svcrdma_stat_recv);
>>
>> +	/* Start receiving the next incoming message */
>
> This comment confused me. This call just unblocks the xprt to move
> to the next message, it does not necessarily "start". So IIUC, it
> might be clearer to state "transport processing complete" or similar.

How about "/* Unblock the transport for the next receive */" ?
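As an aside, before the diff resumes, here is a minimal sketch of the
completion-based wait the patch adopts. The demo_* names below are
hypothetical; the real code stores the status in cc_status and signals
cc_done in svc_rdma_wc_read_done(), and svc_rdma_process_read_list()
sleeps on it instead of deferring the remaining work to a second
svc_rdma_recvfrom() call.

#include <linux/completion.h>
#include <linux/errno.h>
#include <rdma/ib_verbs.h>

struct demo_read_ctxt {
	struct ib_cqe		cqe;	/* embeds the completion callback */
	enum ib_wc_status	status;	/* result recorded by the handler */
	struct completion	done;	/* signaled when the Read finishes */
};

/* Completion handler: runs in CQ context and must not sleep. It only
 * records the WC status and wakes the waiter.
 */
static void demo_read_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct demo_read_ctxt *ctxt =
		container_of(wc->wr_cqe, struct demo_read_ctxt, cqe);

	ctxt->status = wc->status;
	complete(&ctxt->done);
}

/* Posting path: sleeps until the Read completes, so the caller can
 * finish building rq_arg in a single pass instead of returning and
 * being called a second time.
 */
static int demo_post_read_and_wait(struct demo_read_ctxt *ctxt)
{
	init_completion(&ctxt->done);
	ctxt->cqe.done = demo_read_done;

	/* ... post the RDMA Read WR(s) here, with wr_cqe = &ctxt->cqe ... */

	wait_for_completion(&ctxt->done);
	return ctxt->status == IB_WC_SUCCESS ? 0 : -EIO;
}

Note that the patch also moves svc_xprt_received() earlier in
svc_rdma_recvfrom(), so the transport is no longer marked busy while
this wait happens and another nfsd thread can pick up the next message.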
>> +	svc_xprt_received(xprt);
>> +
>>  	ib_dma_sync_single_for_cpu(rdma_xprt->sc_pd->device,
>>  				   ctxt->rc_recv_sge.addr, ctxt->rc_byte_len,
>>  				   DMA_FROM_DEVICE);
>> @@ -884,33 +887,28 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
>>  	rqstp->rq_xprt_ctxt = ctxt;
>>  	rqstp->rq_prot = IPPROTO_MAX;
>>  	svc_xprt_copy_addrs(rqstp, xprt);
>> -	svc_xprt_received(xprt);
>>  	return rqstp->rq_arg.len;
>>
>>  out_readlist:
>>  	ret = svc_rdma_process_read_list(rdma_xprt, rqstp, ctxt);
>>  	if (ret < 0)
>>  		goto out_readfail;
>> -	svc_xprt_received(xprt);
>> -	return 0;
>> +	goto complete;
>>
>>  out_err:
>>  	svc_rdma_send_error(rdma_xprt, ctxt, ret);
>>  	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
>> -	svc_xprt_received(xprt);
>>  	return 0;
>>
>>  out_readfail:
>>  	if (ret == -EINVAL)
>>  		svc_rdma_send_error(rdma_xprt, ctxt, ret);
>>  	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
>> -	svc_xprt_received(xprt);
>>  	return ret;
>>
>>  out_backchannel:
>>  	svc_rdma_handle_bc_reply(rqstp, ctxt);
>>  out_drop:
>>  	svc_rdma_recv_ctxt_put(rdma_xprt, ctxt);
>> -	svc_xprt_received(xprt);
>>  	return 0;
>>  }
>>
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
>> index d7054e3a8e33..9163ab690288 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
>> @@ -150,6 +150,8 @@ struct svc_rdma_chunk_ctxt {
>>  	struct svcxprt_rdma	*cc_rdma;
>>  	struct list_head	cc_rwctxts;
>>  	int			cc_sqecount;
>> +	enum ib_wc_status	cc_status;
>> +	struct completion	cc_done;
>>  };
>>
>>  static void svc_rdma_cc_cid_init(struct svcxprt_rdma *rdma,
>> @@ -299,29 +301,15 @@ static void svc_rdma_wc_read_done(struct ib_cq *cq, struct ib_wc *wc)
>>  	struct svc_rdma_chunk_ctxt *cc =
>>  			container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe);
>>  	struct svcxprt_rdma *rdma = cc->cc_rdma;
>> -	struct svc_rdma_read_info *info =
>> -			container_of(cc, struct svc_rdma_read_info, ri_cc);
>>
>>  	trace_svcrdma_wc_read(wc, &cc->cc_cid);
>>
>>  	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
>>  	wake_up(&rdma->sc_send_wait);
>>
>> -	if (unlikely(wc->status != IB_WC_SUCCESS)) {
>> -		set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
>> -		svc_rdma_recv_ctxt_put(rdma, info->ri_readctxt);
>> -	} else {
>> -		spin_lock(&rdma->sc_rq_dto_lock);
>> -		list_add_tail(&info->ri_readctxt->rc_list,
>> -			      &rdma->sc_read_complete_q);
>> -		/* Note the unlock pairs with the smp_rmb in svc_xprt_ready: */
>> -		set_bit(XPT_DATA, &rdma->sc_xprt.xpt_flags);
>> -		spin_unlock(&rdma->sc_rq_dto_lock);
>> -
>> -		svc_xprt_enqueue(&rdma->sc_xprt);
>> -	}
>> -
>> -	svc_rdma_read_info_free(info);
>> +	cc->cc_status = wc->status;
>> +	complete(&cc->cc_done);
>> +	return;
>>  }
>>
>>  /* This function sleeps when the transport's Send Queue is congested.
>> @@ -676,8 +664,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
>>  	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>>  	struct svc_rdma_chunk_ctxt *cc = &info->ri_cc;
>>  	struct svc_rqst *rqstp = info->ri_rqst;
>> -	struct svc_rdma_rw_ctxt *ctxt;
>>  	unsigned int sge_no, seg_len, len;
>> +	struct svc_rdma_rw_ctxt *ctxt;
>>  	struct scatterlist *sg;
>>  	int ret;
>>
>> @@ -693,8 +681,8 @@ static int svc_rdma_build_read_segment(struct svc_rdma_read_info *info,
>>  		seg_len = min_t(unsigned int, len,
>>  				PAGE_SIZE - info->ri_pageoff);
>>
>> -		head->rc_arg.pages[info->ri_pageno] =
>> -			rqstp->rq_pages[info->ri_pageno];
>> +		/* XXX: ri_pageno and rc_page_count might be exactly the same */
>> +
>
> What is this comment conveying? It looks like a note-to-self that
> resulted in deleting the prior line.

Yeah, pretty much.
I think it is a reminder that one of those fields is superfluous and
can be removed.

> If the "XXX" notation is
> still significant, it needs more detail on what needs to be
> fixed in future. Or it should be removed before this patch is merged.

>>  		if (!info->ri_pageoff)
>>  			head->rc_page_count++;
>>
>> @@ -788,12 +776,10 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
>>  		page_len = min_t(unsigned int, remaining,
>>  				 PAGE_SIZE - info->ri_pageoff);
>>
>> -		head->rc_arg.pages[info->ri_pageno] =
>> -			rqstp->rq_pages[info->ri_pageno];
>>  		if (!info->ri_pageoff)
>>  			head->rc_page_count++;
>>
>> -		dst = page_address(head->rc_arg.pages[info->ri_pageno]);
>> +		dst = page_address(rqstp->rq_pages[info->ri_pageno]);
>>  		memcpy(dst + info->ri_pageno, src + offset, page_len);
>>
>>  		info->ri_totalbytes += page_len;
>> @@ -813,7 +799,7 @@ static int svc_rdma_copy_inline_range(struct svc_rdma_read_info *info,
>>   * svc_rdma_read_multiple_chunks - Construct RDMA Reads to pull data item Read chunks
>>   * @info: context for RDMA Reads
>>   *
>> - * The chunk data lands in head->rc_arg as a series of contiguous pages,
>> + * The chunk data lands in rqstp->rq_arg as a series of contiguous pages,
>>   * like an incoming TCP call.
>>   *
>>   * Return values:
>> @@ -827,8 +813,8 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>>  {
>>  	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>>  	const struct svc_rdma_pcl *pcl = &head->rc_read_pcl;
>> +	struct xdr_buf *buf = &info->ri_rqst->rq_arg;
>>  	struct svc_rdma_chunk *chunk, *next;
>> -	struct xdr_buf *buf = &head->rc_arg;
>>  	unsigned int start, length;
>>  	int ret;
>>
>> @@ -864,9 +850,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>>  	buf->len += info->ri_totalbytes;
>>  	buf->buflen += info->ri_totalbytes;
>>
>> -	head->rc_hdr_count = 1;
>> -	buf->head[0].iov_base = page_address(head->rc_pages[0]);
>> +	buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
>>  	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
>> +	buf->pages = &info->ri_rqst->rq_pages[1];
>>  	buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
>>  	return 0;
>>  }
>> @@ -875,9 +861,9 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>>   * svc_rdma_read_data_item - Construct RDMA Reads to pull data item Read chunks
>>   * @info: context for RDMA Reads
>>   *
>> - * The chunk data lands in the page list of head->rc_arg.pages.
>> + * The chunk data lands in the page list of rqstp->rq_arg.pages.
>>   *
>> - * Currently NFSD does not look at the head->rc_arg.tail[0] iovec.
>> + * Currently NFSD does not look at the rqstp->rq_arg.tail[0] kvec.
>>   * Therefore, XDR round-up of the Read chunk and trailing
>>   * inline content must both be added at the end of the pagelist.
>>   *
>> @@ -891,7 +877,7 @@ static noinline int svc_rdma_read_multiple_chunks(struct svc_rdma_read_info *inf
>>  static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
>>  {
>>  	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>> -	struct xdr_buf *buf = &head->rc_arg;
>> +	struct xdr_buf *buf = &info->ri_rqst->rq_arg;
>>  	struct svc_rdma_chunk *chunk;
>>  	unsigned int length;
>>  	int ret;
>> @@ -901,8 +887,6 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
>>  	if (ret < 0)
>>  		goto out;
>>
>> -	head->rc_hdr_count = 0;
>> -
>>  	/* Split the Receive buffer between the head and tail
>>  	 * buffers at Read chunk's position.
>>  	 * XDR roundup of the
>>  	 * chunk is not included in either the pagelist or in
>> @@ -921,7 +905,8 @@ static int svc_rdma_read_data_item(struct svc_rdma_read_info *info)
>>  	 * Currently these chunks always start at page offset 0,
>>  	 * thus the rounded-up length never crosses a page boundary.
>>  	 */
>> -	length = XDR_QUADLEN(info->ri_totalbytes) << 2;
>> +	buf->pages = &info->ri_rqst->rq_pages[0];
>> +	length = xdr_align_size(chunk->ch_length);
>>  	buf->page_len = length;
>>  	buf->len += length;
>>  	buf->buflen += length;
>> @@ -1033,8 +1018,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
>>   * @info: context for RDMA Reads
>>   *
>>   * The start of the data lands in the first page just after the
>> - * Transport header, and the rest lands in the page list of
>> - * head->rc_arg.pages.
>> + * Transport header, and the rest lands in rqstp->rq_arg.pages.
>>   *
>>   * Assumptions:
>>   *	- A PZRC is never sent in an RDMA_MSG message, though it's
>> @@ -1049,8 +1033,7 @@ static int svc_rdma_read_call_chunk(struct svc_rdma_read_info *info)
>>   */
>>  static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
>>  {
>> -	struct svc_rdma_recv_ctxt *head = info->ri_readctxt;
>> -	struct xdr_buf *buf = &head->rc_arg;
>> +	struct xdr_buf *buf = &info->ri_rqst->rq_arg;
>>  	int ret;
>>
>>  	ret = svc_rdma_read_call_chunk(info);
>> @@ -1060,35 +1043,15 @@ static noinline int svc_rdma_read_special(struct svc_rdma_read_info *info)
>>  	buf->len += info->ri_totalbytes;
>>  	buf->buflen += info->ri_totalbytes;
>>
>> -	head->rc_hdr_count = 1;
>> -	buf->head[0].iov_base = page_address(head->rc_pages[0]);
>> +	buf->head[0].iov_base = page_address(info->ri_rqst->rq_pages[0]);
>>  	buf->head[0].iov_len = min_t(size_t, PAGE_SIZE, info->ri_totalbytes);
>> +	buf->pages = &info->ri_rqst->rq_pages[1];
>>  	buf->page_len = info->ri_totalbytes - buf->head[0].iov_len;
>>
>>  out:
>>  	return ret;
>>  }
>>
>> -/* Pages under I/O have been copied to head->rc_pages. Ensure they
>> - * are not released by svc_xprt_release() until the I/O is complete.
>> - *
>> - * This has to be done after all Read WRs are constructed to properly
>> - * handle a page that is part of I/O on behalf of two different RDMA
>> - * segments.
>> - *
>> - * Do this only if I/O has been posted. Otherwise, we do indeed want
>> - * svc_xprt_release() to clean things up properly.
>> - */
>> -static void svc_rdma_save_io_pages(struct svc_rqst *rqstp,
>> -				   const unsigned int start,
>> -				   const unsigned int num_pages)
>> -{
>> -	unsigned int i;
>> -
>> -	for (i = start; i < num_pages + start; i++)
>> -		rqstp->rq_pages[i] = NULL;
>> -}
>> -
>>  /**
>>   * svc_rdma_process_read_list - Pull list of Read chunks from the client
>>   * @rdma: controlling RDMA transport
>> @@ -1153,11 +1116,22 @@ int svc_rdma_process_read_list(struct svcxprt_rdma *rdma,
>>  		goto out_err;
>>
>>  	trace_svcrdma_post_read_chunk(&cc->cc_cid, cc->cc_sqecount);
>> +	init_completion(&cc->cc_done);
>>  	ret = svc_rdma_post_chunk_ctxt(cc);
>>  	if (ret < 0)
>>  		goto out_err;
>> -	svc_rdma_save_io_pages(rqstp, 0, head->rc_page_count);
>> -	return 1;
>> +
>> +	ret = 1;
>> +	wait_for_completion(&cc->cc_done);
>> +	if (cc->cc_status != IB_WC_SUCCESS)
>> +		ret = -EIO;
>> +
>> +	/* rq_respages starts after the last arg page */
>> +	rqstp->rq_respages = &rqstp->rq_pages[head->rc_page_count];
>> +	rqstp->rq_next_page = rqstp->rq_respages + 1;
>> +
>> +	/* Ensure svc_rdma_recv_ctxt_put() does not try to release pages */
>> +	head->rc_page_count = 0;
>>
>>  out_err:
>>  	svc_rdma_read_info_free(info);

--
Chuck Lever