Re: [PATCH v3] nfsd: Fix NFSv4 READ on RDMA when using readv

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




> On Feb 4, 2020, at 12:21 PM, J. Bruce Fields <bfields@xxxxxxxxxxxx> wrote:
> 
> On Sat, Feb 01, 2020 at 03:05:19PM -0500, Chuck Lever wrote:
>> svcrdma expects that the payload falls precisely into the xdr_buf
>> page vector. This does not seem to be the case for
>> nfsd4_encode_readv().
>> 
>> This code is called only when fops->splice_read is missing or when
>> RQ_SPLICE_OK is clear, so it's not a noticeable problem in many
>> common cases.
>> 
>> Add new transport method: ->xpo_read_payload so that when a READ
>> payload does not fit exactly in rq_res's page vector, the XDR
>> encoder can inform the RPC transport exactly where that payload is,
>> without the payload's XDR pad.
>> 
>> That way, when a Write chunk is present, the transport knows what
>> byte range in the Reply message is supposed to be matched with the
>> chunk.
>> 
>> Note that the Linux NFS server implementation of NFS/RDMA can
>> currently handle only one Write chunk per RPC-over-RDMA message.
>> This simplifies the implementation of this fix.
>> 
>> Fixes: b04209806384 ("nfsd4: allow exotic read compounds")
>> Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=198053
>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> ---
>> 
>> Hi Bruce-
>> 
>> This fix seems closer to being merge-able. I've tested it by wiring
>> the NFSv4 READ Reply encoder to always use readv. I've run the git
>> regression suite with NFSv3, NFSv4.0, and NFSv4.1 using all three
>> flavors of Kerberos and sec=sys, all on NFS/RDMA.
>> 
>> Aside from a new no-op function call, the TCP path is not perturbed.
>> 
>> 
>> Changes since RFC2:
>> - Take Trond's suggestion to use xdr->buf->len
>> - Squash fix down to a single patch
> 
> I liked them better split out.
> 
> This seems fine to me, though.
> 
> Could you ping me again in another week, after the merge window?
> 
>> @@ -3521,17 +3521,14 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
>> 	u32 zzz = 0;
>> 	int pad;
>> 
>> +	/* Ensure xdr_reserve_space skips past xdr->buf->head */
> 
> Could the comment explain why we're doing this?  (Maybe take some
> language from the changelog.)

The new comment will also explain why the patches are combined.
I'll send a v4 after the merge window closes.


> --b.
> 
>> +	if (xdr->iov == xdr->buf->head) {
>> +		xdr->iov = NULL;
>> +		xdr->end = xdr->p;
>> +	}
>> +
>> 	len = maxcount;
>> 	v = 0;
>> -
>> -	thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
>> -	p = xdr_reserve_space(xdr, (thislen+3)&~3);
>> -	WARN_ON_ONCE(!p);
>> -	resp->rqstp->rq_vec[v].iov_base = p;
>> -	resp->rqstp->rq_vec[v].iov_len = thislen;
>> -	v++;
>> -	len -= thislen;
>> -
>> 	while (len) {
>> 		thislen = min_t(long, len, PAGE_SIZE);
>> 		p = xdr_reserve_space(xdr, (thislen+3)&~3);
>> @@ -3550,6 +3547,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
>> 	read->rd_length = maxcount;
>> 	if (nfserr)
>> 		return nfserr;
>> +	svc_mark_read_payload(resp->rqstp, starting_len + 8, read->rd_length);
>> 	xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
>> 
>> 	tmp = htonl(eof);
>> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
>> index 1afe38eb33f7..e0610e0e34f7 100644
>> --- a/include/linux/sunrpc/svc.h
>> +++ b/include/linux/sunrpc/svc.h
>> @@ -517,6 +517,9 @@ int		   svc_register(const struct svc_serv *, struct net *, const int,
>> void		   svc_reserve(struct svc_rqst *rqstp, int space);
>> struct svc_pool *  svc_pool_for_cpu(struct svc_serv *serv, int cpu);
>> char *		   svc_print_addr(struct svc_rqst *, char *, size_t);
>> +void		   svc_mark_read_payload(struct svc_rqst *rqstp,
>> +					 unsigned int pos,
>> +					 unsigned long length);
>> unsigned int	   svc_fill_write_vector(struct svc_rqst *rqstp,
>> 					 struct page **pages,
>> 					 struct kvec *first, size_t total);
>> diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
>> index 40f65888dd38..4fd3b8a16dfd 100644
>> --- a/include/linux/sunrpc/svc_rdma.h
>> +++ b/include/linux/sunrpc/svc_rdma.h
>> @@ -137,6 +137,7 @@ struct svc_rdma_recv_ctxt {
>> 	unsigned int		rc_page_count;
>> 	unsigned int		rc_hdr_count;
>> 	u32			rc_inv_rkey;
>> +	unsigned long		rc_read_payload_length;
>> 	struct page		*rc_pages[RPCSVC_MAXPAGES];
>> };
>> 
>> @@ -170,7 +171,8 @@ extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma,
>> 				    struct svc_rqst *rqstp,
>> 				    struct svc_rdma_recv_ctxt *head, __be32 *p);
>> extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma,
>> -				     __be32 *wr_ch, struct xdr_buf *xdr);
>> +				     __be32 *wr_ch, struct xdr_buf *xdr,
>> +				     struct svc_rdma_recv_ctxt *rctxt);
>> extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma,
>> 				     __be32 *rp_ch, bool writelist,
>> 				     struct xdr_buf *xdr);
>> @@ -189,6 +191,8 @@ extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>> 				  struct svc_rdma_send_ctxt *ctxt,
>> 				  struct xdr_buf *xdr, __be32 *wr_lst);
>> extern int svc_rdma_sendto(struct svc_rqst *);
>> +extern void svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int pos,
>> +				  unsigned long length);
>> 
>> /* svc_rdma_transport.c */
>> extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *);
>> diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
>> index ea6f46be9cb7..d272acf7531f 100644
>> --- a/include/linux/sunrpc/svc_xprt.h
>> +++ b/include/linux/sunrpc/svc_xprt.h
>> @@ -21,6 +21,8 @@ struct svc_xprt_ops {
>> 	int		(*xpo_has_wspace)(struct svc_xprt *);
>> 	int		(*xpo_recvfrom)(struct svc_rqst *);
>> 	int		(*xpo_sendto)(struct svc_rqst *);
>> +	void		(*xpo_read_payload)(struct svc_rqst *, unsigned int,
>> +					    unsigned long);
>> 	void		(*xpo_release_rqst)(struct svc_rqst *);
>> 	void		(*xpo_detach)(struct svc_xprt *);
>> 	void		(*xpo_free)(struct svc_xprt *);
>> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
>> index 187dd4e73d64..d4a0134f1ca1 100644
>> --- a/net/sunrpc/svc.c
>> +++ b/net/sunrpc/svc.c
>> @@ -1637,6 +1637,20 @@ u32 svc_max_payload(const struct svc_rqst *rqstp)
>> EXPORT_SYMBOL_GPL(svc_max_payload);
>> 
>> /**
>> + * svc_mark_read_payload - mark a range of bytes as a READ payload
>> + * @rqstp: svc_rqst to operate on
>> + * @pos: payload's byte offset in the RPC Reply message
>> + * @length: size of payload, in bytes
>> + *
>> + */
>> +void svc_mark_read_payload(struct svc_rqst *rqstp, unsigned int pos,
>> +			   unsigned long length)
>> +{
>> +	rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, pos, length);
>> +}
>> +EXPORT_SYMBOL_GPL(svc_mark_read_payload);
>> +
>> +/**
>>  * svc_fill_write_vector - Construct data argument for VFS write call
>>  * @rqstp: svc_rqst to operate on
>>  * @pages: list of pages containing data payload
>> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
>> index 2934dd711715..fe045b3e7e08 100644
>> --- a/net/sunrpc/svcsock.c
>> +++ b/net/sunrpc/svcsock.c
>> @@ -279,6 +279,11 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr)
>> 	return len;
>> }
>> 
>> +static void svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int pos,
>> +				  unsigned long length)
>> +{
>> +}
>> +
>> /*
>>  * Report socket names for nfsdfs
>>  */
>> @@ -653,6 +658,7 @@ static struct svc_xprt *svc_udp_create(struct svc_serv *serv,
>> 	.xpo_create = svc_udp_create,
>> 	.xpo_recvfrom = svc_udp_recvfrom,
>> 	.xpo_sendto = svc_udp_sendto,
>> +	.xpo_read_payload = svc_sock_read_payload,
>> 	.xpo_release_rqst = svc_release_udp_skb,
>> 	.xpo_detach = svc_sock_detach,
>> 	.xpo_free = svc_sock_free,
>> @@ -1171,6 +1177,7 @@ static struct svc_xprt *svc_tcp_create(struct svc_serv *serv,
>> 	.xpo_create = svc_tcp_create,
>> 	.xpo_recvfrom = svc_tcp_recvfrom,
>> 	.xpo_sendto = svc_tcp_sendto,
>> +	.xpo_read_payload = svc_sock_read_payload,
>> 	.xpo_release_rqst = svc_release_skb,
>> 	.xpo_detach = svc_tcp_sock_detach,
>> 	.xpo_free = svc_sock_free,
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> index 393af8624f53..90f0a9ce7521 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
>> @@ -191,6 +191,7 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma)
>> 
>> out:
>> 	ctxt->rc_page_count = 0;
>> +	ctxt->rc_read_payload_length = 0;
>> 	return ctxt;
>> 
>> out_empty:
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c
>> index 467d40a1dffa..2cef19592565 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c
>> @@ -484,18 +484,18 @@ static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
>> 				     vec->iov_len);
>> }
>> 
>> -/* Send an xdr_buf's page list by itself. A Write chunk is
>> - * just the page list. a Reply chunk is the head, page list,
>> - * and tail. This function is shared between the two types
>> - * of chunk.
>> +/* Send an xdr_buf's page list by itself. A Write chunk is just
>> + * the page list. A Reply chunk is @xdr's head, page list, and
>> + * tail. This function is shared between the two types of chunk.
>>  */
>> static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
>> -				      struct xdr_buf *xdr)
>> +				      struct xdr_buf *xdr,
>> +				      unsigned int length)
>> {
>> 	info->wi_xdr = xdr;
>> 	info->wi_next_off = 0;
>> 	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
>> -				     xdr->page_len);
>> +				     length);
>> }
>> 
>> /**
>> @@ -503,6 +503,7 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
>>  * @rdma: controlling RDMA transport
>>  * @wr_ch: Write chunk provided by client
>>  * @xdr: xdr_buf containing the data payload
>> + * @rctxt: location in @xdr of the data payload
>>  *
>>  * Returns a non-negative number of bytes the chunk consumed, or
>>  *	%-E2BIG if the payload was larger than the Write chunk,
>> @@ -512,9 +513,11 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
>>  *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
>>  */
>> int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
>> -			      struct xdr_buf *xdr)
>> +			      struct xdr_buf *xdr,
>> +			      struct svc_rdma_recv_ctxt *rctxt)
>> {
>> 	struct svc_rdma_write_info *info;
>> +	unsigned int length;
>> 	int ret;
>> 
>> 	if (!xdr->page_len)
>> @@ -524,7 +527,12 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
>> 	if (!info)
>> 		return -ENOMEM;
>> 
>> -	ret = svc_rdma_send_xdr_pagelist(info, xdr);
>> +	/* xdr->page_len is the chunk length, unless the upper layer
>> +	 * has explicitly provided a payload length.
>> +	 */
>> +	length = rctxt->rc_read_payload_length ?
>> +			rctxt->rc_read_payload_length : xdr->page_len;
>> +	ret = svc_rdma_send_xdr_pagelist(info, xdr, length);
>> 	if (ret < 0)
>> 		goto out_err;
>> 
>> @@ -533,7 +541,7 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
>> 		goto out_err;
>> 
>> 	trace_svcrdma_encode_write(xdr->page_len);
>> -	return xdr->page_len;
>> +	return length;
>> 
>> out_err:
>> 	svc_rdma_write_info_free(info);
>> @@ -573,7 +581,8 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
>> 	 * client did not provide Write chunks.
>> 	 */
>> 	if (!writelist && xdr->page_len) {
>> -		ret = svc_rdma_send_xdr_pagelist(info, xdr);
>> +		ret = svc_rdma_send_xdr_pagelist(info, xdr,
>> +						 xdr->page_len);
>> 		if (ret < 0)
>> 			goto out_err;
>> 		consumed += xdr->page_len;
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
>> index a9ba040c70da..6f9b49b6768f 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
>> @@ -871,7 +871,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
>> 
>> 	if (wr_lst) {
>> 		/* XXX: Presume the client sent only one Write chunk */
>> -		ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr);
>> +		ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, rctxt);
>> 		if (ret < 0)
>> 			goto err2;
>> 		svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret);
>> @@ -913,3 +913,27 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
>> 	ret = -ENOTCONN;
>> 	goto out;
>> }
>> +
>> +/**
>> + * svc_rdma_read_payload - mark where a READ payload sites
>> + * @rqstp: svc_rqst to operate on
>> + * @pos: payload's byte offset in the RPC Reply message
>> + * @length: size of payload, in bytes
>> + *
>> + * For the moment, just record the xdr_buf location of the READ
>> + * payload. svc_rdma_sendto will use that location later when
>> + * we actually send the payload.
>> + */
>> +void svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int pos,
>> +			   unsigned long length)
>> +{
>> +	struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt;
>> +
>> +	/* XXX: Just one READ payload slot for now, since our
>> +	 * transport implementation currently supports only one
>> +	 * Write chunk. We assume the one READ payload always
>> +	 * starts at the head<->pages boundary.
>> +	 */
>> +	WARN_ON(rqstp->rq_res.head[0].iov_len != pos);
>> +	ctxt->rc_read_payload_length = length;
>> +}
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> index 145a3615c319..f6aad2798063 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> @@ -82,6 +82,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
>> 	.xpo_create = svc_rdma_create,
>> 	.xpo_recvfrom = svc_rdma_recvfrom,
>> 	.xpo_sendto = svc_rdma_sendto,
>> +	.xpo_read_payload = svc_rdma_read_payload,
>> 	.xpo_release_rqst = svc_rdma_release_rqst,
>> 	.xpo_detach = svc_rdma_detach,
>> 	.xpo_free = svc_rdma_free,

--
Chuck Lever







[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux