> On Feb 4, 2020, at 12:21 PM, J. Bruce Fields <bfields@xxxxxxxxxxxx> wrote: > > On Sat, Feb 01, 2020 at 03:05:19PM -0500, Chuck Lever wrote: >> svcrdma expects that the payload falls precisely into the xdr_buf >> page vector. This does not seem to be the case for >> nfsd4_encode_readv(). >> >> This code is called only when fops->splice_read is missing or when >> RQ_SPLICE_OK is clear, so it's not a noticeable problem in many >> common cases. >> >> Add new transport method: ->xpo_read_payload so that when a READ >> payload does not fit exactly in rq_res's page vector, the XDR >> encoder can inform the RPC transport exactly where that payload is, >> without the payload's XDR pad. >> >> That way, when a Write chunk is present, the transport knows what >> byte range in the Reply message is supposed to be matched with the >> chunk. >> >> Note that the Linux NFS server implementation of NFS/RDMA can >> currently handle only one Write chunk per RPC-over-RDMA message. >> This simplifies the implementation of this fix. >> >> Fixes: b04209806384 ("nfsd4: allow exotic read compounds") >> Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=198053 >> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> >> --- >> >> Hi Bruce- >> >> This fix seems closer to being merge-able. I've tested it by wiring >> the NFSv4 READ Reply encoder to always use readv. I've run the git >> regression suite with NFSv3, NFSv4.0, and NFSv4.1 using all three >> flavors of Kerberos and sec=sys, all on NFS/RDMA. >> >> Aside from a new no-op function call, the TCP path is not perturbed. >> >> >> Changes since RFC2: >> - Take Trond's suggestion to use xdr->buf->len >> - Squash fix down to a single patch > > I liked them better split out. > > This seems fine to me, though. > > Could you ping me again in another week, after the merge window? > >> @@ -3521,17 +3521,14 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, >> u32 zzz = 0; >> int pad; >> >> + /* Ensure xdr_reserve_space skips past xdr->buf->head */ > > Could the comment explain why we're doing this? (Maybe take some > language from the changelog.) The new comment will also explain why the patches are combined. I'll send a v4 after the merge window closes. > --b. > >> + if (xdr->iov == xdr->buf->head) { >> + xdr->iov = NULL; >> + xdr->end = xdr->p; >> + } >> + >> len = maxcount; >> v = 0; >> - >> - thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p)); >> - p = xdr_reserve_space(xdr, (thislen+3)&~3); >> - WARN_ON_ONCE(!p); >> - resp->rqstp->rq_vec[v].iov_base = p; >> - resp->rqstp->rq_vec[v].iov_len = thislen; >> - v++; >> - len -= thislen; >> - >> while (len) { >> thislen = min_t(long, len, PAGE_SIZE); >> p = xdr_reserve_space(xdr, (thislen+3)&~3); >> @@ -3550,6 +3547,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, >> read->rd_length = maxcount; >> if (nfserr) >> return nfserr; >> + svc_mark_read_payload(resp->rqstp, starting_len + 8, read->rd_length); >> xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); >> >> tmp = htonl(eof); >> diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h >> index 1afe38eb33f7..e0610e0e34f7 100644 >> --- a/include/linux/sunrpc/svc.h >> +++ b/include/linux/sunrpc/svc.h >> @@ -517,6 +517,9 @@ int svc_register(const struct svc_serv *, struct net *, const int, >> void svc_reserve(struct svc_rqst *rqstp, int space); >> struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); >> char * svc_print_addr(struct svc_rqst *, char *, size_t); >> +void svc_mark_read_payload(struct svc_rqst *rqstp, >> + unsigned int pos, >> + unsigned long length); >> unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, >> struct page **pages, >> struct kvec *first, size_t total); >> diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h >> index 40f65888dd38..4fd3b8a16dfd 100644 >> --- a/include/linux/sunrpc/svc_rdma.h >> +++ b/include/linux/sunrpc/svc_rdma.h >> @@ -137,6 +137,7 @@ struct svc_rdma_recv_ctxt { >> unsigned int rc_page_count; >> unsigned int rc_hdr_count; >> u32 rc_inv_rkey; >> + unsigned long rc_read_payload_length; >> struct page *rc_pages[RPCSVC_MAXPAGES]; >> }; >> >> @@ -170,7 +171,8 @@ extern int svc_rdma_recv_read_chunk(struct svcxprt_rdma *rdma, >> struct svc_rqst *rqstp, >> struct svc_rdma_recv_ctxt *head, __be32 *p); >> extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, >> - __be32 *wr_ch, struct xdr_buf *xdr); >> + __be32 *wr_ch, struct xdr_buf *xdr, >> + struct svc_rdma_recv_ctxt *rctxt); >> extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, >> __be32 *rp_ch, bool writelist, >> struct xdr_buf *xdr); >> @@ -189,6 +191,8 @@ extern int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, >> struct svc_rdma_send_ctxt *ctxt, >> struct xdr_buf *xdr, __be32 *wr_lst); >> extern int svc_rdma_sendto(struct svc_rqst *); >> +extern void svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int pos, >> + unsigned long length); >> >> /* svc_rdma_transport.c */ >> extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); >> diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h >> index ea6f46be9cb7..d272acf7531f 100644 >> --- a/include/linux/sunrpc/svc_xprt.h >> +++ b/include/linux/sunrpc/svc_xprt.h >> @@ -21,6 +21,8 @@ struct svc_xprt_ops { >> int (*xpo_has_wspace)(struct svc_xprt *); >> int (*xpo_recvfrom)(struct svc_rqst *); >> int (*xpo_sendto)(struct svc_rqst *); >> + void (*xpo_read_payload)(struct svc_rqst *, unsigned int, >> + unsigned long); >> void (*xpo_release_rqst)(struct svc_rqst *); >> void (*xpo_detach)(struct svc_xprt *); >> void (*xpo_free)(struct svc_xprt *); >> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c >> index 187dd4e73d64..d4a0134f1ca1 100644 >> --- a/net/sunrpc/svc.c >> +++ b/net/sunrpc/svc.c >> @@ -1637,6 +1637,20 @@ u32 svc_max_payload(const struct svc_rqst *rqstp) >> EXPORT_SYMBOL_GPL(svc_max_payload); >> >> /** >> + * svc_mark_read_payload - mark a range of bytes as a READ payload >> + * @rqstp: svc_rqst to operate on >> + * @pos: payload's byte offset in the RPC Reply message >> + * @length: size of payload, in bytes >> + * >> + */ >> +void svc_mark_read_payload(struct svc_rqst *rqstp, unsigned int pos, >> + unsigned long length) >> +{ >> + rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, pos, length); >> +} >> +EXPORT_SYMBOL_GPL(svc_mark_read_payload); >> + >> +/** >> * svc_fill_write_vector - Construct data argument for VFS write call >> * @rqstp: svc_rqst to operate on >> * @pages: list of pages containing data payload >> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c >> index 2934dd711715..fe045b3e7e08 100644 >> --- a/net/sunrpc/svcsock.c >> +++ b/net/sunrpc/svcsock.c >> @@ -279,6 +279,11 @@ static int svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) >> return len; >> } >> >> +static void svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int pos, >> + unsigned long length) >> +{ >> +} >> + >> /* >> * Report socket names for nfsdfs >> */ >> @@ -653,6 +658,7 @@ static struct svc_xprt *svc_udp_create(struct svc_serv *serv, >> .xpo_create = svc_udp_create, >> .xpo_recvfrom = svc_udp_recvfrom, >> .xpo_sendto = svc_udp_sendto, >> + .xpo_read_payload = svc_sock_read_payload, >> .xpo_release_rqst = svc_release_udp_skb, >> .xpo_detach = svc_sock_detach, >> .xpo_free = svc_sock_free, >> @@ -1171,6 +1177,7 @@ static struct svc_xprt *svc_tcp_create(struct svc_serv *serv, >> .xpo_create = svc_tcp_create, >> .xpo_recvfrom = svc_tcp_recvfrom, >> .xpo_sendto = svc_tcp_sendto, >> + .xpo_read_payload = svc_sock_read_payload, >> .xpo_release_rqst = svc_release_skb, >> .xpo_detach = svc_tcp_sock_detach, >> .xpo_free = svc_sock_free, >> diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c >> index 393af8624f53..90f0a9ce7521 100644 >> --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c >> +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c >> @@ -191,6 +191,7 @@ void svc_rdma_recv_ctxts_destroy(struct svcxprt_rdma *rdma) >> >> out: >> ctxt->rc_page_count = 0; >> + ctxt->rc_read_payload_length = 0; >> return ctxt; >> >> out_empty: >> diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c >> index 467d40a1dffa..2cef19592565 100644 >> --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c >> +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c >> @@ -484,18 +484,18 @@ static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info, >> vec->iov_len); >> } >> >> -/* Send an xdr_buf's page list by itself. A Write chunk is >> - * just the page list. a Reply chunk is the head, page list, >> - * and tail. This function is shared between the two types >> - * of chunk. >> +/* Send an xdr_buf's page list by itself. A Write chunk is just >> + * the page list. A Reply chunk is @xdr's head, page list, and >> + * tail. This function is shared between the two types of chunk. >> */ >> static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, >> - struct xdr_buf *xdr) >> + struct xdr_buf *xdr, >> + unsigned int length) >> { >> info->wi_xdr = xdr; >> info->wi_next_off = 0; >> return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, >> - xdr->page_len); >> + length); >> } >> >> /** >> @@ -503,6 +503,7 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, >> * @rdma: controlling RDMA transport >> * @wr_ch: Write chunk provided by client >> * @xdr: xdr_buf containing the data payload >> + * @rctxt: location in @xdr of the data payload >> * >> * Returns a non-negative number of bytes the chunk consumed, or >> * %-E2BIG if the payload was larger than the Write chunk, >> @@ -512,9 +513,11 @@ static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info, >> * %-EIO if rdma_rw initialization failed (DMA mapping, etc). >> */ >> int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, >> - struct xdr_buf *xdr) >> + struct xdr_buf *xdr, >> + struct svc_rdma_recv_ctxt *rctxt) >> { >> struct svc_rdma_write_info *info; >> + unsigned int length; >> int ret; >> >> if (!xdr->page_len) >> @@ -524,7 +527,12 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, >> if (!info) >> return -ENOMEM; >> >> - ret = svc_rdma_send_xdr_pagelist(info, xdr); >> + /* xdr->page_len is the chunk length, unless the upper layer >> + * has explicitly provided a payload length. >> + */ >> + length = rctxt->rc_read_payload_length ? >> + rctxt->rc_read_payload_length : xdr->page_len; >> + ret = svc_rdma_send_xdr_pagelist(info, xdr, length); >> if (ret < 0) >> goto out_err; >> >> @@ -533,7 +541,7 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, >> goto out_err; >> >> trace_svcrdma_encode_write(xdr->page_len); >> - return xdr->page_len; >> + return length; >> >> out_err: >> svc_rdma_write_info_free(info); >> @@ -573,7 +581,8 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch, >> * client did not provide Write chunks. >> */ >> if (!writelist && xdr->page_len) { >> - ret = svc_rdma_send_xdr_pagelist(info, xdr); >> + ret = svc_rdma_send_xdr_pagelist(info, xdr, >> + xdr->page_len); >> if (ret < 0) >> goto out_err; >> consumed += xdr->page_len; >> diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c >> index a9ba040c70da..6f9b49b6768f 100644 >> --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c >> +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c >> @@ -871,7 +871,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) >> >> if (wr_lst) { >> /* XXX: Presume the client sent only one Write chunk */ >> - ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); >> + ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, rctxt); >> if (ret < 0) >> goto err2; >> svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); >> @@ -913,3 +913,27 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) >> ret = -ENOTCONN; >> goto out; >> } >> + >> +/** >> + * svc_rdma_read_payload - mark where a READ payload sites >> + * @rqstp: svc_rqst to operate on >> + * @pos: payload's byte offset in the RPC Reply message >> + * @length: size of payload, in bytes >> + * >> + * For the moment, just record the xdr_buf location of the READ >> + * payload. svc_rdma_sendto will use that location later when >> + * we actually send the payload. >> + */ >> +void svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int pos, >> + unsigned long length) >> +{ >> + struct svc_rdma_recv_ctxt *ctxt = rqstp->rq_xprt_ctxt; >> + >> + /* XXX: Just one READ payload slot for now, since our >> + * transport implementation currently supports only one >> + * Write chunk. We assume the one READ payload always >> + * starts at the head<->pages boundary. >> + */ >> + WARN_ON(rqstp->rq_res.head[0].iov_len != pos); >> + ctxt->rc_read_payload_length = length; >> +} >> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c >> index 145a3615c319..f6aad2798063 100644 >> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c >> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c >> @@ -82,6 +82,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, >> .xpo_create = svc_rdma_create, >> .xpo_recvfrom = svc_rdma_recvfrom, >> .xpo_sendto = svc_rdma_sendto, >> + .xpo_read_payload = svc_rdma_read_payload, >> .xpo_release_rqst = svc_rdma_release_rqst, >> .xpo_detach = svc_rdma_detach, >> .xpo_free = svc_rdma_free, -- Chuck Lever