> On Jan 26, 2019, at 5:44 PM, J. Bruce Fields <bfields@xxxxxxxxxxxx> wrote:
>
>> On Fri, Jan 25, 2019 at 04:54:54PM -0500, Chuck Lever wrote:
>> Two and a half years ago, the client was changed to use gathered
>> Send for larger inline messages, in commit 655fec6987b ("xprtrdma:
>> Use gathered Send for large inline messages"). Several fixes were
>> required because there are a few in-kernel device drivers whose
>> max_sge is 3, and these were broken by the change.
>>
>> Apparently my memory is going, because some time later, I submitted
>> commit 25fd86eca11c ("svcrdma: Don't overrun the SGE array in
>> svc_rdma_send_ctxt"), and after that, commit f3c1fd0ee294 ("svcrdma:
>> Reduce max_send_sges"). These too incorrectly assumed in-kernel
>> device drivers would have more than a few Send SGEs available.
>>
>> The fix for the server side is not the same. This is because the
>> fundamental problem on the server is that, whether or not the client
>> has provisioned a chunk for the RPC reply, the server must squeeze
>> even the most complex RPC replies into a single RDMA Send. Failing
>> in the send path because of Send SGE exhaustion should never be an
>> option.
>>
>> Therefore, instead of failing when the send path runs out of SGEs,
>> switch to using a bounce buffer mechanism to handle RPC replies that
>> are too complex for the device to send directly. That allows us to
>> remove the max_sge check to enable drivers with small max_sge to
>> work again.
>>
>> Reported-by: Don Dutile <ddutile@xxxxxxxxxx>
>> Fixes: 25fd86eca11c ("svcrdma: Don't overrun the SGE array in ...")
>
> Thanks! Do you think this is suitable for 5.0 and stable, or should I
> save it up for 5.1?

Don would like to see it in 5.0 and stable.

> --b.
>
>> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
>> ---
>>  net/sunrpc/xprtrdma/svc_rdma_sendto.c    | 105 ++++++++++++++++++++++++++++--
>>  net/sunrpc/xprtrdma/svc_rdma_transport.c |   9 +--
>>  2 files changed, 102 insertions(+), 12 deletions(-)
>>
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
>> index 1aab305fbbb6..8235ca2159d1 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
>> @@ -531,6 +531,99 @@ void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
>>  				      DMA_TO_DEVICE);
>>  }
>>
>> +/* If the xdr_buf has more elements than the device can
>> + * transmit in a single RDMA Send, then the reply will
>> + * have to be copied into a bounce buffer.
>> + */
>> +static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
>> +				    struct xdr_buf *xdr,
>> +				    __be32 *wr_lst)
>> +{
>> +	int elements;
>> +
>> +	/* xdr->head */
>> +	elements = 1;
>> +
>> +	/* xdr->pages */
>> +	if (!wr_lst) {
>> +		unsigned int remaining;
>> +		unsigned long pageoff;
>> +
>> +		pageoff = xdr->page_base & ~PAGE_MASK;
>> +		remaining = xdr->page_len;
>> +		while (remaining) {
>> +			++elements;
>> +			remaining -= min_t(u32, PAGE_SIZE - pageoff,
>> +					   remaining);
>> +			pageoff = 0;
>> +		}
>> +	}
>> +
>> +	/* xdr->tail */
>> +	if (xdr->tail[0].iov_len)
>> +		++elements;
>> +
>> +	/* assume 1 SGE is needed for the transport header */
>> +	return elements >= rdma->sc_max_send_sges;
>> +}
>> +
>> +/* The device is not capable of sending the reply directly.
>> + * Assemble the elements of @xdr into the transport header
>> + * buffer.
>> + */
>> +static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
>> +				      struct svc_rdma_send_ctxt *ctxt,
>> +				      struct xdr_buf *xdr, __be32 *wr_lst)
>> +{
>> +	unsigned char *dst, *tailbase;
>> +	unsigned int taillen;
>> +
>> +	dst = ctxt->sc_xprt_buf;
>> +	dst += ctxt->sc_sges[0].length;
>> +
>> +	memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
>> +	dst += xdr->head[0].iov_len;
>> +
>> +	tailbase = xdr->tail[0].iov_base;
>> +	taillen = xdr->tail[0].iov_len;
>> +	if (wr_lst) {
>> +		u32 xdrpad;
>> +
>> +		xdrpad = xdr_padsize(xdr->page_len);
>> +		if (taillen && xdrpad) {
>> +			tailbase += xdrpad;
>> +			taillen -= xdrpad;
>> +		}
>> +	} else {
>> +		unsigned int len, remaining;
>> +		unsigned long pageoff;
>> +		struct page **ppages;
>> +
>> +		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
>> +		pageoff = xdr->page_base & ~PAGE_MASK;
>> +		remaining = xdr->page_len;
>> +		while (remaining) {
>> +			len = min_t(u32, PAGE_SIZE - pageoff, remaining);
>> +
>> +			memcpy(dst, page_address(*ppages++) + pageoff, len);
>> +			remaining -= len;
>> +			dst += len;
>> +			pageoff = 0;
>> +		}
>> +	}
>> +
>> +	if (taillen)
>> +		memcpy(dst, tailbase, taillen);
>> +
>> +	ctxt->sc_sges[0].length += xdr->len;
>> +	ib_dma_sync_single_for_device(rdma->sc_pd->device,
>> +				      ctxt->sc_sges[0].addr,
>> +				      ctxt->sc_sges[0].length,
>> +				      DMA_TO_DEVICE);
>> +
>> +	return 0;
>> +}
>> +
>>  /* svc_rdma_map_reply_msg - Map the buffer holding RPC message
>>   * @rdma: controlling transport
>>   * @ctxt: send_ctxt for the Send WR
>> @@ -553,8 +646,10 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>>  	u32 xdr_pad;
>>  	int ret;
>>
>> -	if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
>> -		return -EIO;
>> +	if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
>> +		return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
>> +
>> +	++ctxt->sc_cur_sge_no;
>>  	ret = svc_rdma_dma_map_buf(rdma, ctxt,
>>  				   xdr->head[0].iov_base,
>>  				   xdr->head[0].iov_len);
>> @@ -585,8 +680,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>>  	while (remaining) {
>>  		len = min_t(u32, PAGE_SIZE - page_off, remaining);
>>
>> -		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
>> -			return -EIO;
>> +		++ctxt->sc_cur_sge_no;
>>  		ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
>>  					    page_off, len);
>>  		if (ret < 0)
>> @@ -600,8 +694,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>>  	len = xdr->tail[0].iov_len;
>>  tail:
>>  	if (len) {
>> -		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
>> -			return -EIO;
>> +		++ctxt->sc_cur_sge_no;
>>  		ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
>>  		if (ret < 0)
>>  			return ret;
>> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> index 6f9f4654338c..027a3b07d329 100644
>> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
>> @@ -419,12 +419,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>>  	/* Transport header, head iovec, tail iovec */
>>  	newxprt->sc_max_send_sges = 3;
>>  	/* Add one SGE per page list entry */
>> -	newxprt->sc_max_send_sges += svcrdma_max_req_size / PAGE_SIZE;
>> -	if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) {
>> -		pr_err("svcrdma: too few Send SGEs available (%d needed)\n",
>> -		       newxprt->sc_max_send_sges);
>> -		goto errout;
>> -	}
>> +	newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1;
>> +	if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
>> +		newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
>>  	newxprt->sc_max_req_size = svcrdma_max_req_size;
>>  	newxprt->sc_max_requests = svcrdma_max_requests;
>>  	newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
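
For illustration, here is a standalone userspace sketch of the Send SGE
estimate that svc_rdma_pull_up_needed() makes above. It is not kernel
code; EXAMPLE_PAGE_SIZE, sge_estimate(), and the sample reply shape are
hypothetical values chosen for the example, not svcrdma defaults.

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096u

/* Count the xdr_buf elements a reply would consume: one for
 * xdr->head, one per page fragment when no Write list is present,
 * and one for xdr->tail. One more SGE is implicitly reserved for
 * the transport header, hence ">=" rather than ">" in main().
 */
static unsigned int sge_estimate(unsigned int page_base,
				 unsigned int page_len,
				 unsigned int tail_len,
				 int has_write_list)
{
	unsigned int elements = 1;	/* xdr->head */

	if (!has_write_list) {
		unsigned int pageoff = page_base % EXAMPLE_PAGE_SIZE;
		unsigned int remaining = page_len;

		while (remaining) {
			unsigned int len = EXAMPLE_PAGE_SIZE - pageoff;

			if (len > remaining)
				len = remaining;
			++elements;	/* one SGE per page fragment */
			remaining -= len;
			pageoff = 0;
		}
	}

	if (tail_len)
		++elements;		/* xdr->tail */

	return elements;
}

int main(void)
{
	/* A READDIR-like reply: 8200 bytes of page data plus a
	 * 4-byte XDR pad in the tail, and no Write list.
	 */
	unsigned int elements = sge_estimate(0, 8200, 4, 0);
	unsigned int max_send_sges = 3;	/* e.g. a max_sge = 3 device */

	printf("reply needs %u SGEs plus 1 for the transport header\n",
	       elements);
	if (elements >= max_send_sges)
		printf("-> pull the reply up into the header buffer\n");
	return 0;
}

Here elements comes to 5 (head, three page fragments, tail), so a
max_sge = 3 device takes the pull-up path instead of failing with -EIO
as it did before this patch.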
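
A second standalone sketch, under the same caveats, models the pull-up
copy itself the way svc_rdma_pull_up_reply_msg() linearizes head, page
fragments, and tail into one contiguous buffer; struct example_xdr and
pull_up() are hypothetical stand-ins for struct xdr_buf and the kernel
helper.

#include <stdio.h>
#include <string.h>

#define EXAMPLE_PAGE_SIZE 4096u

/* A simplified stand-in for struct xdr_buf. */
struct example_xdr {
	const char *head;
	unsigned int head_len;
	char **pages;
	unsigned int page_base;
	unsigned int page_len;
	const char *tail;
	unsigned int tail_len;
};

/* Copy head, page fragments, and tail into one contiguous buffer and
 * return the number of bytes written. Note that both the source
 * offset (pageoff) and the page pointer advance through the loop.
 */
static unsigned int pull_up(char *dst, const struct example_xdr *xdr)
{
	char **page = xdr->pages + xdr->page_base / EXAMPLE_PAGE_SIZE;
	unsigned int pageoff = xdr->page_base % EXAMPLE_PAGE_SIZE;
	unsigned int remaining = xdr->page_len;
	char *p = dst;

	memcpy(p, xdr->head, xdr->head_len);
	p += xdr->head_len;

	while (remaining) {
		unsigned int len = EXAMPLE_PAGE_SIZE - pageoff;

		if (len > remaining)
			len = remaining;
		memcpy(p, *page++ + pageoff, len);
		p += len;
		remaining -= len;
		pageoff = 0;
	}

	memcpy(p, xdr->tail, xdr->tail_len);
	p += xdr->tail_len;

	return p - dst;
}

int main(void)
{
	static char page0[EXAMPLE_PAGE_SIZE], page1[EXAMPLE_PAGE_SIZE];
	static char dst[2 * EXAMPLE_PAGE_SIZE + 64];
	char *pages[] = { page0, page1 };
	struct example_xdr xdr = {
		.head = "HDR", .head_len = 3,
		.pages = pages, .page_base = 0, .page_len = 4100,
		.tail = "PAD", .tail_len = 3,
	};

	/* 3 + 4100 + 3 bytes end up contiguous in dst */
	printf("linearized %u bytes\n", pull_up(dst, &xdr));
	return 0;
}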