On Fri, Jan 25, 2019 at 04:54:54PM -0500, Chuck Lever wrote:
> Two and a half years ago, the client was changed to use gathered
> Send for larger inline messages, in commit 655fec6987b ("xprtrdma:
> Use gathered Send for large inline messages"). Several fixes were
> required because there are a few in-kernel device drivers whose
> max_sge is 3, and these were broken by the change.
>
> Apparently my memory is going, because some time later, I submitted
> commit 25fd86eca11c ("svcrdma: Don't overrun the SGE array in
> svc_rdma_send_ctxt"), and after that, commit f3c1fd0ee294 ("svcrdma:
> Reduce max_send_sges"). These too incorrectly assumed in-kernel
> device drivers would have more than a few Send SGEs available.
>
> The fix for the server side is not the same. This is because the
> fundamental problem on the server is that, whether or not the client
> has provisioned a chunk for the RPC reply, the server must squeeze
> even the most complex RPC replies into a single RDMA Send. Failing
> in the send path because of Send SGE exhaustion should never be an
> option.
>
> Therefore, instead of failing when the send path runs out of SGEs,
> switch to using a bounce buffer mechanism to handle RPC replies that
> are too complex for the device to send directly. That allows us to
> remove the max_sge check to enable drivers with small max_sge to
> work again.
>
> Reported-by: Don Dutile <ddutile@xxxxxxxxxx>
> Fixes: 25fd86eca11c ("svcrdma: Don't overrun the SGE array in ...")

Thanks! Do you think this is suitable for 5.0 and stable, or should I
save it up for 5.1?

--b.

> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
> ---
>  net/sunrpc/xprtrdma/svc_rdma_sendto.c    |  105 ++++++++++++++++++++++++++++--
>  net/sunrpc/xprtrdma/svc_rdma_transport.c |    9 +--
>  2 files changed, 102 insertions(+), 12 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
> index 1aab305fbbb6..8235ca2159d1 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
> @@ -531,6 +531,99 @@ void svc_rdma_sync_reply_hdr(struct svcxprt_rdma *rdma,
>  				      DMA_TO_DEVICE);
>  }
>
> +/* If the xdr_buf has more elements than the device can
> + * transmit in a single RDMA Send, then the reply will
> + * have to be copied into a bounce buffer.
> + */
> +static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
> +				    struct xdr_buf *xdr,
> +				    __be32 *wr_lst)
> +{
> +	int elements;
> +
> +	/* xdr->head */
> +	elements = 1;
> +
> +	/* xdr->pages */
> +	if (!wr_lst) {
> +		unsigned int remaining;
> +		unsigned long pageoff;
> +
> +		pageoff = xdr->page_base & ~PAGE_MASK;
> +		remaining = xdr->page_len;
> +		while (remaining) {
> +			++elements;
> +			remaining -= min_t(u32, PAGE_SIZE - pageoff,
> +					   remaining);
> +			pageoff = 0;
> +		}
> +	}
> +
> +	/* xdr->tail */
> +	if (xdr->tail[0].iov_len)
> +		++elements;
> +
> +	/* assume 1 SGE is needed for the transport header */
> +	return elements >= rdma->sc_max_send_sges;
> +}
> +
> +/* The device is not capable of sending the reply directly.
> + * Assemble the elements of @xdr into the transport header
> + * buffer.
> + */
> +static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
> +				      struct svc_rdma_send_ctxt *ctxt,
> +				      struct xdr_buf *xdr, __be32 *wr_lst)
> +{
> +	unsigned char *dst, *tailbase;
> +	unsigned int taillen;
> +
> +	dst = ctxt->sc_xprt_buf;
> +	dst += ctxt->sc_sges[0].length;
> +
> +	memcpy(dst, xdr->head[0].iov_base, xdr->head[0].iov_len);
> +	dst += xdr->head[0].iov_len;
> +
> +	tailbase = xdr->tail[0].iov_base;
> +	taillen = xdr->tail[0].iov_len;
> +	if (wr_lst) {
> +		u32 xdrpad;
> +
> +		xdrpad = xdr_padsize(xdr->page_len);
> +		if (taillen && xdrpad) {
> +			tailbase += xdrpad;
> +			taillen -= xdrpad;
> +		}
> +	} else {
> +		unsigned int len, remaining;
> +		unsigned long pageoff;
> +		struct page **ppages;
> +
> +		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
> +		pageoff = xdr->page_base & ~PAGE_MASK;
> +		remaining = xdr->page_len;
> +		while (remaining) {
> +			len = min_t(u32, PAGE_SIZE - pageoff, remaining);
> +
> +			memcpy(dst, page_address(*ppages), len);
> +			remaining -= len;
> +			dst += len;
> +			pageoff = 0;
> +		}
> +	}
> +
> +	if (taillen)
> +		memcpy(dst, tailbase, taillen);
> +
> +	ctxt->sc_sges[0].length += xdr->len;
> +	ib_dma_sync_single_for_device(rdma->sc_pd->device,
> +				      ctxt->sc_sges[0].addr,
> +				      ctxt->sc_sges[0].length,
> +				      DMA_TO_DEVICE);
> +
> +	return 0;
> +}
> +
>  /* svc_rdma_map_reply_msg - Map the buffer holding RPC message
>   * @rdma: controlling transport
>   * @ctxt: send_ctxt for the Send WR
> @@ -553,8 +646,10 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>  	u32 xdr_pad;
>  	int ret;
>
> -	if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
> -		return -EIO;
> +	if (svc_rdma_pull_up_needed(rdma, xdr, wr_lst))
> +		return svc_rdma_pull_up_reply_msg(rdma, ctxt, xdr, wr_lst);
> +
> +	++ctxt->sc_cur_sge_no;
>  	ret = svc_rdma_dma_map_buf(rdma, ctxt,
>  				   xdr->head[0].iov_base,
>  				   xdr->head[0].iov_len);
> @@ -585,8 +680,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>  	while (remaining) {
>  		len = min_t(u32, PAGE_SIZE - page_off, remaining);
>
> -		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
> -			return -EIO;
> +		++ctxt->sc_cur_sge_no;
>  		ret = svc_rdma_dma_map_page(rdma, ctxt, *ppages++,
>  					    page_off, len);
>  		if (ret < 0)
> @@ -600,8 +694,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
>  	len = xdr->tail[0].iov_len;
>  tail:
>  	if (len) {
> -		if (++ctxt->sc_cur_sge_no >= rdma->sc_max_send_sges)
> -			return -EIO;
> +		++ctxt->sc_cur_sge_no;
>  		ret = svc_rdma_dma_map_buf(rdma, ctxt, base, len);
>  		if (ret < 0)
>  			return ret;
> diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> index 6f9f4654338c..027a3b07d329 100644
> --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
> +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
> @@ -419,12 +419,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
>  	/* Transport header, head iovec, tail iovec */
>  	newxprt->sc_max_send_sges = 3;
>  	/* Add one SGE per page list entry */
> -	newxprt->sc_max_send_sges += svcrdma_max_req_size / PAGE_SIZE;
> -	if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge) {
> -		pr_err("svcrdma: too few Send SGEs available (%d needed)\n",
> -		       newxprt->sc_max_send_sges);
> -		goto errout;
> -	}
> +	newxprt->sc_max_send_sges += (svcrdma_max_req_size / PAGE_SIZE) + 1;
> +	if (newxprt->sc_max_send_sges > dev->attrs.max_send_sge)
> +		newxprt->sc_max_send_sges = dev->attrs.max_send_sge;
>  	newxprt->sc_max_req_size = svcrdma_max_req_size;
>  	newxprt->sc_max_requests = svcrdma_max_requests;
>  	newxprt->sc_max_bc_requests = svcrdma_max_bc_requests;
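
[Editorial aside: a worked example of the new Send SGE budget arithmetic in the
svc_rdma_accept() hunk above. This is an illustrative user-space sketch only; the
values used (a 16 KB svcrdma_max_req_size, a device advertising only 3 send SGEs)
are assumptions for the demonstration, not values taken from the patch.]

	/* Sketch of the post-patch SGE budget: 3 fixed SGEs plus one per
	 * page of the page list plus one spare, clamped to the device
	 * limit instead of failing the accept.
	 */
	#include <stdio.h>

	#define EXAMPLE_PAGE_SIZE	4096u

	int main(void)
	{
		unsigned int max_req_size = 16384;	/* assumed svcrdma_max_req_size */
		unsigned int dev_max_send_sge = 3;	/* assumed device limit */
		unsigned int max_send_sges;

		/* Transport header, head iovec, tail iovec */
		max_send_sges = 3;

		/* One SGE per page-list page, plus one extra (a page list
		 * that starts at a non-zero page offset can spill onto one
		 * additional page).
		 */
		max_send_sges += (max_req_size / EXAMPLE_PAGE_SIZE) + 1;

		printf("requested SGE budget: %u\n", max_send_sges);	/* 8 here */

		/* With the patch, an over-large budget is clamped to what the
		 * device supports rather than rejecting the connection; replies
		 * needing more SGEs take the pull-up path, which copies the
		 * whole message into the transport header buffer and sends it
		 * with a single SGE.
		 */
		if (max_send_sges > dev_max_send_sge)
			max_send_sges = dev_max_send_sge;

		printf("sc_max_send_sges after clamping: %u\n", max_send_sges);
		return 0;
	}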