On Sat, Feb 13, 2016 at 2:36 AM, Chuck Lever <chuck.lever@xxxxxxxxxx> wrote: > Calling ib_poll_cq() to sort through WCs during a completion is a > common pattern amongst RDMA consumers. Since commit 14d3a3b2498e > ("IB: add a proper completion queue abstraction"), WC sorting can > be handled by the IB core. > > By converting to this new API, xprtrdma is made a better neighbor to > other RDMA consumers, as it allows the core to schedule the delivery > of completions more fairly amongst all active consumers. > > Because each ib_cqe carries a pointer to a completion method, the > core can now post its own operations on a consumer's QP, and handle > the completions itself, without changes to the consumer. > > xprtrdma's receive processing is already handled in a worker thread, > but there is some initial order-dependent processing that is done > in the soft IRQ context before the worker thread is scheduled. > IB_POLL_SOFTIRQ is a direct replacement for the current xprtrdma > receive code path. 
> > Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx> > --- > net/sunrpc/xprtrdma/verbs.c | 68 ++++++++++----------------------------- > net/sunrpc/xprtrdma/xprt_rdma.h | 1 + > 2 files changed, 19 insertions(+), 50 deletions(-) > > diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c > index fc1ef5f..53c30e2 100644 > --- a/net/sunrpc/xprtrdma/verbs.c > +++ b/net/sunrpc/xprtrdma/verbs.c > @@ -215,8 +215,9 @@ rpcrdma_update_granted_credits(struct rpcrdma_rep *rep) > static void > rpcrdma_recvcq_process_wc(struct ib_wc *wc) > { > - struct rpcrdma_rep *rep = > - (struct rpcrdma_rep *)(unsigned long)wc->wr_id; > + struct ib_cqe *cqe = wc->wr_cqe; > + struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, > + rr_cqe); > > /* WARNING: Only wr_id and status are reliable at this point */ > if (wc->status != IB_WC_SUCCESS) > @@ -242,46 +243,23 @@ out_schedule: > > out_fail: > if (wc->status != IB_WC_WR_FLUSH_ERR) > - pr_err("RPC: %s: rep %p: %s\n", > - __func__, rep, ib_wc_status_msg(wc->status)); > + pr_err("RPC: %s: Recv: %s (%u, vendor %u)\n", > + __func__, ib_wc_status_msg(wc->status), > + wc->status, wc->vendor_err); > rep->rr_len = RPCRDMA_BAD_LEN; > goto out_schedule; > } > > -/* The wc array is on stack: automatic memory is always CPU-local. > +/** > + * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC > + * @cq: completion queue (ignored) > + * @wc: completed WR > * > - * struct ib_wc is 64 bytes, making the poll array potentially > - * large. But this is at the bottom of the call chain. Further > - * substantial work is done in another thread. 
> - */ > -static void > -rpcrdma_recvcq_poll(struct ib_cq *cq) > -{ > - struct ib_wc *pos, wcs[4]; > - int count, rc; > - > - do { > - pos = wcs; > - > - rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos); > - if (rc < 0) > - break; > - > - count = rc; > - while (count-- > 0) > - rpcrdma_recvcq_process_wc(pos++); > - } while (rc == ARRAY_SIZE(wcs)); > -} > - > -/* Handle provider receive completion upcalls. > */ > static void > -rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) > +rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc) Maybe we can get rid of rpcrdma_receive_wc() and directly use rpcrdma_recvcq_process_wc()? > { > - do { > - rpcrdma_recvcq_poll(cq); > - } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP | > - IB_CQ_REPORT_MISSED_EVENTS) > 0); > + rpcrdma_recvcq_process_wc(wc); > } > > static void > @@ -655,9 +633,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, > goto out2; > } > > - cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1; > - recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall, > - rpcrdma_cq_async_error_upcall, NULL, &cq_attr); > + recvcq = ib_alloc_cq(ia->ri_device, NULL, > + ep->rep_attr.cap.max_recv_wr + 1, > + 0, IB_POLL_SOFTIRQ); > if (IS_ERR(recvcq)) { > rc = PTR_ERR(recvcq); > dprintk("RPC: %s: failed to create recv CQ: %i\n", > @@ -665,14 +643,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, > goto out2; > } > > - rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP); > - if (rc) { > - dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", > - __func__, rc); > - ib_destroy_cq(recvcq); > - goto out2; > - } > - > ep->rep_attr.send_cq = sendcq; > ep->rep_attr.recv_cq = recvcq; > > @@ -735,10 +705,7 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) > ia->ri_id->qp = NULL; > } > > - rc = ib_destroy_cq(ep->rep_attr.recv_cq); > - if (rc) > - dprintk("RPC: %s: ib_destroy_cq returned %i\n", > - __func__, rc); > + ib_free_cq(ep->rep_attr.recv_cq); > > rc = ib_destroy_cq(ep->rep_attr.send_cq); > 
if (rc) > @@ -947,6 +914,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) > } > > rep->rr_device = ia->ri_device; > + rep->rr_cqe.done = rpcrdma_receive_wc; > rep->rr_rxprt = r_xprt; > INIT_WORK(&rep->rr_work, rpcrdma_receive_worker); > return rep; > @@ -1322,7 +1290,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, > int rc; > > recv_wr.next = NULL; > - recv_wr.wr_id = (u64) (unsigned long) rep; > + recv_wr.wr_cqe = &rep->rr_cqe; > recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; > recv_wr.num_sge = 1; > > diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h > index efd6fa7..7d87cdc 100644 > --- a/net/sunrpc/xprtrdma/xprt_rdma.h > +++ b/net/sunrpc/xprtrdma/xprt_rdma.h > @@ -171,6 +171,7 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) > struct rpcrdma_buffer; > > struct rpcrdma_rep { > + struct ib_cqe rr_cqe; > unsigned int rr_len; > struct ib_device *rr_device; > struct rpcrdma_xprt *rr_rxprt; > > -- > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html