Re: [PATCH v1 03/18] xprtrdma: Remove completion polling budgets

On Fri, Sep 18, 2015 at 2:14 AM, Chuck Lever <chuck.lever@xxxxxxxxxx> wrote:
>
> Commit 8301a2c047cc ("xprtrdma: Limit work done by completion
> handler") was supposed to prevent xprtrdma's upcall handlers from
> starving other softIRQ work by letting them return to the provider
> before all CQEs have been polled.
>
> The logic assumes the provider will call the upcall handler again
> immediately if the CQ is re-armed while there are still queued CQEs.
>
> This assumption is invalid. The IBTA spec says that after a CQ is
> armed, the hardware must interrupt only when a new CQE is inserted.
> xprtrdma can't rely on the provider calling again, even though some
> providers do.
>
> Therefore, leaving CQEs on queue makes sense only when there is
> another mechanism that ensures all remaining CQEs are consumed in a
> timely fashion. xprtrdma does not have such a mechanism. If a CQE
> remains queued, the transport can wait forever to send the next RPC.
>
> Finally, move the wcs array back onto the stack to ensure that the
> poll array is always local to the CPU where the completion upcall is
> running.
>
> Fixes: 8301a2c047cc ("xprtrdma: Limit work done by completion ...")
> Signed-off-by: Chuck Lever <chuck.lever@xxxxxxxxxx>
> ---
>  net/sunrpc/xprtrdma/verbs.c     |  100 ++++++++++++++++++---------------------
>  net/sunrpc/xprtrdma/xprt_rdma.h |    5 --
>  2 files changed, 45 insertions(+), 60 deletions(-)
>
> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
> index 8a477e2..f2e3863 100644
> --- a/net/sunrpc/xprtrdma/verbs.c
> +++ b/net/sunrpc/xprtrdma/verbs.c
> @@ -158,34 +158,37 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc)
>         }
>  }
>
> -static int
> +/* The wc array is on stack: automatic memory is always CPU-local.
> + *
> + * The common case is a single completion is ready. By asking
> + * for two entries, a return code of 1 means there is exactly
> + * one completion and no more. We don't have to poll again to
> + * know that the CQ is now empty.
> + */
> +static void
>  rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
>  {
> -       struct ib_wc *wcs;
> -       int budget, count, rc;
> +       struct ib_wc *pos, wcs[2];
> +       int count, rc;
>
> -       budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
>         do {
> -               wcs = ep->rep_send_wcs;
> +               pos = wcs;
>
> -               rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
> -               if (rc <= 0)
> -                       return rc;
> +               rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos);
> +               if (rc < 0)
> +                       goto out_warn;
>
>                 count = rc;
>                 while (count-- > 0)
> -                       rpcrdma_sendcq_process_wc(wcs++);
> -       } while (rc == RPCRDMA_POLLSIZE && --budget);
> -       return 0;
> +                       rpcrdma_sendcq_process_wc(pos++);
> +       } while (rc == ARRAY_SIZE(wcs));

I think I may have missed something, but I don't understand the reason
for polling 2 CQEs in one poll. It is possible that a given poll_cq
call returns only 1 completion because the other completion is delayed
for some reason. Would it be better to poll for 1 in every poll call,
or otherwise use

	while (rc <= ARRAY_SIZE(wcs) && rc);
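
For illustration, a minimal sketch of the send-side loop with that
alternative termination condition. Everything else is copied from the
patch (rpcrdma_sendcq_process_wc, the pr_warn path); this is only to
make the question concrete, not a tested change:

static void
rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct ib_wc *pos, wcs[2];
	int count, rc;

	do {
		pos = wcs;

		rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos);
		if (rc < 0)
			goto out_warn;

		count = rc;
		while (count-- > 0)
			rpcrdma_sendcq_process_wc(pos++);

		/* Poll again whenever the previous call returned any
		 * entries, instead of stopping on a partially filled
		 * array; the final iteration is one empty poll that
		 * confirms the CQ is drained.
		 */
	} while (rc <= ARRAY_SIZE(wcs) && rc);
	return;

out_warn:
	pr_warn("RPC:       %s: ib_poll_cq() failed %i\n", __func__, rc);
}

The trade-off is one extra ib_poll_cq call per upcall versus relying on
a short return from a two-entry poll to signal emptiness.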

> +       return;
> +
> +out_warn:
> +       pr_warn("RPC:       %s: ib_poll_cq() failed %i\n", __func__, rc);
>  }
>
> -/*
> - * Handle send, fast_reg_mr, and local_inv completions.
> - *
> - * Send events are typically suppressed and thus do not result
> - * in an upcall. Occasionally one is signaled, however. This
> - * prevents the provider's completion queue from wrapping and
> - * losing a completion.
> +/* Handle provider send completion upcalls.
>   */
>  static void
>  rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
> @@ -193,12 +196,7 @@ rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
>         struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
>         int rc;
>
> -       rc = rpcrdma_sendcq_poll(cq, ep);
> -       if (rc) {
> -               dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
> -                       __func__, rc);
> -               return;
> -       }
> +       rpcrdma_sendcq_poll(cq, ep);
>
>         rc = ib_req_notify_cq(cq,
>                         IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
> @@ -247,44 +245,41 @@ out_fail:
>         goto out_schedule;
>  }
>
> -static int
> +/* The wc array is on stack: automatic memory is always CPU-local.
> + *
> + * struct ib_wc is 64 bytes, making the poll array potentially
> + * large. But this is at the bottom of the call chain. Further
> + * substantial work is done in another thread.
> + */
> +static void
>  rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
>  {
> -       struct list_head sched_list;
> -       struct ib_wc *wcs;
> -       int budget, count, rc;
> +       struct ib_wc *pos, wcs[4];
> +       LIST_HEAD(sched_list);
> +       int count, rc;
>
> -       INIT_LIST_HEAD(&sched_list);
> -       budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
>         do {
> -               wcs = ep->rep_recv_wcs;
> +               pos = wcs;
>
> -               rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
> -               if (rc <= 0)
> -                       goto out_schedule;
> +               rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos);
> +               if (rc < 0)
> +                       goto out_warn;
>
>                 count = rc;
>                 while (count-- > 0)
> -                       rpcrdma_recvcq_process_wc(wcs++, &sched_list);
> -       } while (rc == RPCRDMA_POLLSIZE && --budget);
> -       rc = 0;
> +                       rpcrdma_recvcq_process_wc(pos++, &sched_list);
> +       } while (rc == ARRAY_SIZE(wcs));
>
>  out_schedule:
>         rpcrdma_schedule_tasklet(&sched_list);
> -       return rc;
> +       return;
> +
> +out_warn:
> +       pr_warn("RPC:       %s: ib_poll_cq() failed %i\n", __func__, rc);
> +       goto out_schedule;
>  }
>
> -/*
> - * Handle receive completions.
> - *
> - * It is reentrant but processes single events in order to maintain
> - * ordering of receives to keep server credits.
> - *
> - * It is the responsibility of the scheduled tasklet to return
> - * recv buffers to the pool. NOTE: this affects synchronization of
> - * connection shutdown. That is, the structures required for
> - * the completion of the reply handler must remain intact until
> - * all memory has been reclaimed.
> +/* Handle provider receive completion upcalls.
>   */
>  static void
>  rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
> @@ -292,12 +287,7 @@ rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
>         struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
>         int rc;
>
> -       rc = rpcrdma_recvcq_poll(cq, ep);
> -       if (rc) {
> -               dprintk("RPC:       %s: ib_poll_cq failed: %i\n",
> -                       __func__, rc);
> -               return;
> -       }
> +       rpcrdma_recvcq_poll(cq, ep);
>
>         rc = ib_req_notify_cq(cq,
>                         IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
> index c09414e..42c8d44 100644
> --- a/net/sunrpc/xprtrdma/xprt_rdma.h
> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h
> @@ -77,9 +77,6 @@ struct rpcrdma_ia {
>   * RDMA Endpoint -- one per transport instance
>   */
>
> -#define RPCRDMA_WC_BUDGET      (128)
> -#define RPCRDMA_POLLSIZE       (16)
> -
>  struct rpcrdma_ep {
>         atomic_t                rep_cqcount;
>         int                     rep_cqinit;
> @@ -89,8 +86,6 @@ struct rpcrdma_ep {
>         struct rdma_conn_param  rep_remote_cma;
>         struct sockaddr_storage rep_remote_addr;
>         struct delayed_work     rep_connect_worker;
> -       struct ib_wc            rep_send_wcs[RPCRDMA_POLLSIZE];
> -       struct ib_wc            rep_recv_wcs[RPCRDMA_POLLSIZE];
>  };
>
>  /*
>