Re: [PATCH] IB/rxe: assign QPs to cores for transmit scaling

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Jun 19, 2018 at 4:49 AM Vijay Immanuel <vijayi@xxxxxxxxxxxxxxxxx> wrote:
>
> A QP must use the same NIC TX queue to maintain packet order. The TX
> queue is usually selected based on the core from which the transmit
> was originated.
> Assign QPs to cores to better spread traffic across TX queues. This
> requires scheduling the tasklets in the cpu assigned to the QP. The
> transmit cpu is selected based on the comp_vector for the QP's scq.
>
> Signed-off-by: Vijay Immanuel <vijayi@xxxxxxxxxxxxxxxxx>
> ---
>  drivers/infiniband/sw/rxe/rxe_comp.c  |  8 +------
>  drivers/infiniband/sw/rxe/rxe_cq.c    |  1 +
>  drivers/infiniband/sw/rxe/rxe_qp.c    |  6 ++---
>  drivers/infiniband/sw/rxe/rxe_resp.c  |  8 +------
>  drivers/infiniband/sw/rxe/rxe_task.c  | 45 ++++++++++++++++++++++++++++++++---
>  drivers/infiniband/sw/rxe/rxe_task.h  |  7 +++++-
>  drivers/infiniband/sw/rxe/rxe_verbs.c |  3 +--
>  drivers/infiniband/sw/rxe/rxe_verbs.h |  1 +
>  8 files changed, 56 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
> index 6cdc40e..189f80e 100644
> --- a/drivers/infiniband/sw/rxe/rxe_comp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_comp.c
> @@ -149,14 +149,8 @@ void retransmit_timer(struct timer_list *t)
>  void rxe_comp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
>                         struct sk_buff *skb)
>  {
> -       int must_sched;
> -
>         skb_queue_tail(&qp->resp_pkts, skb);
> -
> -       must_sched = skb_queue_len(&qp->resp_pkts) > 1;
> -       if (must_sched != 0)
> -               rxe_counter_inc(rxe, RXE_CNT_COMPLETER_SCHED);
> -       rxe_run_task(&qp->comp.task, must_sched);
> +       rxe_run_task(&qp->comp.task, 1);
>  }
>
>  static inline enum comp_state get_wqe(struct rxe_qp *qp,
> diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c
> index 2ee4b08..ac7c5d0 100644
> --- a/drivers/infiniband/sw/rxe/rxe_cq.c
> +++ b/drivers/infiniband/sw/rxe/rxe_cq.c
> @@ -106,6 +106,7 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
>                 cq->is_user = 1;
>
>         cq->is_dying = false;
> +       cq->comp_vector = comp_vector;
What does this mean in RXE since we don't have control on the HW?
>
>         tasklet_init(&cq->comp_task, rxe_send_complete, (unsigned long)cq);
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
> index b9f7aa1..58204c8 100644
> --- a/drivers/infiniband/sw/rxe/rxe_qp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_qp.c
> @@ -260,9 +260,9 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
>         spin_lock_init(&qp->sq.sq_lock);
>         skb_queue_head_init(&qp->req_pkts);
>
> -       rxe_init_task(rxe, &qp->req.task, qp,
> +       rxe_init_task(rxe, &qp->req.task, qp->scq->comp_vector, qp,
>                       rxe_requester, "req");
> -       rxe_init_task(rxe, &qp->comp.task, qp,
> +       rxe_init_task(rxe, &qp->comp.task, qp->scq->comp_vector, qp,
>                       rxe_completer, "comp");
You bind all tasklets to the completion vector of the SCQ. I think it
makes bad use of this value since it is mentioned in the documentation
of create_cq() and applications shouldn't expect this.

>
>         qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */
> @@ -311,7 +311,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
>
>         skb_queue_head_init(&qp->resp_pkts);
>
> -       rxe_init_task(rxe, &qp->resp.task, qp,
> +       rxe_init_task(rxe, &qp->resp.task, qp->scq->comp_vector, qp,
>                       rxe_responder, "resp");
>
>         qp->resp.opcode         = OPCODE_NONE;
> diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
> index 955ff3b..853ab49 100644
> --- a/drivers/infiniband/sw/rxe/rxe_resp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_resp.c
> @@ -107,15 +107,9 @@ static char *resp_state_name[] = {
>  void rxe_resp_queue_pkt(struct rxe_dev *rxe, struct rxe_qp *qp,
>                         struct sk_buff *skb)
>  {
> -       int must_sched;
> -       struct rxe_pkt_info *pkt = SKB_TO_PKT(skb);
> -
>         skb_queue_tail(&qp->req_pkts, skb);
>
> -       must_sched = (pkt->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST) ||
> -                       (skb_queue_len(&qp->req_pkts) > 1);
> -
> -       rxe_run_task(&qp->resp.task, must_sched);
> +       rxe_run_task(&qp->resp.task, 1);
>  }
>
>  static inline enum resp_states get_req(struct rxe_qp *qp,
> diff --git a/drivers/infiniband/sw/rxe/rxe_task.c b/drivers/infiniband/sw/rxe/rxe_task.c
> index 08f05ac..8b7cb97 100644
> --- a/drivers/infiniband/sw/rxe/rxe_task.c
> +++ b/drivers/infiniband/sw/rxe/rxe_task.c
> @@ -37,6 +37,8 @@
>
>  #include "rxe_task.h"
>
> +static void rxe_run_task_local(void *data);
> +
>  int __rxe_do_task(struct rxe_task *task)
>
>  {
> @@ -63,6 +65,7 @@ void rxe_do_task(unsigned long data)
>         struct rxe_task *task = (struct rxe_task *)data;
>
>         spin_lock_irqsave(&task->state_lock, flags);
> +       task->scheduled = false;
>         switch (task->state) {
>         case TASK_STATE_START:
>                 task->state = TASK_STATE_BUSY;
> @@ -108,20 +111,27 @@ void rxe_do_task(unsigned long data)
>                         pr_warn("%s failed with bad state %d\n", __func__,
>                                 task->state);
>                 }
> +
> +               if (!cont && task->scheduled)
> +                       tasklet_schedule(&task->tasklet);
>                 spin_unlock_irqrestore(&task->state_lock, flags);
>         } while (cont);
>
>         task->ret = ret;
>  }
>
> -int rxe_init_task(void *obj, struct rxe_task *task,
> +int rxe_init_task(void *obj, struct rxe_task *task, int cpu,
>                   void *arg, int (*func)(void *), char *name)
>  {
> +       task->cpu       = cpu;
>         task->obj       = obj;
>         task->arg       = arg;
>         task->func      = func;
> +       task->csd.func  = rxe_run_task_local;
> +       task->csd.info  = task;
>         snprintf(task->name, sizeof(task->name), "%s", name);
>         task->destroyed = false;
> +       task->scheduled = false;
>
>         tasklet_init(&task->tasklet, rxe_do_task, (unsigned long)task);
>
> @@ -151,15 +161,44 @@ void rxe_cleanup_task(struct rxe_task *task)
>         tasklet_kill(&task->tasklet);
>  }
>
> +static void rxe_run_task_local(void *data)
> +{
> +       struct rxe_task *task = (struct rxe_task *)data;
> +
> +       if (task->destroyed)
> +               return;
> +
> +       tasklet_schedule(&task->tasklet);
> +}
> +
>  void rxe_run_task(struct rxe_task *task, int sched)
>  {
> +       int cpu;
> +       unsigned long flags;
> +
>         if (task->destroyed)
>                 return;
>
> -       if (sched)
> +       if (!sched) {
> +               rxe_do_task((unsigned long)task);
> +               return;
> +       }
> +
> +       spin_lock_irqsave(&task->state_lock, flags);
> +       if (task->scheduled || task->state != TASK_STATE_START) {
> +               task->scheduled = true;
> +               spin_unlock_irqrestore(&task->state_lock, flags);
> +               return;
> +       }
> +       task->scheduled = true;
> +       spin_unlock_irqrestore(&task->state_lock, flags);
> +
> +       cpu = get_cpu();
> +       if (task->cpu == cpu || !cpu_online(task->cpu))
>                 tasklet_schedule(&task->tasklet);
>         else
> -               rxe_do_task((unsigned long)task);
> +               smp_call_function_single_async(task->cpu, &task->csd);
> +       put_cpu();
>  }
>
>  void rxe_disable_task(struct rxe_task *task)
> diff --git a/drivers/infiniband/sw/rxe/rxe_task.h b/drivers/infiniband/sw/rxe/rxe_task.h
> index 08ff42d..1470dee 100644
> --- a/drivers/infiniband/sw/rxe/rxe_task.h
> +++ b/drivers/infiniband/sw/rxe/rxe_task.h
> @@ -34,6 +34,8 @@
>  #ifndef RXE_TASK_H
>  #define RXE_TASK_H
>
> +#include <linux/smp.h>
> +
>  enum {
>         TASK_STATE_START        = 0,
>         TASK_STATE_BUSY         = 1,
> @@ -48,13 +50,16 @@ enum {
>  struct rxe_task {
>         void                    *obj;
>         struct tasklet_struct   tasklet;
> +       int                     cpu;
>         int                     state;
>         spinlock_t              state_lock; /* spinlock for task state */
>         void                    *arg;
>         int                     (*func)(void *arg);
> +       call_single_data_t      csd;
>         int                     ret;
>         char                    name[16];
>         bool                    destroyed;
> +       bool                    scheduled;
>  };
>
>  /*
> @@ -62,7 +67,7 @@ struct rxe_task {
>   *     arg  => parameter to pass to fcn
>   *     fcn  => function to call until it returns != 0
>   */
> -int rxe_init_task(void *obj, struct rxe_task *task,
> +int rxe_init_task(void *obj, struct rxe_task *task, int cpu,
>                   void *arg, int (*func)(void *), char *name);
>
>  /* cleanup task */
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c
> index 73a00a1..addccc1 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.c
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c
> @@ -813,8 +813,7 @@ static int rxe_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
>         }
>
>         if (qp->is_user) {
> -               /* Utilize process context to do protocol processing */
> -               rxe_run_task(&qp->req.task, 0);
> +               rxe_run_task(&qp->req.task, 1);
>                 return 0;
>         } else
>                 return rxe_post_send_kernel(qp, wr, bad_wr);
This overrides the CPU that the application choose to run on. I'm not
sure that this is a good idea.
If you left this code unchanged you can trust on the application to
make a smart CPU selection and get the better utilization that you
want to get. What do you think?
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index af1470d..6c6e8db 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -92,6 +92,7 @@ struct rxe_cq {
>         u8                      notify;
>         bool                    is_dying;
>         int                     is_user;
> +       int                     comp_vector;
>         struct tasklet_struct   comp_task;
>  };
>
> --
> 2.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux