Re: [PATCH 03/24] ibtrs: core: lib functions shared between client and server modules

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Sagi,

On Mon, Feb 5, 2018 at 11:52 AM, Sagi Grimberg <sagi@xxxxxxxxxxx> wrote:
> Hi Roman,
>
> Here are some comments below.
>
>> +int ibtrs_post_recv_empty(struct ibtrs_con *con, struct ib_cqe *cqe)
>> +{
>> +       struct ib_recv_wr wr, *bad_wr;
>> +
>> +       wr.next    = NULL;
>> +       wr.wr_cqe  = cqe;
>> +       wr.sg_list = NULL;
>> +       wr.num_sge = 0;
>> +
>> +       return ib_post_recv(con->qp, &wr, &bad_wr);
>> +}
>> +EXPORT_SYMBOL_GPL(ibtrs_post_recv_empty);
>
>
> What is this designed to do?

Each IO completion (response from server to client) is an immediate
message with no data inside.  Using IMM field we are able to find IO
in the inflight table and complete it with an error code if any.

>> +int ibtrs_iu_post_rdma_write_imm(struct ibtrs_con *con, struct ibtrs_iu
>> *iu,
>> +                                struct ib_sge *sge, unsigned int num_sge,
>> +                                u32 rkey, u64 rdma_addr, u32 imm_data,
>> +                                enum ib_send_flags flags)
>> +{
>> +       struct ib_send_wr *bad_wr;
>> +       struct ib_rdma_wr wr;
>> +       int i;
>> +
>> +       wr.wr.next        = NULL;
>> +       wr.wr.wr_cqe      = &iu->cqe;
>> +       wr.wr.sg_list     = sge;
>> +       wr.wr.num_sge     = num_sge;
>> +       wr.rkey           = rkey;
>> +       wr.remote_addr    = rdma_addr;
>> +       wr.wr.opcode      = IB_WR_RDMA_WRITE_WITH_IMM;
>> +       wr.wr.ex.imm_data = cpu_to_be32(imm_data);
>> +       wr.wr.send_flags  = flags;
>> +
>> +       /*
>> +        * If one of the sges has 0 size, the operation will fail with an
>> +        * length error
>> +        */
>> +       for (i = 0; i < num_sge; i++)
>> +               if (WARN_ON(sge[i].length == 0))
>> +                       return -EINVAL;
>> +
>> +       return ib_post_send(con->qp, &wr.wr, &bad_wr);
>> +}
>> +EXPORT_SYMBOL_GPL(ibtrs_iu_post_rdma_write_imm);
>> +
>> +int ibtrs_post_rdma_write_imm_empty(struct ibtrs_con *con, struct ib_cqe
>> *cqe,
>> +                                   u32 imm_data, enum ib_send_flags
>> flags)
>> +{
>> +       struct ib_send_wr wr, *bad_wr;
>> +
>> +       memset(&wr, 0, sizeof(wr));
>> +       wr.wr_cqe       = cqe;
>> +       wr.send_flags   = flags;
>> +       wr.opcode       = IB_WR_RDMA_WRITE_WITH_IMM;
>> +       wr.ex.imm_data  = cpu_to_be32(imm_data);
>> +
>> +       return ib_post_send(con->qp, &wr, &bad_wr);
>> +}
>> +EXPORT_SYMBOL_GPL(ibtrs_post_rdma_write_imm_empty);
>
>
> Christoph did a great job adding a generic rdma rw API, please
> reuse it, if you rely on needed functionality that does not exist
> there, please enhance it instead of open-coding a new rdma engine
> library.

Good to know, thanks.

>> +static int ibtrs_ib_dev_init(struct ibtrs_ib_dev *d, struct ib_device
>> *dev)
>> +{
>> +       int err;
>> +
>> +       d->pd = ib_alloc_pd(dev, IB_PD_UNSAFE_GLOBAL_RKEY);
>> +       if (IS_ERR(d->pd))
>> +               return PTR_ERR(d->pd);
>> +       d->dev = dev;
>> +       d->lkey = d->pd->local_dma_lkey;
>> +       d->rkey = d->pd->unsafe_global_rkey;
>> +
>> +       err = ibtrs_query_device(d);
>> +       if (unlikely(err))
>> +               ib_dealloc_pd(d->pd);
>> +
>> +       return err;
>> +}
>
>
> I must say that this makes me frustrated.. We stopped doing these
> sort of things long time ago. No way we can even consider accepting
> the unsafe use of the global rkey exposing the entire memory space for
> remote access permissions.
>
> Sorry for being blunt, but this protocol design which makes a concious
> decision to expose unconditionally is broken by definition.

I suppose we can also afford the same trick which nvme does: provide
register_always module argument, can we?  That can be also interesting
to measure the performance difference.

When I did nvme testing with register_always=true/false I saw the
difference.  Would be nice to measure ibtrs with register_always=true
once ibtrs supports that.

>> +struct ibtrs_ib_dev *ibtrs_ib_dev_find_get(struct rdma_cm_id *cm_id)
>> +{
>> +       struct ibtrs_ib_dev *dev;
>> +       int err;
>> +
>> +       mutex_lock(&device_list_mutex);
>> +       list_for_each_entry(dev, &device_list, entry) {
>> +               if (dev->dev->node_guid == cm_id->device->node_guid &&
>> +                   kref_get_unless_zero(&dev->ref))
>> +                       goto out_unlock;
>> +       }
>> +       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
>> +       if (unlikely(!dev))
>> +               goto out_err;
>> +
>> +       kref_init(&dev->ref);
>> +       err = ibtrs_ib_dev_init(dev, cm_id->device);
>> +       if (unlikely(err))
>> +               goto out_free;
>> +       list_add(&dev->entry, &device_list);
>> +out_unlock:
>> +       mutex_unlock(&device_list_mutex);
>> +
>> +       return dev;
>> +
>> +out_free:
>> +       kfree(dev);
>> +out_err:
>> +       mutex_unlock(&device_list_mutex);
>> +
>> +       return NULL;
>> +}
>> +EXPORT_SYMBOL_GPL(ibtrs_ib_dev_find_get);
>
>
> Is it time to make this a common helper in rdma_cm?

True, can be also the patch for nvme.

>> +static void schedule_hb(struct ibtrs_sess *sess)
>> +{
>> +       queue_delayed_work(sess->hb_wq, &sess->hb_dwork,
>> +                          msecs_to_jiffies(sess->hb_interval_ms));
>> +}
>
>
> What does hb stand for?

Just heartbeats :)

>
>> +void ibtrs_send_hb_ack(struct ibtrs_sess *sess)
>> +{
>> +       struct ibtrs_con *usr_con = sess->con[0];
>> +       u32 imm;
>> +       int err;
>> +
>> +       imm = ibtrs_to_imm(IBTRS_HB_ACK_IMM, 0);
>> +       err = ibtrs_post_rdma_write_imm_empty(usr_con, sess->hb_cqe,
>> +                                             imm, IB_SEND_SIGNALED);
>> +       if (unlikely(err)) {
>> +               sess->hb_err_handler(usr_con, err);
>> +               return;
>> +       }
>> +}
>> +EXPORT_SYMBOL_GPL(ibtrs_send_hb_ack);
>
>
> What is this?
>
> What is all this hb stuff?

Heartbeats acknowledgements.

>> +static int ibtrs_str_ipv4_to_sockaddr(const char *addr, size_t len,
>> +                                     short port, struct sockaddr *dst)
>> +{
>> +       struct sockaddr_in *dst_sin = (struct sockaddr_in *)dst;
>> +       int ret;
>> +
>> +       ret = in4_pton(addr, len, (u8 *)&dst_sin->sin_addr.s_addr,
>> +                      '\0', NULL);
>> +       if (ret == 0)
>> +               return -EINVAL;
>> +
>> +       dst_sin->sin_family = AF_INET;
>> +       dst_sin->sin_port = htons(port);
>> +
>> +       return 0;
>> +}
>> +
>> +static int ibtrs_str_ipv6_to_sockaddr(const char *addr, size_t len,
>> +                                     short port, struct sockaddr *dst)
>> +{
>> +       struct sockaddr_in6 *dst_sin6 = (struct sockaddr_in6 *)dst;
>> +       int ret;
>> +
>> +       ret = in6_pton(addr, len, dst_sin6->sin6_addr.s6_addr,
>> +                      '\0', NULL);
>> +       if (ret != 1)
>> +               return -EINVAL;
>> +
>> +       dst_sin6->sin6_family = AF_INET6;
>> +       dst_sin6->sin6_port = htons(port);
>> +
>> +       return 0;
>> +}
>
>
> We already added helpers for this in net utils, you don't need to
> code it again.

Nice.  Will reuse then.

>> +
>> +static int ibtrs_str_gid_to_sockaddr(const char *addr, size_t len,
>> +                                    short port, struct sockaddr *dst)
>> +{
>> +       struct sockaddr_ib *dst_ib = (struct sockaddr_ib *)dst;
>> +       int ret;
>> +
>> +       /* We can use some of the I6 functions since GID is a valid
>> +        * IPv6 address format
>> +        */
>> +       ret = in6_pton(addr, len, dst_ib->sib_addr.sib_raw, '\0', NULL);
>> +       if (ret == 0)
>> +               return -EINVAL;
>> +
>> +       dst_ib->sib_family = AF_IB;
>> +       /*
>> +        * Use the same TCP server port number as the IB service ID
>> +        * on the IB port space range
>> +        */
>> +       dst_ib->sib_sid = cpu_to_be64(RDMA_IB_IP_PS_IB | port);
>> +       dst_ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
>> +       dst_ib->sib_pkey = cpu_to_be16(0xffff);
>> +
>> +       return 0;
>> +}
>
>
> Would be a nice addition to net utils.

Got it.

--
Roman
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux