Hi Sagi,

> >> Another question, from what I understand from the code, the client
> >> always rdma_writes data on writes (with imm) from a remote pool of
> >> server buffers dedicated to it. Essentially all writes are immediate (no
> >> rdma reads ever). How is that different than using send wrs to a set of
> >> pre-posted recv buffers (like all others are doing)? Is it faster?
>
> > At the very beginning of the project we did some measurements and saw,
> > that it is faster. I'm not sure if this is still true
>
> Its not significantly faster (can't imagine why it would be).
> What could make a difference is probably the fact that you never
> do rdma reads for I/O writes which might be better. Also perhaps the
> fact that you normally don't wait for send completions before completing
> I/O (which is broken), and the fact that you batch recv operations.

I don't know how you came to the conclusion that we don't wait for send
completions before completing IO. On the server side we chain the WRs
when completing a read request; see function rdma_write_sg():

318 static int rdma_write_sg(struct ibtrs_srv_op *id)
319 {
320 	struct ibtrs_srv_sess *sess = to_srv_sess(id->con->c.sess);
321 	dma_addr_t dma_addr = sess->dma_addr[id->msg_id];
322 	struct ibtrs_srv *srv = sess->srv;
323 	struct ib_send_wr inv_wr, imm_wr;
324 	struct ib_rdma_wr *wr = NULL;

snip

333 	need_inval = le16_to_cpu(id->rd_msg->flags) & IBTRS_MSG_NEED_INVAL_F;

snip

357 		wr->wr.wr_cqe = &io_comp_cqe;
358 		wr->wr.sg_list = list;
359 		wr->wr.num_sge = 1;
360 		wr->remote_addr = le64_to_cpu(id->rd_msg->desc[i].addr);
361 		wr->rkey = le32_to_cpu(id->rd_msg->desc[i].key);

snip

368 		if (i < (sg_cnt - 1))
369 			wr->wr.next = &id->tx_wr[i + 1].wr;
370 		else if (need_inval)
371 			wr->wr.next = &inv_wr;
372 		else
373 			wr->wr.next = &imm_wr;
374
375 		wr->wr.opcode = IB_WR_RDMA_WRITE;
376 		wr->wr.ex.imm_data = 0;
377 		wr->wr.send_flags = 0;

snip

386 	if (need_inval) {
387 		inv_wr.next = &imm_wr;
388 		inv_wr.wr_cqe = &io_comp_cqe;
389 		inv_wr.sg_list = NULL;
390 		inv_wr.num_sge = 0;
391 		inv_wr.opcode = IB_WR_SEND_WITH_INV;
392 		inv_wr.send_flags = 0;
393 		inv_wr.ex.invalidate_rkey = rkey;
394 	}
395 	imm_wr.next = NULL;
396 	imm_wr.wr_cqe = &io_comp_cqe;
397 	imm_wr.sg_list = NULL;
398 	imm_wr.num_sge = 0;
399 	imm_wr.opcode = IB_WR_RDMA_WRITE_WITH_IMM;
400 	imm_wr.send_flags = flags;
401 	imm_wr.ex.imm_data = cpu_to_be32(ibtrs_to_io_rsp_imm(id->msg_id,
402 							      0, need_inval));

When we need to invalidate the remote memory, the WRs are chained
together and the last two in the chain are inv_wr and imm_wr, with
imm_wr last. This ordering is important: since RC QPs are ordered, by
the time the client receives IB_WC_RECV_RDMA_WITH_IMM with w_inval set,
the hardware has already finished invalidating the MR. If the server
fails to invalidate, the client falls back to a local invalidation and
waits for its completion.
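ibtrs_inv_rkey() itself is not quoted in this mail; just for the sake of
the discussion, a minimal sketch of such a local invalidation with the
kernel verbs API could look like the following. The callback name and
the inv_cqe field are illustrative assumptions; only req->inv_comp and
req->mr appear in the code quoted below:

static void ibtrs_clt_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct ibtrs_clt_io_req *req =
		container_of(wc->wr_cqe, struct ibtrs_clt_io_req, inv_cqe);

	/* Wake up the waiter in complete_rdma_req() */
	complete(&req->inv_comp);
}

static int ibtrs_inv_rkey(struct ibtrs_clt_io_req *req)
{
	struct ib_send_wr *bad_wr;
	struct ib_send_wr wr = {
		.opcode             = IB_WR_LOCAL_INV,
		.wr_cqe             = &req->inv_cqe,
		.send_flags         = IB_SEND_SIGNALED,
		.ex.invalidate_rkey = req->mr->rkey,
	};

	req->inv_cqe.done = ibtrs_clt_inv_rkey_done;

	return ib_post_send(req->con->c.qp, &wr, &bad_wr);
}

In the !can_wait case the quoted client code expects the whole request
to be completed from that callback (using the saved req->inv_errno);
that branch is omitted in the sketch above.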
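Note also that everything the client needs to process the response
travels in the single 32-bit immediate data word of the final imm_wr:
the msg_id, the error status and the w_inval flag, packed by
ibtrs_to_io_rsp_imm() above. That helper is not quoted here either;
purely as an illustration (the helper names and field widths below are
assumptions, not the actual IBTRS layout), such a packing could look
like:

/* Assumed layout for illustration:
 * bit 31: w_inval, bits 30..16: abs(errno), bits 15..0: msg_id.
 */
static inline u32 io_rsp_imm_pack(u32 msg_id, int errno, bool w_inval)
{
	return (w_inval ? BIT(31) : 0) |
	       ((abs(errno) & 0x7fff) << 16) |
	       (msg_id & 0xffff);
}

static inline void io_rsp_imm_unpack(u32 imm, u32 *msg_id, s16 *errno,
				     bool *w_inval)
{
	*w_inval = !!(imm & BIT(31));
	*errno   = -(s16)((imm >> 16) & 0x7fff);
	*msg_id  = imm & 0xffff;
}

The only property the protocol relies on is that both sides agree on
the packing and that it fits into the one immediate word that
RDMA_WRITE_WITH_IMM provides; the client unpacks it and feeds msg_id,
errno and w_inval into process_io_rsp(), quoted below.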
On the client side:

284 static void complete_rdma_req(struct ibtrs_clt_io_req *req, int errno,
285 			      bool notify, bool can_wait)
286 {
287 	struct ibtrs_clt_con *con = req->con;
288 	struct ibtrs_clt_sess *sess;
289 	struct ibtrs_clt *clt;
290 	int err;
291
292 	if (WARN_ON(!req->in_use))
293 		return;
294 	if (WARN_ON(!req->con))
295 		return;
296 	sess = to_clt_sess(con->c.sess);
297 	clt = sess->clt;
298
299 	if (req->sg_cnt) {
300 		if (unlikely(req->dir == DMA_FROM_DEVICE && req->need_inv)) {
301 			/*
302 			 * We are here to invalidate RDMA read requests
303 			 * ourselves. In normal scenario server should
304 			 * send INV for all requested RDMA reads, but
305 			 * we are here, thus two things could happen:
306 			 *
307 			 * 1. this is failover, when errno != 0
308 			 *    and can_wait == 1,
309 			 *
310 			 * 2. something totally bad happened and
311 			 *    server forgot to send INV, so we
312 			 *    should do that ourselves.
313 			 */
314
315 			if (likely(can_wait)) {
316 				req->need_inv_comp = true;
317 			} else {
318 				/* This should be IO path, so always notify */
319 				WARN_ON(!notify);
320 				/* Save errno for INV callback */
321 				req->inv_errno = errno;
322 			}
323
324 			err = ibtrs_inv_rkey(req);
325 			if (unlikely(err)) {
326 				ibtrs_err(sess, "Send INV WR key=%#x: %d\n",
327 					  req->mr->rkey, err);
328 			} else if (likely(can_wait)) {
329 				wait_for_completion(&req->inv_comp);
330 			} else {
331 				/*
332 				 * Something went wrong, so request will be
333 				 * completed from INV callback.
334 				 */
335 				WARN_ON_ONCE(1);
336
337 				return;
338 			}
339 		}
340 		ib_dma_unmap_sg(sess->s.dev->ib_dev, req->sglist,
341 				req->sg_cnt, req->dir);
342 	}
343 	if (sess->stats.enable_rdma_lat)
344 		ibtrs_clt_update_rdma_lat(&sess->stats,
345 				req->dir == DMA_FROM_DEVICE,
346 				jiffies_to_msecs(jiffies - req->start_jiffies));
347 	ibtrs_clt_decrease_inflight(&sess->stats);
348
349 	req->in_use = false;
350 	req->con = NULL;
351
352 	if (notify)
353 		req->conf(req->priv, errno);
354 }

complete_rdma_req() is invoked from process_io_rsp(), with w_inval taken
from the received immediate data:

356 static void process_io_rsp(struct ibtrs_clt_sess *sess, u32 msg_id,
357 			   s16 errno, bool w_inval)
358 {
359 	struct ibtrs_clt_io_req *req;
360
361 	if (WARN_ON(msg_id >= sess->queue_depth))
362 		return;
363
364 	req = &sess->reqs[msg_id];
365 	/* Drop need_inv if server responded with invalidation */
366 	req->need_inv &= !w_inval;
367 	complete_rdma_req(req, errno, true, false);
368 }

Hope this clears the doubt.

Regards,
Jack