On Wed, Jun 9, 2021 at 7:18 AM Pearson, Robert B <rpearsonhpe@xxxxxxxxx> wrote:
>
>
> On 6/8/2021 2:17 AM, Zhu Yanjun wrote:
> > On Tue, Jun 8, 2021 at 3:01 PM Bob Pearson <rpearsonhpe@xxxxxxxxx> wrote:
> >> On 6/8/21 1:54 AM, Zhu Yanjun wrote:
> >>> On Tue, Jun 8, 2021 at 12:28 PM Bob Pearson <rpearsonhpe@xxxxxxxxx> wrote:
> >>>> This patch makes the required changes to the rxe provider to support the
> >>>> kernel memory windows patches to the rxe driver.
> >>>>
> >>>> The following changes are made:
> >>>> - Add ibv_alloc_mw verb
> >>>> - Add ibv_dealloc_mw verb
> >>>> - Add ibv_bind_mw verb for type 1 MWs
> >>>> - Add support for bind MW send work requests through the traditional
> >>>>   QP API and the extended QP API.
> >>>>
> >>>> Signed-off-by: Bob Pearson <rpearsonhpe@xxxxxxxxx>
> >>>> ---
> >>>> v2:
> >>>>   Added support for extended QP bind MW work requests.
> >>> I got the following errors.
> >>> Is it a known issue?
> >>>
> >>> # ./bin/run_tests.py --dev rxe0
> >>> .............sssssssss.............ssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssssss........sssssssssssssssssss....ssss.........s...s.....E.......ssssssssss..ss
> >>> ======================================================================
> >>> ERROR: test_qp_ex_rc_bind_mw (tests.test_qpex.QpExTestCase)
> >>> Verify bind memory window operation using the new post_send API.
> >>> ----------------------------------------------------------------------
> >>> Traceback (most recent call last):
> >>>   File "/root/rdma-core/tests/test_qpex.py", line 292, in test_qp_ex_rc_bind_mw
> >>>     u.poll_cq(server.cq)
> >>>   File "/root/rdma-core/tests/utils.py", line 538, in poll_cq
> >>>     raise PyverbsRDMAError('Completion status is {s}'.
> >>> pyverbs.pyverbs_error.PyverbsRDMAError: Completion status is Memory
> >>> window bind error. Errno: 6, No such device or address
> >>>
> >>> ----------------------------------------------------------------------
> >>> Ran 193 tests in 2.544s
> >>>
> >>> FAILED (errors=1, skipped=128)
> >>>
> >>> Zhu Yanjun
> >> Yes. It is because the test is not setting the BIND_MW access flag
> >> for the MR. It is not permitted to bind an MW to an MR that does not
> >> have the BIND_MW access flag set, but this test mistakenly expects it
> >> to work. The other MW tests in test_mr.py do set the BIND_MW access
> >> flag.
> > Thanks a lot.
> >
> > If this root cause is correct, and setting the BIND_MW access flag
> > for the MR fixes the problem, please send a commit with that fix.
> >
> > I am fine with this MW patch series once all the problems (in both
> > rdma-core and the kernel rxe driver) are fixed.
> >
> > Thanks for your effort.
> >
> > Zhu Yanjun
>
> Zhu,
>
> There was a second test bug, now fixed, and all the python tests now
> pass or are skipped. No errors. There is a message to the linux-rdma
> list from Edward Srouji indicating that the patch is upstream at
> github, or you can use the one in my email to him. It will be a while
> until his PR goes upstream, I would guess.
>
> I sent a fresh set of patches last night for both the kernel and the
> user space provider.

Thanks a lot for your effort.

I applied your commits for rdma-core and for rxe in the kernel, plus
the following diff. All the python tests in rdma-core now pass or are
skipped. No errors.

Thanks again.
" diff --git a/tests/test_qpex.py b/tests/test_qpex.py index 20288d45..41028e2d 100644 --- a/tests/test_qpex.py +++ b/tests/test_qpex.py @@ -146,10 +146,10 @@ class QpExRCAtomicFetchAdd(RCResources): class QpExRCBindMw(RCResources): def create_qps(self): - create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_BIND_MW) + create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_BIND_MW|e.IBV_QP_EX_WITH_RDMA_WRITE) def create_mr(self): - self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_WRITE) + self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_WRITE|e.IBV_ACCESS_MW_BIND) " Zhu Yanjun > > Bob > > >> Bob > >>>> --- > >>>> providers/rxe/rxe.c | 157 +++++++++++++++++++++++++++++++++++++++++++- > >>>> 1 file changed, 155 insertions(+), 2 deletions(-) > >>>> > >>>> diff --git a/providers/rxe/rxe.c b/providers/rxe/rxe.c > >>>> index a68656ae..bb39ef04 100644 > >>>> --- a/providers/rxe/rxe.c > >>>> +++ b/providers/rxe/rxe.c > >>>> @@ -128,6 +128,95 @@ static int rxe_dealloc_pd(struct ibv_pd *pd) > >>>> return ret; > >>>> } > >>>> > >>>> +static struct ibv_mw *rxe_alloc_mw(struct ibv_pd *ibpd, enum ibv_mw_type type) > >>>> +{ > >>>> + int ret; > >>>> + struct ibv_mw *ibmw; > >>>> + struct ibv_alloc_mw cmd = {}; > >>>> + struct ib_uverbs_alloc_mw_resp resp = {}; > >>>> + > >>>> + ibmw = calloc(1, sizeof(*ibmw)); > >>>> + if (!ibmw) > >>>> + return NULL; > >>>> + > >>>> + ret = ibv_cmd_alloc_mw(ibpd, type, ibmw, &cmd, sizeof(cmd), > >>>> + &resp, sizeof(resp)); > >>>> + if (ret) { > >>>> + free(ibmw); > >>>> + return NULL; > >>>> + } > >>>> + > >>>> + return ibmw; > >>>> +} > >>>> + > >>>> +static int rxe_dealloc_mw(struct ibv_mw *ibmw) > >>>> +{ > >>>> + int ret; > >>>> + > >>>> + ret = ibv_cmd_dealloc_mw(ibmw); > >>>> + if (ret) > >>>> + return ret; > >>>> + > >>>> + free(ibmw); > >>>> + return 0; > >>>> +} > >>>> + > >>>> +static int next_rkey(int rkey) > >>>> +{ > >>>> + return (rkey & 0xffffff00) | ((rkey + 1) & 0x000000ff); > >>>> +} > >>>> + > >>>> +static int rxe_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr_list, > >>>> + struct ibv_send_wr **bad_wr); > >>>> + > >>>> +static int rxe_bind_mw(struct ibv_qp *ibqp, struct ibv_mw *ibmw, > >>>> + struct ibv_mw_bind *mw_bind) > >>>> +{ > >>>> + int ret; > >>>> + struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info; > >>>> + struct ibv_send_wr ibwr; > >>>> + struct ibv_send_wr *bad_wr; > >>>> + > >>>> + if (!bind_info->mr && (bind_info->addr || bind_info->length)) { > >>>> + ret = EINVAL; > >>>> + goto err; > >>>> + } > >>>> + > >>>> + if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED) { > >>>> + ret = EINVAL; > >>>> + goto err; > >>>> + } > >>>> + > >>>> + if (bind_info->mr) { > >>>> + if (ibmw->pd != bind_info->mr->pd) { > >>>> + ret = EPERM; > >>>> + goto err; > >>>> + } > >>>> + } > >>>> + > >>>> + memset(&ibwr, 0, sizeof(ibwr)); > >>>> + > >>>> + ibwr.opcode = IBV_WR_BIND_MW; > >>>> + ibwr.next = NULL; > >>>> + ibwr.wr_id = mw_bind->wr_id; > >>>> + ibwr.send_flags = mw_bind->send_flags; > >>>> + ibwr.bind_mw.bind_info = mw_bind->bind_info; > >>>> + ibwr.bind_mw.mw = ibmw; > >>>> + ibwr.bind_mw.rkey = next_rkey(ibmw->rkey); > >>>> + > >>>> + ret = rxe_post_send(ibqp, &ibwr, &bad_wr); > >>>> + if (ret) > >>>> + goto err; > >>>> + > >>>> + /* user has to undo this if he gets an error wc */ > >>>> + ibmw->rkey = ibwr.bind_mw.rkey; > >>>> + > >>>> + return 0; > >>>> +err: > >>>> + errno = ret; > >>>> + return errno; > >>>> +} > >>>> + > >>>> static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length, > >>>> uint64_t 
Zhu Yanjun

>
> Bob
>
> >> Bob
> >>>> ---
> >>>>  providers/rxe/rxe.c | 157 +++++++++++++++++++++++++++++++++++++++++++-
> >>>>  1 file changed, 155 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/providers/rxe/rxe.c b/providers/rxe/rxe.c
> >>>> index a68656ae..bb39ef04 100644
> >>>> --- a/providers/rxe/rxe.c
> >>>> +++ b/providers/rxe/rxe.c
> >>>> @@ -128,6 +128,95 @@ static int rxe_dealloc_pd(struct ibv_pd *pd)
> >>>>  	return ret;
> >>>>  }
> >>>>
> >>>> +static struct ibv_mw *rxe_alloc_mw(struct ibv_pd *ibpd, enum ibv_mw_type type)
> >>>> +{
> >>>> +	int ret;
> >>>> +	struct ibv_mw *ibmw;
> >>>> +	struct ibv_alloc_mw cmd = {};
> >>>> +	struct ib_uverbs_alloc_mw_resp resp = {};
> >>>> +
> >>>> +	ibmw = calloc(1, sizeof(*ibmw));
> >>>> +	if (!ibmw)
> >>>> +		return NULL;
> >>>> +
> >>>> +	ret = ibv_cmd_alloc_mw(ibpd, type, ibmw, &cmd, sizeof(cmd),
> >>>> +			       &resp, sizeof(resp));
> >>>> +	if (ret) {
> >>>> +		free(ibmw);
> >>>> +		return NULL;
> >>>> +	}
> >>>> +
> >>>> +	return ibmw;
> >>>> +}
> >>>> +
> >>>> +static int rxe_dealloc_mw(struct ibv_mw *ibmw)
> >>>> +{
> >>>> +	int ret;
> >>>> +
> >>>> +	ret = ibv_cmd_dealloc_mw(ibmw);
> >>>> +	if (ret)
> >>>> +		return ret;
> >>>> +
> >>>> +	free(ibmw);
> >>>> +	return 0;
> >>>> +}
> >>>> +
> >>>> +static int next_rkey(int rkey)
> >>>> +{
> >>>> +	return (rkey & 0xffffff00) | ((rkey + 1) & 0x000000ff);
> >>>> +}
> >>>> +
> >>>> +static int rxe_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr_list,
> >>>> +			 struct ibv_send_wr **bad_wr);
> >>>> +
> >>>> +static int rxe_bind_mw(struct ibv_qp *ibqp, struct ibv_mw *ibmw,
> >>>> +		       struct ibv_mw_bind *mw_bind)
> >>>> +{
> >>>> +	int ret;
> >>>> +	struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info;
> >>>> +	struct ibv_send_wr ibwr;
> >>>> +	struct ibv_send_wr *bad_wr;
> >>>> +
> >>>> +	if (!bind_info->mr && (bind_info->addr || bind_info->length)) {
> >>>> +		ret = EINVAL;
> >>>> +		goto err;
> >>>> +	}
> >>>> +
> >>>> +	if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED) {
> >>>> +		ret = EINVAL;
> >>>> +		goto err;
> >>>> +	}
> >>>> +
> >>>> +	if (bind_info->mr) {
> >>>> +		if (ibmw->pd != bind_info->mr->pd) {
> >>>> +			ret = EPERM;
> >>>> +			goto err;
> >>>> +		}
> >>>> +	}
> >>>> +
> >>>> +	memset(&ibwr, 0, sizeof(ibwr));
> >>>> +
> >>>> +	ibwr.opcode = IBV_WR_BIND_MW;
> >>>> +	ibwr.next = NULL;
> >>>> +	ibwr.wr_id = mw_bind->wr_id;
> >>>> +	ibwr.send_flags = mw_bind->send_flags;
> >>>> +	ibwr.bind_mw.bind_info = mw_bind->bind_info;
> >>>> +	ibwr.bind_mw.mw = ibmw;
> >>>> +	ibwr.bind_mw.rkey = next_rkey(ibmw->rkey);
> >>>> +
> >>>> +	ret = rxe_post_send(ibqp, &ibwr, &bad_wr);
> >>>> +	if (ret)
> >>>> +		goto err;
> >>>> +
> >>>> +	/* user has to undo this if he gets an error wc */
> >>>> +	ibmw->rkey = ibwr.bind_mw.rkey;
> >>>> +
> >>>> +	return 0;
> >>>> +err:
> >>>> +	errno = ret;
> >>>> +	return errno;
> >>>> +}
> >>>> +
> >>>>  static struct ibv_mr *rxe_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
> >>>>  				 uint64_t hca_va, int access)
> >>>>  {
> >>>> @@ -715,6 +804,31 @@ static void wr_atomic_fetch_add(struct ibv_qp_ex *ibqp, uint32_t rkey,
> >>>>  	advance_qp_cur_index(qp);
> >>>>  }
> >>>>
> >>>> +static void wr_bind_mw(struct ibv_qp_ex *ibqp, struct ibv_mw *ibmw,
> >>>> +		       uint32_t rkey, const struct ibv_mw_bind_info *info)
> >>>> +{
> >>>> +	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
> >>>> +	struct rxe_send_wqe *wqe = addr_from_index(qp->sq.queue, qp->cur_index);
> >>>> +
> >>>> +	if (check_qp_queue_full(qp))
> >>>> +		return;
> >>>> +
> >>>> +	memset(wqe, 0, sizeof(*wqe));
> >>>> +
> >>>> +	wqe->wr.wr_id = ibqp->wr_id;
> >>>> +	wqe->wr.opcode = IBV_WR_BIND_MW;
> >>>> +	wqe->wr.send_flags = qp->vqp.qp_ex.wr_flags;
> >>>> +	wqe->wr.wr.mw.addr = info->addr;
> >>>> +	wqe->wr.wr.mw.length = info->length;
> >>>> +	wqe->wr.wr.mw.mr_lkey = info->mr->lkey;
> >>>> +	wqe->wr.wr.mw.mw_rkey = ibmw->rkey;
> >>>> +	wqe->wr.wr.mw.rkey = rkey;
> >>>> +	wqe->wr.wr.mw.access = info->mw_access_flags;
> >>>> +	wqe->ssn = qp->ssn++;
> >>>> +
> >>>> +	advance_qp_cur_index(qp);
> >>>> +}
> >>>> +
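For reference, the caller-side sequence that lands in this new
wr_bind_mw hook is roughly the following. This is a sketch only,
assuming the QP was created via ibv_create_qp_ex() with
IBV_QP_EX_WITH_BIND_MW in send_ops_flags and that mr was registered
with IBV_ACCESS_MW_BIND; qpx, mw, mr, buf, len and new_rkey are
placeholders.

#include <stdint.h>
#include <stddef.h>
#include <infiniband/verbs.h>

/* Post a bind through the extended QP API; this is the path that
 * calls into the provider's wr_bind_mw() shown above.
 */
static int post_bind_ex(struct ibv_qp_ex *qpx, struct ibv_mw *mw,
                        uint32_t new_rkey, struct ibv_mr *mr,
                        void *buf, size_t len)
{
        struct ibv_mw_bind_info info = {
                .mr = mr,
                .addr = (uint64_t)(uintptr_t)buf,
                .length = len,
                .mw_access_flags = IBV_ACCESS_REMOTE_WRITE,
        };

        ibv_wr_start(qpx);
        qpx->wr_id = 1;
        qpx->wr_flags = IBV_SEND_SIGNALED;
        ibv_wr_bind_mw(qpx, mw, new_rkey, &info);
        return ibv_wr_complete(qpx);	/* posts the WQE to the send queue */
}

ibv_wr_complete() returns 0 on success, at which point the bind is
queued like any other send work request.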
> >>>>  static void wr_local_inv(struct ibv_qp_ex *ibqp, uint32_t invalidate_rkey)
> >>>>  {
> >>>>  	struct rxe_qp *qp = container_of(ibqp, struct rxe_qp, vqp.qp_ex);
> >>>> @@ -1106,6 +1220,7 @@ enum {
> >>>>  		| IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP
> >>>>  		| IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD
> >>>>  		| IBV_QP_EX_WITH_LOCAL_INV
> >>>> +		| IBV_QP_EX_WITH_BIND_MW
> >>>>  		| IBV_QP_EX_WITH_SEND_WITH_INV,
> >>>>
> >>>>  	RXE_SUP_UC_QP_SEND_OPS_FLAGS =
> >>>> @@ -1113,6 +1228,7 @@ enum {
> >>>>  		| IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM
> >>>>  		| IBV_QP_EX_WITH_SEND
> >>>>  		| IBV_QP_EX_WITH_SEND_WITH_IMM
> >>>> +		| IBV_QP_EX_WITH_BIND_MW
> >>>>  		| IBV_QP_EX_WITH_SEND_WITH_INV,
> >>>>
> >>>>  	RXE_SUP_UD_QP_SEND_OPS_FLAGS =
> >>>> @@ -1162,6 +1278,9 @@ static void set_qp_send_ops(struct rxe_qp *qp, uint64_t flags)
> >>>>  	if (flags & IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD)
> >>>>  		qp->vqp.qp_ex.wr_atomic_fetch_add = wr_atomic_fetch_add;
> >>>>
> >>>> +	if (flags & IBV_QP_EX_WITH_BIND_MW)
> >>>> +		qp->vqp.qp_ex.wr_bind_mw = wr_bind_mw;
> >>>> +
> >>>>  	if (flags & IBV_QP_EX_WITH_LOCAL_INV)
> >>>>  		qp->vqp.qp_ex.wr_local_inv = wr_local_inv;
> >>>>
> >>>> @@ -1275,9 +1394,10 @@ static int rxe_destroy_qp(struct ibv_qp *ibqp)
> >>>>  }
> >>>>
> >>>>  /* basic sanity checks for send work request */
> >>>> -static int validate_send_wr(struct rxe_wq *sq, struct ibv_send_wr *ibwr,
> >>>> +static int validate_send_wr(struct rxe_qp *qp, struct ibv_send_wr *ibwr,
> >>>>  			    unsigned int length)
> >>>>  {
> >>>> +	struct rxe_wq *sq = &qp->sq;
> >>>>  	enum ibv_wr_opcode opcode = ibwr->opcode;
> >>>>
> >>>>  	if (ibwr->num_sge > sq->max_sge)
> >>>> @@ -1291,11 +1411,26 @@ static int validate_send_wr(struct rxe_wq *sq, struct ibv_send_wr *ibwr,
> >>>>  	if ((ibwr->send_flags & IBV_SEND_INLINE) && (length > sq->max_inline))
> >>>>  		return -EINVAL;
> >>>>
> >>>> +	if (ibwr->opcode == IBV_WR_BIND_MW) {
> >>>> +		if (length)
> >>>> +			return -EINVAL;
> >>>> +		if (ibwr->num_sge)
> >>>> +			return -EINVAL;
> >>>> +		if (ibwr->imm_data)
> >>>> +			return -EINVAL;
> >>>> +		if ((qp_type(qp) != IBV_QPT_RC) &&
> >>>> +		    (qp_type(qp) != IBV_QPT_UC))
> >>>> +			return -EINVAL;
> >>>> +	}
> >>>> +
> >>>>  	return 0;
> >>>>  }
> >>>>
> >>>>  static void convert_send_wr(struct rxe_send_wr *kwr, struct ibv_send_wr *uwr)
> >>>>  {
> >>>> +	struct ibv_mw *ibmw;
> >>>> +	struct ibv_mr *ibmr;
> >>>> +
> >>>>  	memset(kwr, 0, sizeof(*kwr));
> >>>>
> >>>>  	kwr->wr_id = uwr->wr_id;
> >>>> @@ -1326,6 +1461,18 @@ static void convert_send_wr(struct rxe_send_wr *kwr, struct ibv_send_wr *uwr)
> >>>>  		kwr->wr.atomic.rkey = uwr->wr.atomic.rkey;
> >>>>  		break;
> >>>>
> >>>> +	case IBV_WR_BIND_MW:
> >>>> +		ibmr = uwr->bind_mw.bind_info.mr;
> >>>> +		ibmw = uwr->bind_mw.mw;
> >>>> +
> >>>> +		kwr->wr.mw.addr = uwr->bind_mw.bind_info.addr;
> >>>> +		kwr->wr.mw.length = uwr->bind_mw.bind_info.length;
> >>>> +		kwr->wr.mw.mr_lkey = ibmr->lkey;
> >>>> +		kwr->wr.mw.mw_rkey = ibmw->rkey;
> >>>> +		kwr->wr.mw.rkey = uwr->bind_mw.rkey;
> >>>> +		kwr->wr.mw.access = uwr->bind_mw.bind_info.mw_access_flags;
> >>>> +		break;
> >>>> +
> >>>>  	default:
> >>>>  		break;
> >>>>  	}
> >>>> @@ -1348,6 +1495,8 @@ static int init_send_wqe(struct rxe_qp *qp, struct rxe_wq *sq,
> >>>>  	if (ibwr->send_flags & IBV_SEND_INLINE) {
> >>>>  		uint8_t *inline_data = wqe->dma.inline_data;
> >>>>
> >>>> +		wqe->dma.resid = 0;
> >>>> +
> >>>>  		for (i = 0; i < num_sge; i++) {
> >>>>  			memcpy(inline_data,
> >>>>  			       (uint8_t *)(long)ibwr->sg_list[i].addr,
> >>>> @@ -1363,6 +1512,7 @@ static int init_send_wqe(struct rxe_qp *qp, struct rxe_wq *sq,
> >>>>  		wqe->iova = ibwr->wr.atomic.remote_addr;
> >>>>  	else
> >>>>  		wqe->iova = ibwr->wr.rdma.remote_addr;
> >>>> +
> >>>>  	wqe->dma.length = length;
> >>>>  	wqe->dma.resid = length;
> >>>>  	wqe->dma.num_sge = num_sge;
> >>>> @@ -1385,7 +1535,7 @@ static int post_one_send(struct rxe_qp *qp, struct rxe_wq *sq,
> >>>>  	for (i = 0; i < ibwr->num_sge; i++)
> >>>>  		length += ibwr->sg_list[i].length;
> >>>>
> >>>> -	err = validate_send_wr(sq, ibwr, length);
> >>>> +	err = validate_send_wr(qp, ibwr, length);
> >>>>  	if (err) {
> >>>>  		printf("validate send failed\n");
> >>>>  		return err;
> >>>> @@ -1579,6 +1729,9 @@ static const struct verbs_context_ops rxe_ctx_ops = {
> >>>>  	.dealloc_pd = rxe_dealloc_pd,
> >>>>  	.reg_mr = rxe_reg_mr,
> >>>>  	.dereg_mr = rxe_dereg_mr,
> >>>> +	.alloc_mw = rxe_alloc_mw,
> >>>> +	.dealloc_mw = rxe_dealloc_mw,
> >>>> +	.bind_mw = rxe_bind_mw,
> >>>>  	.create_cq = rxe_create_cq,
> >>>>  	.create_cq_ex = rxe_create_cq_ex,
> >>>>  	.poll_cq = rxe_poll_cq,
> >>>> --
> >>>> 2.30.2
> >>>>
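The same bind can also be posted through the traditional QP API, which
is what the convert_send_wr() hunk above translates into the kernel ABI
format. A minimal sketch, with qp, mw, mr, buf and len assumed to
already exist:

#include <stdint.h>
#include <string.h>
#include <infiniband/verbs.h>

/* Post IBV_WR_BIND_MW through the traditional post_send path;
 * convert_send_wr() copies these fields into the kernel wire format.
 */
static int post_bind_wr(struct ibv_qp *qp, struct ibv_mw *mw,
                        struct ibv_mr *mr, void *buf, size_t len)
{
        struct ibv_send_wr wr, *bad_wr;

        memset(&wr, 0, sizeof(wr));
        wr.wr_id = 1;
        wr.opcode = IBV_WR_BIND_MW;
        wr.send_flags = IBV_SEND_SIGNALED;
        wr.bind_mw.mw = mw;
        /* bump the low byte of the rkey, as rxe_bind_mw() does */
        wr.bind_mw.rkey = (mw->rkey & 0xffffff00) | ((mw->rkey + 1) & 0xff);
        wr.bind_mw.bind_info.mr = mr;
        wr.bind_mw.bind_info.addr = (uint64_t)(uintptr_t)buf;
        wr.bind_mw.bind_info.length = len;
        wr.bind_mw.bind_info.mw_access_flags = IBV_ACCESS_REMOTE_WRITE;

        /* num_sge must stay 0 for a bind WR (see validate_send_wr()) */
        return ibv_post_send(qp, &wr, &bad_wr);
}

On a successful completion the window is bound and the posted rkey
becomes valid for remote access; as the comment in rxe_bind_mw() notes,
an error completion means the caller has to roll the rkey back.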