> -----Original Message----- > From: Cheng Xu <chengyou@xxxxxxxxxxxxxxxxx> > Sent: Monday, December 27, 2021 1:29 PM > To: Devesh Sharma <devesh.s.sharma@xxxxxxxxxx>; leon@xxxxxxxxxx > Cc: dledford@xxxxxxxxxx; jgg@xxxxxxxxxxxx; linux-rdma@xxxxxxxxxxxxxxx; > KaiShen@xxxxxxxxxxxxxxxxx > Subject: [External] : Re: [PATCH rdma-core 2/5] RDMA-CORE/erdma: Add > userspace verbs implementation > > > > On 12/27/21 2:29 PM, Devesh Sharma wrote: > > > > > > <...> > > >> + ++page->used; > >> + > >> + for (i = 0; !page->free[i]; ++i) > >> + ;/* nothing */ > > Why? > > page->free[i] is a 64bits bitmap, all zeros means the corespoding > entries are all used, then we need to traverse next. If stopped, then we get a > valid entry to use. The loop could be a candidate for compiler optimization? > > >> + > >> + j = ffsl(page->free[i]) - 1; > >> + page->free[i] &= ~(1UL << j); > >> + > >> + db_records = page->page_buf + (i * bits_perlong + j) * > >> +ERDMA_DBRECORDS_SIZE; > >> + > >> +out: > >> + pthread_mutex_unlock(&ctx->dbrecord_pages_mutex); > >> + > >> + return db_records; > >> +} > >> + > >> +void erdma_dealloc_dbrecords(struct erdma_context *ctx, uint64_t > >> +*dbrecords) { > >> + struct erdma_dbrecord_page *page; > >> + int page_mask = ~(ctx->page_size - 1); > >> + int idx; > >> + > >> + pthread_mutex_lock(&ctx->dbrecord_pages_mutex); > >> + for (page = ctx->dbrecord_pages; page; page = page->next) > >> + if (((uintptr_t)dbrecords & page_mask) == (uintptr_t)page- > >>> page_buf) > >> + break; > >> + > >> + if (!page) > >> + goto out; > >> + > >> + idx = ((void *)dbrecords - page->page_buf) / > >> ERDMA_DBRECORDS_SIZE; > >> + page->free[idx / (8 * sizeof(unsigned long))] |= > >> + 1UL << (idx % (8 * sizeof(unsigned long))); > >> + > >> + if (!--page->used) { > >> + if (page->prev) > >> + page->prev->next = page->next; > >> + else > >> + ctx->dbrecord_pages = page->next; > >> + if (page->next) > >> + page->next->prev = page->prev; > >> + > >> + free(page->page_buf); > >> + free(page); > >> + } > >> + > >> +out: > >> + pthread_mutex_unlock(&ctx->dbrecord_pages_mutex); > >> +} > > <...> > > >> +static void __erdma_alloc_dbs(struct erdma_qp *qp, struct > >> +erdma_context > >> +*ctx) { > >> + uint32_t qpn = qp->id; > >> + > >> + if (ctx->sdb_type == ERDMA_SDB_PAGE) { > >> + /* qpn[4:0] as the index in this db page. */ > >> + qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE; > >> + } else if (ctx->sdb_type == ERDMA_SDB_ENTRY) { > >> + /* for type 'ERDMA_SDB_ENTRY', each uctx has 2 dwqe, > >> totally takes 256Bytes. */ > >> + qp->sq.db = ctx->sdb + ctx->sdb_offset * 256; > > Generally we use macros to define hard-coded integers. E.g 256 should use > a macro. > > OK, will fix. > > >> + } else { > >> + /* qpn[4:0] as the index in this db page. */ > >> + qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE; > >> + } > >> + > >> + /* qpn[6:0] as the index in this rq db page. 
> >> +static void __erdma_alloc_dbs(struct erdma_qp *qp, struct erdma_context *ctx)
> >> +{
> >> +        uint32_t qpn = qp->id;
> >> +
> >> +        if (ctx->sdb_type == ERDMA_SDB_PAGE) {
> >> +                /* qpn[4:0] as the index in this db page. */
> >> +                qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE;
> >> +        } else if (ctx->sdb_type == ERDMA_SDB_ENTRY) {
> >> +                /* for type 'ERDMA_SDB_ENTRY', each uctx has 2 dwqe, totally takes 256Bytes. */
> >> +                qp->sq.db = ctx->sdb + ctx->sdb_offset * 256;
> > Generally we use macros to define hard-coded integers, e.g. 256 should use a macro.
>
> OK, will fix.
>
> >> +        } else {
> >> +                /* qpn[4:0] as the index in this db page. */
> >> +                qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE;
> >> +        }
> >> +
> >> +        /* qpn[6:0] as the index in this rq db page. */
> >> +        qp->rq.db = ctx->rdb + (qpn & 127) * ERDMA_RQDB_SPACE_SIZE;
> >> +}
> >> +
> >> +struct ibv_qp *erdma_create_qp(struct ibv_pd *pd,
> >> +                               struct ibv_qp_init_attr *attr)
> >> +{
> >> +        struct erdma_cmd_create_qp cmd = {};
> >> +        struct erdma_cmd_create_qp_resp resp = {};
> >> +        struct erdma_qp *qp;
> >> +        struct ibv_context *base_ctx = pd->context;
> >> +        struct erdma_context *ctx = to_ectx(base_ctx);
> >> +        uint64_t *db_records = NULL;
> >> +        int rv, tbl_idx, tbl_off;
> >> +        int sq_size = 0, rq_size = 0, total_bufsize = 0;
> >> +
> >> +        memset(&cmd, 0, sizeof(cmd));
> >> +        memset(&resp, 0, sizeof(resp));
> > No need for memset; the declarations above already zero-initialize.
>
> OK, will fix.
>
> >> +
> >> +        qp = calloc(1, sizeof(*qp));
> >> +        if (!qp)
> >> +                return NULL;
> >> +
> >> +        sq_size = roundup_pow_of_two(attr->cap.max_send_wr * MAX_WQEBB_PER_SQE) << SQEBB_SHIFT;
> >> +        sq_size = align(sq_size, ctx->page_size);
> >> +        rq_size = align(roundup_pow_of_two(attr->cap.max_recv_wr) << RQE_SHIFT, ctx->page_size);
> >> +        total_bufsize = sq_size + rq_size;
> >> +        rv = posix_memalign(&qp->qbuf, ctx->page_size, total_bufsize);
> >> +        if (rv || !qp->qbuf) {
> >> +                errno = ENOMEM;
> >> +                goto error_alloc;
> >> +        }
>
> <...>
>
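A side note on the memset comment above: the "= {}" initializer (a GCC/Clang extension, standardized in C23) already zero-initializes every member, so the explicit memset() calls repeat work. A tiny illustration with a made-up struct:

#include <stdint.h>
#include <string.h>

struct create_cmd {                     /* made-up stand-in for the real command struct */
        uint64_t db_record_va;
        uint32_t qbuf_va_len;
        uint32_t rsvd;
};

void build_cmd_example(void)
{
        struct create_cmd cmd = {};     /* every member is already zero here */

        memset(&cmd, 0, sizeof(cmd));   /* redundant: repeats what the initializer did */

        cmd.qbuf_va_len = 4096;
        (void)cmd;
}
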
> >> +
> >> +int erdma_destroy_qp(struct ibv_qp *base_qp)
> >> +{
> >> +        struct erdma_qp *qp = to_eqp(base_qp);
> >> +        struct ibv_context *base_ctx = base_qp->pd->context;
> >> +        struct erdma_context *ctx = to_ectx(base_ctx);
> >> +        int rv, tbl_idx, tbl_off;
> >> +
> >> +        pthread_spin_lock(&qp->sq_lock);
> >> +        pthread_spin_lock(&qp->rq_lock);
> > Why hold these?
>
> Here we are destroying the QP resources, such as the queue buffers, and
> we want to avoid a race condition with post_send and post_recv.

The application should make sure no one is accessing the QP when destroy-QP is called. You can probably go easy on these locks.

> >> +
> >> +        pthread_mutex_lock(&ctx->qp_table_mutex);
> >> +        tbl_idx = qp->id >> ERDMA_QP_TABLE_SHIFT;
> >> +        tbl_off = qp->id & ERDMA_QP_TABLE_MASK;
> >> +
> >> +        ctx->qp_table[tbl_idx].table[tbl_off] = NULL;
> >> +        ctx->qp_table[tbl_idx].refcnt--;
> >> +
> >> +        if (ctx->qp_table[tbl_idx].refcnt == 0) {
> >> +                free(ctx->qp_table[tbl_idx].table);
> >> +                ctx->qp_table[tbl_idx].table = NULL;
> >> +        }
> >> +
> >> +        pthread_mutex_unlock(&ctx->qp_table_mutex);
> >> +
> >> +        rv = ibv_cmd_destroy_qp(base_qp);
> >> +        if (rv) {
> >> +                pthread_spin_unlock(&qp->rq_lock);
> >> +                pthread_spin_unlock(&qp->sq_lock);
> >> +                return rv;
> >> +        }
> >> +        pthread_spin_destroy(&qp->rq_lock);
> >> +        pthread_spin_destroy(&qp->sq_lock);
> >> +
> >> +        if (qp->db_records)
> >> +                erdma_dealloc_dbrecords(ctx, qp->db_records);
> >> +
> >> +        if (qp->qbuf)
> >> +                free(qp->qbuf);
> >> +
> >> +        free(qp);
> >> +
> >> +        return 0;
> >> +}
> >> +
>
> <...>
>
> >> +
> >> +int erdma_post_send(struct ibv_qp *base_qp, struct ibv_send_wr *wr,
> >> +                    struct ibv_send_wr **bad_wr)
> >> +{
> >> +        struct erdma_qp *qp = to_eqp(base_qp);
> >> +        uint16_t sq_pi;
> >> +        int new_sqe = 0, rv = 0;
> >> +
> >> +        *bad_wr = NULL;
> >> +
> >> +        if (base_qp->state == IBV_QPS_ERR) {
> > Post_send is allowed in the Error state, so the check is redundant.
>
> Is this specified somewhere? We didn't find such a description in the IB
> specification. ERDMA uses iWARP, and its specification (link: [1]) says
> that one of two actions should be taken when a WR is posted in the ERROR
> state: "post the WQE and then flush it", or "return an immediate error".
> After the QP is modified to ERROR, our hardware flushes all the WQEs, so
> for newly posted WRs we return an error immediately.
> Also, I reviewed other providers' implementations; ocrdma/efa/bnxt_re
> also don't allow post_send in the ERROR state. Could this differ a
> little between HCAs?

Well, it indeed can. You can omit it for now.

> Thanks,
> Cheng Xu
>
> [1] https://datatracker.ietf.org/doc/html/draft-hilland-rddp-verbs-00#section-6.2.4
>
> >> +                *bad_wr = wr;
> >> +                return -EIO;
> >> +        }
> >> +
> >> +        pthread_spin_lock(&qp->sq_lock);
> >> +
> >> +        sq_pi = qp->sq.pi;
> >> +
> >> +        while (wr) {
> >> +                if ((uint16_t)(sq_pi - qp->sq.ci) >= qp->sq.depth) {
> >> +                        rv = -ENOMEM;
> >> +                        *bad_wr = wr;
> >> +                        break;
> >> +                }
> >> +
>
> <...>
>
> >> --
> >> 2.27.0
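Finally, a short worked example of the queue-full test quoted in erdma_post_send() above: with free-running 16-bit producer/consumer indices, the truncated difference (uint16_t)(sq_pi - ci) counts the WQEBBs in flight correctly across wraparound, as long as the queue depth never exceeds 65536. The values below are made up:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint16_t pi = 3;        /* producer index, already wrapped past 65535 */
        uint16_t ci = 65534;    /* consumer index, not yet wrapped */
        uint16_t depth = 256;   /* hypothetical SQ depth */

        /* 3 - 65534 = -65531, which truncates to 5 in uint16_t: 5 WQEBBs in flight. */
        uint16_t in_flight = (uint16_t)(pi - ci);

        printf("in flight: %u, queue full: %s\n",
               (unsigned int)in_flight, in_flight >= depth ? "yes" : "no");
        return 0;
}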