RE: [External] : Re: [PATCH rdma-core 2/5] RDMA-CORE/erdma: Add userspace verbs implementation

> -----Original Message-----
> From: Cheng Xu <chengyou@xxxxxxxxxxxxxxxxx>
> Sent: Monday, December 27, 2021 1:29 PM
> To: Devesh Sharma <devesh.s.sharma@xxxxxxxxxx>; leon@xxxxxxxxxx
> Cc: dledford@xxxxxxxxxx; jgg@xxxxxxxxxxxx; linux-rdma@xxxxxxxxxxxxxxx;
> KaiShen@xxxxxxxxxxxxxxxxx
> Subject: [External] : Re: [PATCH rdma-core 2/5] RDMA-CORE/erdma: Add
> userspace verbs implementation
> 
> 
> 
> On 12/27/21 2:29 PM, Devesh Sharma wrote:
> >
> >
> 
> <...>
> 
> >> +	++page->used;
> >> +
> >> +	for (i = 0; !page->free[i]; ++i)
> >> +		;/* nothing */
> > Why?
> 
> page->free[i] is a 64-bit bitmap; all zeros means the corresponding
> entries are all in use, so we move on to the next word. When the loop
> stops, we have a word with a free entry to use.
Could this loop be a candidate for compiler optimization?
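Regardless of what the compiler does with the empty body, the walk has no
upper bound: if the bitmap ever ends up all-zero (e.g. if the 'used'
accounting gets out of sync), it runs off the end of free[]. A bounded
variant may be safer. Rough, untested sketch below; the capacity macro and
names are placeholders I made up, not from the patch:

#define _DEFAULT_SOURCE
#include <string.h>		/* ffsl() */
#include <stdio.h>

#define SLOTS_PER_PAGE	128	/* placeholder capacity, not the real value */
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define NR_WORDS	(SLOTS_PER_PAGE / BITS_PER_LONG)

/*
 * Claim the first free slot in a multi-word bitmap (set bit == free).
 * Returns the slot index, or -1 if every word is zero, so the loop can
 * never walk past the end of the array.
 */
static int claim_free_slot(unsigned long *free_map, int nwords)
{
	int i, j;

	for (i = 0; i < nwords; i++) {
		if (!free_map[i])
			continue;		/* every slot in this word is used */
		j = ffsl(free_map[i]) - 1;	/* lowest set bit = first free slot */
		free_map[i] &= ~(1UL << j);	/* mark it as used */
		return i * BITS_PER_LONG + j;
	}
	return -1;				/* page unexpectedly full */
}

int main(void)
{
	unsigned long free_map[NR_WORDS] = { 0, 1UL << 2 };	/* word 0 full */

	printf("claimed slot %d\n", claim_free_slot(free_map, NR_WORDS));
	return 0;
}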
> 
> >> +
> >> +	j = ffsl(page->free[i]) - 1;
> >> +	page->free[i] &= ~(1UL << j);
> >> +
> >> +	db_records = page->page_buf + (i * bits_perlong + j) * ERDMA_DBRECORDS_SIZE;
> >> +
> >> +out:
> >> +	pthread_mutex_unlock(&ctx->dbrecord_pages_mutex);
> >> +
> >> +	return db_records;
> >> +}
> >> +
> >> +void erdma_dealloc_dbrecords(struct erdma_context *ctx, uint64_t *dbrecords)
> >> +{
> >> +	struct erdma_dbrecord_page *page;
> >> +	int page_mask = ~(ctx->page_size - 1);
> >> +	int idx;
> >> +
> >> +	pthread_mutex_lock(&ctx->dbrecord_pages_mutex);
> >> +	for (page = ctx->dbrecord_pages; page; page = page->next)
> >> +		if (((uintptr_t)dbrecords & page_mask) == (uintptr_t)page->page_buf)
> >> +			break;
> >> +
> >> +	if (!page)
> >> +		goto out;
> >> +
> >> +	idx = ((void *)dbrecords - page->page_buf) / ERDMA_DBRECORDS_SIZE;
> >> +	page->free[idx / (8 * sizeof(unsigned long))] |=
> >> +		1UL << (idx % (8 * sizeof(unsigned long)));
> >> +
> >> +	if (!--page->used) {
> >> +		if (page->prev)
> >> +			page->prev->next = page->next;
> >> +		else
> >> +			ctx->dbrecord_pages = page->next;
> >> +		if (page->next)
> >> +			page->next->prev = page->prev;
> >> +
> >> +		free(page->page_buf);
> >> +		free(page);
> >> +	}
> >> +
> >> +out:
> >> +	pthread_mutex_unlock(&ctx->dbrecord_pages_mutex);
> >> +}
> 
> <...>
> 
> >> +static void __erdma_alloc_dbs(struct erdma_qp *qp, struct erdma_context *ctx)
> >> +{
> >> +	uint32_t qpn = qp->id;
> >> +
> >> +	if (ctx->sdb_type == ERDMA_SDB_PAGE) {
> >> +		/* qpn[4:0] as the index in this db page. */
> >> +		qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE;
> >> +	} else if (ctx->sdb_type == ERDMA_SDB_ENTRY) {
> >> +		/* for type 'ERDMA_SDB_ENTRY', each uctx has 2 dwqe, totally takes 256Bytes. */
> >> +		qp->sq.db = ctx->sdb + ctx->sdb_offset * 256;
> > Generally we use macros to define hard-coded integers, e.g. 256 should
> > use a macro.
> 
> OK, will fix.
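Thanks. Just to illustrate what I meant, something along these lines; the
macro names and values below are placeholders I made up, not taken from the
patch:

#include <stdio.h>

#define ERDMA_SDB_ENTRY_STRIDE	256U	/* 2 dwqes per uctx */
#define ERDMA_SDB_PAGE_QP_MASK	31U	/* qpn[4:0] indexes the shared SQ db page */
#define ERDMA_RDB_PAGE_QP_MASK	127U	/* qpn[6:0] indexes the shared RQ db page */

int main(void)
{
	unsigned int qpn = 70, sdb_offset = 3;

	/* e.g. ctx->sdb + ctx->sdb_offset * ERDMA_SDB_ENTRY_STRIDE instead of "* 256" */
	printf("sdb entry byte offset: %u\n", sdb_offset * ERDMA_SDB_ENTRY_STRIDE);
	printf("sq db slot in page:    %u\n", qpn & ERDMA_SDB_PAGE_QP_MASK);
	printf("rq db slot in page:    %u\n", qpn & ERDMA_RDB_PAGE_QP_MASK);
	return 0;
}

The same could arguably be done for the 31 and 127 qpn masks.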
> 
> >> +	} else {
> >> +		/* qpn[4:0] as the index in this db page. */
> >> +		qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE;
> >> +	}
> >> +
> >> +	/* qpn[6:0] as the index in this rq db page. */
> >> +	qp->rq.db = ctx->rdb + (qpn & 127) * ERDMA_RQDB_SPACE_SIZE;
> >> +}
> >> +
> >> +struct ibv_qp *erdma_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
> >> +{
> >> +	struct erdma_cmd_create_qp cmd = {};
> >> +	struct erdma_cmd_create_qp_resp resp = {};
> >> +	struct erdma_qp *qp;
> >> +	struct ibv_context *base_ctx = pd->context;
> >> +	struct erdma_context *ctx = to_ectx(base_ctx);
> >> +	uint64_t *db_records  = NULL;
> >> +	int rv, tbl_idx, tbl_off;
> >> +	int sq_size = 0, rq_size = 0, total_bufsize = 0;
> >> +
> >> +	memset(&cmd, 0, sizeof(cmd));
> >> +	memset(&resp, 0, sizeof(resp));
> > No need for the memsets; the declarations already zero-initialize these.
> 
> OK, will fix.
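Right. FWIW the "= {}" initializers already zero every member, so the
explicit memsets add nothing; a quick illustration (toy struct, not from the
patch):

#include <stdio.h>
#include <string.h>

struct cmd { unsigned int a; unsigned int b[4]; };

int main(void)
{
	struct cmd init = {};		/* zero-initialized at declaration */
	struct cmd zeroed;

	memset(&zeroed, 0, sizeof(zeroed));
	printf("same bytes: %d\n", memcmp(&init, &zeroed, sizeof(init)) == 0);
	return 0;
}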
> 
> >> +
> >> +	qp = calloc(1, sizeof(*qp));
> >> +	if (!qp)
> >> +		return NULL;
> >> +
> >> +	sq_size = roundup_pow_of_two(attr->cap.max_send_wr * MAX_WQEBB_PER_SQE) << SQEBB_SHIFT;
> >> +	sq_size = align(sq_size, ctx->page_size);
> >> +	rq_size = align(roundup_pow_of_two(attr->cap.max_recv_wr) << RQE_SHIFT, ctx->page_size);
> >> +	total_bufsize = sq_size + rq_size;
> >> +	rv = posix_memalign(&qp->qbuf, ctx->page_size, total_bufsize);
> >> +	if (rv || !qp->qbuf) {
> >> +		errno = ENOMEM;
> >> +		goto error_alloc;
> >> +	}
> 
> <...>
> 
> >> +
> >> +int erdma_destroy_qp(struct ibv_qp *base_qp)
> >> +{
> >> +	struct erdma_qp *qp = to_eqp(base_qp);
> >> +	struct ibv_context *base_ctx = base_qp->pd->context;
> >> +	struct erdma_context *ctx = to_ectx(base_ctx);
> >> +	int rv, tbl_idx, tbl_off;
> >> +
> >> +	pthread_spin_lock(&qp->sq_lock);
> >> +	pthread_spin_lock(&qp->rq_lock);
> > Why hold these?
> 
> Here we are destroying the QP resources, such as the queue buffers, and
> we want to avoid racing with post_send and post_recv.
The application should make sure no one is accessing the QP when destroy-qp is called, so you can probably go easy on these locks.
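Also, unless I'm misreading, the success path goes on to call
pthread_spin_destroy() while both locks are still held, which POSIX leaves
undefined; dropping the lock/unlock pair from destroy avoids that as well.
Toy illustration of the teardown ordering only (not erdma code):

#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_spinlock_t sq_lock, rq_lock;

	pthread_spin_init(&sq_lock, PTHREAD_PROCESS_PRIVATE);
	pthread_spin_init(&rq_lock, PTHREAD_PROCESS_PRIVATE);

	/* ... remove the QP from the table (under qp_table_mutex), call
	 * ibv_cmd_destroy_qp(), free the buffers - no sq/rq locks needed
	 * if the application guarantees nothing else is posting ...
	 */

	pthread_spin_destroy(&rq_lock);	/* destroy only unlocked locks */
	pthread_spin_destroy(&sq_lock);
	printf("teardown done\n");
	return 0;
}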
> 
> >> +
> >> +	pthread_mutex_lock(&ctx->qp_table_mutex);
> >> +	tbl_idx = qp->id >> ERDMA_QP_TABLE_SHIFT;
> >> +	tbl_off = qp->id & ERDMA_QP_TABLE_MASK;
> >> +
> >> +	ctx->qp_table[tbl_idx].table[tbl_off] = NULL;
> >> +	ctx->qp_table[tbl_idx].refcnt--;
> >> +
> >> +	if (ctx->qp_table[tbl_idx].refcnt == 0) {
> >> +		free(ctx->qp_table[tbl_idx].table);
> >> +		ctx->qp_table[tbl_idx].table = NULL;
> >> +	}
> >> +
> >> +	pthread_mutex_unlock(&ctx->qp_table_mutex);
> >> +
> >> +	rv = ibv_cmd_destroy_qp(base_qp);
> >> +	if (rv) {
> >> +		pthread_spin_unlock(&qp->rq_lock);
> >> +		pthread_spin_unlock(&qp->sq_lock);
> >> +		return rv;
> >> +	}
> >> +	pthread_spin_destroy(&qp->rq_lock);
> >> +	pthread_spin_destroy(&qp->sq_lock);
> >> +
> >> +	if (qp->db_records)
> >> +		erdma_dealloc_dbrecords(ctx, qp->db_records);
> >> +
> >> +	if (qp->qbuf)
> >> +		free(qp->qbuf);
> >> +
> >> +	free(qp);
> >> +
> >> +	return 0;
> >> +}
> >> +
> 
> <...>
> 
> >> +
> >> +int erdma_post_send(struct ibv_qp *base_qp, struct ibv_send_wr *wr,
> >> +		    struct ibv_send_wr **bad_wr)
> >> +{
> >> +	struct erdma_qp *qp = to_eqp(base_qp);
> >> +	uint16_t sq_pi;
> >> +	int new_sqe = 0, rv = 0;
> >> +
> >> +	*bad_wr = NULL;
> >> +
> >> +	if (base_qp->state == IBV_QPS_ERR) {
> > Post_send is allowed in Error state. Thus the check is redundant.
> 
> Is this specified anywhere? We didn't find such a description in the IB
> specification.
> ERDMA uses iWARP, and its specification (link: [1]) says one of two actions
> should be taken when a WR is posted in the ERROR state: "post the WQE and
> then flush it" or "return an immediate error". After the QP is modified to
> ERR, our hardware flushes all the WQEs, so for newly posted WRs we return
> an error immediately.
> Also, I reviewed other providers' implementations; ocrdma/efa/bnxt_re don't
> allow post_send in the ERROR state either. Could this differ a little
> depending on the HCA?
Well, it can indeed. You can omit it for now.
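Either way, if the check stays, it might be worth capturing the rationale in
a comment so the deviation from "post then flush" is documented at the call
site; just a suggestion, reusing the patch's own code:

	/*
	 * iWARP verbs (draft-hilland-rddp-verbs-00, section 6.2.4) allows
	 * either "post the WQE then flush it" or an immediate error for WRs
	 * posted in the ERROR state.  The hardware already flushes all
	 * outstanding WQEs on modify-to-ERR, so new WRs are rejected here.
	 */
	if (base_qp->state == IBV_QPS_ERR) {
		*bad_wr = wr;
		return -EIO;
	}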
> 
> Thanks,
> Cheng Xu
> 
> [1] https://datatracker.ietf.org/doc/html/draft-hilland-rddp-verbs-00#section-6.2.4
> 
> 
> 
> >> +		*bad_wr = wr;
> >> +		return -EIO;
> >> +	}
> >> +
> >> +	pthread_spin_lock(&qp->sq_lock);
> >> +
> >> +	sq_pi = qp->sq.pi;
> >> +
> >> +	while (wr) {
> >> +		if ((uint16_t)(sq_pi - qp->sq.ci) >= qp->sq.depth) {
> >> +			rv = -ENOMEM;
> >> +			*bad_wr = wr;
> >> +			break;
> >> +		}
> >> +
> 
> <...>
> 
> >> --
> >> 2.27.0



