> -----Original Message-----
> From: Cheng Xu <chengyou@xxxxxxxxxxxxxxxxx>
> Sent: Friday, December 24, 2021 12:25 PM
> To: leon@xxxxxxxxxx
> Cc: dledford@xxxxxxxxxx; jgg@xxxxxxxxxxxx; linux-rdma@xxxxxxxxxxxxxxx; KaiShen@xxxxxxxxxxxxxxxxx; chengyou@xxxxxxxxxxxxxxxxx
> Subject: [PATCH rdma-core 2/5] RDMA-CORE/erdma: Add userspace verbs implementation
>
> Implementation of the erdma 'struct verbs_context_ops' interface.
> Because doorbells may be dropped by hardware in some situations, such
> as hardware hot-upgrade, the driver keeps the latest doorbell value of
> each QP and CQ. We introduce doorbell records to store the latest
> doorbell values; the allocation mechanism comes from dbrec.c of mlx5.
>
> Signed-off-by: Cheng Xu <chengyou@xxxxxxxxxxxxxxxxx>
> ---
>  providers/erdma/erdma_db.c    | 110 ++++
>  providers/erdma/erdma_verbs.c | 934 ++++++++++++++++++++++++++++++++++
>  2 files changed, 1044 insertions(+)
>  create mode 100644 providers/erdma/erdma_db.c
>  create mode 100644 providers/erdma/erdma_verbs.c
>
> diff --git a/providers/erdma/erdma_db.c b/providers/erdma/erdma_db.c
> new file mode 100644
> index 00000000..83db76d1
> --- /dev/null
> +++ b/providers/erdma/erdma_db.c
> @@ -0,0 +1,110 @@
> +// SPDX-License-Identifier: GPL-2.0 or OpenIB.org BSD (MIT) See COPYING file
> +
> +// Authors: Cheng Xu <chengyou@xxxxxxxxxxxxxxxxx>
> +// Copyright (c) 2020-2021, Alibaba Group.
> +
> +// Copyright (c) 2012 Mellanox Technologies, Inc. All rights reserved.
> +
> +#include <inttypes.h>
> +#include <stdlib.h>
> +#include <util/util.h>
> +
> +#include "erdma.h"
> +#include "erdma_db.h"
> +
> +#define ERDMA_DBRECORDS_SIZE 16
> +
> +struct erdma_dbrecord_page {
> +	struct erdma_dbrecord_page *prev, *next;
> +	void *page_buf;
> +	int cnt;
> +	int used;
> +	unsigned long free[0];
> +};
> +
> +uint64_t *erdma_alloc_dbrecords(struct erdma_context *ctx)
> +{
> +	struct erdma_dbrecord_page *page = NULL;
> +	uint64_t *db_records = NULL;
> +	int dbrecords_per_page, nlongs = 0, bits_perlong = (8 * sizeof(unsigned long));
> +	int i, j, rv;
> +
> +	pthread_mutex_lock(&ctx->dbrecord_pages_mutex);
> +
> +	for (page = ctx->dbrecord_pages; page; page = page->next)
> +		if (page->used < page->cnt)
> +			goto found;
> +
> +	dbrecords_per_page = ctx->page_size / ERDMA_DBRECORDS_SIZE;
> +	nlongs = align(dbrecords_per_page, bits_perlong) / bits_perlong;
> +	page = malloc(sizeof(*page) + nlongs * sizeof(unsigned long));
> +	if (!page)
> +		goto out;
> +
> +	rv = posix_memalign(&page->page_buf, ctx->page_size, ctx->page_size);
> +	if (rv) {
> +		free(page);
> +		goto out;
> +	}
> +
> +	page->cnt = dbrecords_per_page;
> +	page->used = 0;
> +	for (i = 0; i < nlongs; i++)
> +		page->free[i] = ~0UL;
> +
> +	page->prev = NULL;
> +	page->next = ctx->dbrecord_pages;
> +	ctx->dbrecord_pages = page;
> +	if (page->next)
> +		page->next->prev = page;
> +
> +found:
> +	++page->used;
> +
> +	for (i = 0; !page->free[i]; ++i)
> +		; /* nothing */

Why the unbounded empty-body loop? Please add a comment; presumably
page->used < page->cnt guarantees that a free bit exists.
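For example, something like this would answer it in-place (a sketch
only; note a simple bound on nlongs does not work here, since nlongs is
still 0 when an existing page is taken from the list):

	/*
	 * A free slot must exist: we only get here when
	 * page->used < page->cnt, so the bitmap scan terminates
	 * within this page.
	 */
	for (i = 0; !page->free[i]; ++i)
		; /* nothing */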
> +
> +	j = ffsl(page->free[i]) - 1;
> +	page->free[i] &= ~(1UL << j);
> +
> +	db_records = page->page_buf + (i * bits_perlong + j) * ERDMA_DBRECORDS_SIZE;
> +
> +out:
> +	pthread_mutex_unlock(&ctx->dbrecord_pages_mutex);
> +
> +	return db_records;
> +}
> +
> +void erdma_dealloc_dbrecords(struct erdma_context *ctx, uint64_t *dbrecords)
> +{
> +	struct erdma_dbrecord_page *page;
> +	uintptr_t page_mask = ~((uintptr_t)ctx->page_size - 1);
> +	int idx;
> +
> +	pthread_mutex_lock(&ctx->dbrecord_pages_mutex);
> +	for (page = ctx->dbrecord_pages; page; page = page->next)
> +		if (((uintptr_t)dbrecords & page_mask) == (uintptr_t)page->page_buf)
> +			break;
> +
> +	if (!page)
> +		goto out;
> +
> +	idx = ((void *)dbrecords - page->page_buf) / ERDMA_DBRECORDS_SIZE;
> +	page->free[idx / (8 * sizeof(unsigned long))] |=
> +		1UL << (idx % (8 * sizeof(unsigned long)));
> +
> +	if (!--page->used) {
> +		if (page->prev)
> +			page->prev->next = page->next;
> +		else
> +			ctx->dbrecord_pages = page->next;
> +		if (page->next)
> +			page->next->prev = page->prev;
> +
> +		free(page->page_buf);
> +		free(page);
> +	}
> +
> +out:
> +	pthread_mutex_unlock(&ctx->dbrecord_pages_mutex);
> +}
> diff --git a/providers/erdma/erdma_verbs.c b/providers/erdma/erdma_verbs.c
> new file mode 100644
> index 00000000..3c1c9769
> --- /dev/null
> +++ b/providers/erdma/erdma_verbs.c
> @@ -0,0 +1,934 @@
> +// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
> +
> +// Authors: Cheng Xu <chengyou@xxxxxxxxxxxxxxxxx>
> +// Copyright (c) 2020-2021, Alibaba Group.
> +// Authors: Bernard Metzler <bmt@xxxxxxxxxxxxxx>
> +// Copyright (c) 2008-2019, IBM Corporation
> +
> +#include <ccan/minmax.h>
> +#include <endian.h>
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <sys/mman.h>
> +#include <sys/types.h>
> +#include <unistd.h>
> +#include <util/mmio.h>
> +#include <util/udma_barrier.h>
> +#include <util/util.h>
> +
> +#include "erdma.h"
> +#include "erdma_abi.h"
> +#include "erdma_db.h"
> +#include "erdma_hw.h"
> +#include "erdma_verbs.h"
> +
> +int erdma_query_device(struct ibv_context *ctx,
> +		       const struct ibv_query_device_ex_input *input,
> +		       struct ibv_device_attr_ex *attr, size_t attr_size)
> +{
> +	struct ib_uverbs_ex_query_device_resp resp;
> +	size_t resp_size = sizeof(resp);
> +	uint64_t raw_fw_ver;
> +	unsigned int major, minor, sub_minor;
> +	int rv;
> +
> +	rv = ibv_cmd_query_device_any(ctx, input, attr, attr_size, &resp, &resp_size);
> +	if (rv)
> +		return rv;
> +
> +	raw_fw_ver = resp.base.fw_ver;
> +	major = (raw_fw_ver >> 32) & 0xffff;
> +	minor = (raw_fw_ver >> 16) & 0xffff;
> +	sub_minor = raw_fw_ver & 0xffff;
> +
> +	snprintf(attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver),
> +		 "%d.%d.%d", major, minor, sub_minor);
> +
> +	return 0;
> +}
> +
> +int erdma_query_port(struct ibv_context *ctx, uint8_t port,
> +		     struct ibv_port_attr *attr)
> +{
> +	struct ibv_query_port cmd;
> +
> +	memset(&cmd, 0, sizeof(cmd));
> +
> +	return ibv_cmd_query_port(ctx, port, attr, &cmd, sizeof(cmd));
> +}
> +
> +int erdma_query_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
> +		   int attr_mask, struct ibv_qp_init_attr *init_attr)
> +{
> +	struct ibv_query_qp cmd;
> +
> +	memset(&cmd, 0, sizeof(cmd));
> +
> +	return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd, sizeof(cmd));
> +}
> +
> +struct ibv_pd *erdma_alloc_pd(struct ibv_context *ctx)
> +{
> +	struct ibv_alloc_pd cmd;
> +	struct ib_uverbs_alloc_pd_resp resp;
> +	struct ibv_pd *pd;
> +
> +	memset(&cmd, 0, sizeof(cmd));
> +
> +	pd = calloc(1, sizeof(*pd));
> +	if (!pd)
> +		return NULL;
> +
> +	if (ibv_cmd_alloc_pd(ctx, pd, &cmd, sizeof(cmd), &resp, sizeof(resp))) {
> +		free(pd);
> +		return NULL;
> +	}
> +
> +	return pd;
> +}
> +
> +int erdma_free_pd(struct ibv_pd *pd)
> +{
> +	int rv;
> +
> +	rv = ibv_cmd_dealloc_pd(pd);
> +	if (rv)
> +		return rv;
> +
> +	free(pd);
> +	return 0;
> +}
> +
> +struct ibv_mr *erdma_reg_mr(struct ibv_pd *pd, void *addr, size_t len,
> +			    uint64_t hca_va, int access)
> +{
> +	struct ibv_reg_mr cmd;
> +	struct ib_uverbs_reg_mr_resp resp;
> +	struct verbs_mr *vmr;
> +	int ret;
> +
> +	vmr = calloc(1, sizeof(*vmr));
> +	if (!vmr)
> +		return NULL;
> +
> +	ret = ibv_cmd_reg_mr(pd, addr, len, hca_va, access, vmr, &cmd, sizeof(cmd),
> +			     &resp, sizeof(resp));
> +	if (ret) {
> +		free(vmr);
> +		return NULL;
> +	}
> +
> +	return &vmr->ibv_mr;
> +}
> +
> +int erdma_dereg_mr(struct verbs_mr *vmr)
> +{
> +	int ret;
> +
> +	ret = ibv_cmd_dereg_mr(vmr);
> +	if (ret)
> +		return ret;
> +
> +	free(vmr);
> +	return 0;
> +}
> +
> +int erdma_notify_cq(struct ibv_cq *ibcq, int solicited)
> +{
> +	struct erdma_cq *cq = to_ecq(ibcq);
> +	uint64_t db_data;
> +	int ret;
> +
> +	ret = pthread_spin_lock(&cq->lock);
> +	if (ret)
> +		return ret;
> +
> +	db_data = FIELD_PREP(ERDMA_CQDB_EQN_MASK, cq->comp_vector + 1) |
> +		  FIELD_PREP(ERDMA_CQDB_CQN_MASK, cq->id) |
> +		  FIELD_PREP(ERDMA_CQDB_ARM_MASK, 1) |
> +		  FIELD_PREP(ERDMA_CQDB_SOL_MASK, solicited) |
> +		  FIELD_PREP(ERDMA_CQDB_CMDSN_MASK, cq->cmdsn) |
> +		  FIELD_PREP(ERDMA_CQDB_CI_MASK, cq->ci);
> +
> +	*(__le64 *)cq->db_record = htole64(db_data);
> +	udma_to_device_barrier();
> +	mmio_write64_le(cq->db, htole64(db_data));
> +
> +	pthread_spin_unlock(&cq->lock);
> +
> +	return ret;
> +}
> +
> +struct ibv_cq *erdma_create_cq(struct ibv_context *ctx, int num_cqe,
> +			       struct ibv_comp_channel *channel, int comp_vector)
> +{
> +	struct erdma_context *ectx;
> +	struct erdma_cmd_create_cq cmd = {};
> +	struct erdma_cmd_create_cq_resp resp = {};
> +	struct erdma_cq *cq;
> +	uint64_t *db_records = NULL;
> +	int cq_size;
> +	int rv;
> +
> +	ectx = to_ectx(ctx);
> +
> +	cq = calloc(1, sizeof(*cq));
> +	if (!cq)
> +		return NULL;
> +
> +	if (num_cqe < 64)
> +		num_cqe = 64;
> +
> +	num_cqe = roundup_pow_of_two(num_cqe);
> +	cq_size = num_cqe * sizeof(struct erdma_cqe);
> +	cq_size = ERDMA_SIZE_TO_NPAGE(cq_size) << ERDMA_PAGE_SHIFT;
> +
> +	rv = posix_memalign((void **)&cq->queue, ERDMA_PAGE_SIZE, cq_size);
> +	if (rv) {
> +		errno = rv;
> +		free(cq);
> +		return NULL;
> +	}
> +
> +	db_records = erdma_alloc_dbrecords(ectx);
> +	if (!db_records) {
> +		errno = ENOMEM;
> +		goto error_alloc;
> +	}
> +
> +	cmd.db_record_va = (__u64)db_records;
> +	cmd.qbuf_va = (uint64_t)cq->queue;
> +	cmd.qbuf_len = cq_size;
> +
> +	rv = ibv_cmd_create_cq(ctx, num_cqe, channel, comp_vector, &cq->base_cq,
> +			       &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp));
> +	if (rv) {
> +		/* Do not free cq here; error_alloc below frees it. */
> +		errno = EIO;
> +		goto error_alloc;
> +	}
> +
> +	pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE);
> +
> +	*db_records = 0;
> +	cq->db_record = db_records;
> +
> +	cq->id = resp.cq_id;
> +	cq->depth = resp.num_cqe;
> +	cq->owner = 1;
> +
> +	cq->db = ectx->cdb;
> +	cq->db_offset = (cq->id & (ERDMA_PAGE_SIZE / ERDMA_CQDB_SIZE - 1)) * ERDMA_CQDB_SIZE;
> +	cq->db += cq->db_offset;
> +
> +	cq->comp_vector = comp_vector;
> +
> +	return &cq->base_cq;
> +
> +error_alloc:
> +	if (db_records)
> +		erdma_dealloc_dbrecords(ectx, db_records);
> +
> +	if (cq->queue)
> +		free(cq->queue);
> +
> +	free(cq);
> +
> +	return NULL;
> +}
> +
> +int erdma_destroy_cq(struct ibv_cq *base_cq)
> +{
> +	struct erdma_cq *cq = to_ecq(base_cq);
> +	struct erdma_context *ctx = to_ectx(base_cq->context);
> +	int rv;
> +
> +	pthread_spin_lock(&cq->lock);
> +	rv = ibv_cmd_destroy_cq(base_cq);
> +	if (rv) {
> +		pthread_spin_unlock(&cq->lock);
> +		errno = EIO;
> +		return rv;
> +	}
> +	pthread_spin_destroy(&cq->lock);
> +
> +	if (cq->db_record)
> +		erdma_dealloc_dbrecords(ctx, cq->db_record);
> +
> +	if (cq->queue)
> +		free(cq->queue);
> +
> +	free(cq);
> +
> +	return 0;
> +}
> +
> +static void __erdma_alloc_dbs(struct erdma_qp *qp, struct erdma_context *ctx)
> +{
> +	uint32_t qpn = qp->id;
> +
> +	if (ctx->sdb_type == ERDMA_SDB_PAGE) {
> +		/* qpn[4:0] as the index in this db page. */
> +		qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE;
> +	} else if (ctx->sdb_type == ERDMA_SDB_ENTRY) {
> +		/* For type 'ERDMA_SDB_ENTRY', each uctx has 2 dwqe, taking 256 bytes in total. */
> +		qp->sq.db = ctx->sdb + ctx->sdb_offset * 256;

Generally we use macros for hard-coded integers, so the 256 here should
be a macro (see the sketch below).

> +	} else {
> +		/* qpn[4:0] as the index in this db page. */
> +		qp->sq.db = ctx->sdb + (qpn & 31) * ERDMA_SQDB_SIZE;
> +	}
> +
> +	/* qpn[6:0] as the index in this rq db page. */
> +	qp->rq.db = ctx->rdb + (qpn & 127) * ERDMA_RQDB_SPACE_SIZE;
> +}
> +
> +struct ibv_qp *erdma_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr)
> +{
> +	struct erdma_cmd_create_qp cmd = {};
> +	struct erdma_cmd_create_qp_resp resp = {};
> +	struct erdma_qp *qp;
> +	struct ibv_context *base_ctx = pd->context;
> +	struct erdma_context *ctx = to_ectx(base_ctx);
> +	uint64_t *db_records = NULL;
> +	int rv, tbl_idx, tbl_off;
> +	int sq_size = 0, rq_size = 0, total_bufsize = 0;
> +
> +	memset(&cmd, 0, sizeof(cmd));
> +	memset(&resp, 0, sizeof(resp));

No need for these memsets: the "= {}" initializers at the declarations
already zero both structs.
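On the macro point above, something along these lines, for example (the
macro name is just a suggestion):

	/* Each uctx owns 2 dwqe entries, 256 bytes in total. */
	#define ERDMA_SDB_ENTRY_STRIDE 256

	qp->sq.db = ctx->sdb + ctx->sdb_offset * ERDMA_SDB_ENTRY_STRIDE;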
> +
> +	qp = calloc(1, sizeof(*qp));
> +	if (!qp)
> +		return NULL;
> +
> +	sq_size = roundup_pow_of_two(attr->cap.max_send_wr * MAX_WQEBB_PER_SQE) << SQEBB_SHIFT;
> +	sq_size = align(sq_size, ctx->page_size);
> +	rq_size = align(roundup_pow_of_two(attr->cap.max_recv_wr) << RQE_SHIFT, ctx->page_size);
> +	total_bufsize = sq_size + rq_size;
> +	rv = posix_memalign(&qp->qbuf, ctx->page_size, total_bufsize);
> +	if (rv || !qp->qbuf) {
> +		errno = ENOMEM;
> +		goto error_alloc;
> +	}
> +
> +	db_records = erdma_alloc_dbrecords(ctx);
> +	if (!db_records) {
> +		errno = ENOMEM;
> +		goto error_alloc;
> +	}
> +
> +	cmd.db_record_va = (__u64)db_records;
> +	cmd.qbuf_va = (__u64)qp->qbuf;
> +	cmd.qbuf_len = (__u32)total_bufsize;
> +
> +	rv = ibv_cmd_create_qp(pd, &qp->base_qp, attr, &cmd.ibv_cmd,
> +			       sizeof(cmd), &resp.ibv_resp, sizeof(resp));
> +	if (rv) {
> +		errno = EIO;
> +		goto error_alloc;
> +	}
> +
> +	qp->id = resp.qp_id;
> +
> +	pthread_mutex_lock(&ctx->qp_table_mutex);
> +	tbl_idx = qp->id >> ERDMA_QP_TABLE_SHIFT;
> +	tbl_off = qp->id & ERDMA_QP_TABLE_MASK;
> +
> +	if (ctx->qp_table[tbl_idx].refcnt == 0) {
> +		ctx->qp_table[tbl_idx].table = calloc(ERDMA_PAGE_SIZE, sizeof(struct erdma_qp *));
> +		if (!ctx->qp_table[tbl_idx].table) {
> +			errno = ENOMEM;
> +			/* Drop the mutex before bailing out. */
> +			pthread_mutex_unlock(&ctx->qp_table_mutex);
> +			goto fail;
> +		}
> +	}
> +
> +	/* A QP with this id already exists. */
> +	if (ctx->qp_table[tbl_idx].table[tbl_off]) {
> +		errno = EBUSY;
> +		pthread_mutex_unlock(&ctx->qp_table_mutex);
> +		goto fail;
> +	}
> +
> +	ctx->qp_table[tbl_idx].table[tbl_off] = qp;
> +	ctx->qp_table[tbl_idx].refcnt++;
> +	pthread_mutex_unlock(&ctx->qp_table_mutex);
> +
> +	qp->sq.qbuf = qp->qbuf;
> +	qp->rq.qbuf = qp->qbuf + resp.rq_offset;
> +	qp->sq.depth = resp.num_sqe;
> +	qp->rq.depth = resp.num_rqe;
> +	qp->sq_sig_all = attr->sq_sig_all;
> +	qp->sq.size = resp.num_sqe * SQEBB_SIZE;
> +	qp->rq.size = resp.num_rqe * sizeof(struct erdma_rqe);
> +
> +	/* Doorbell allocation. */
> +	__erdma_alloc_dbs(qp, ctx);
> +
> +	pthread_spin_init(&qp->sq_lock, PTHREAD_PROCESS_PRIVATE);
> +	pthread_spin_init(&qp->rq_lock, PTHREAD_PROCESS_PRIVATE);
> +
> +	*db_records = 0;
> +	*(db_records + 1) = 0;
> +	qp->db_records = db_records;
> +	qp->sq.db_record = db_records;
> +	qp->rq.db_record = db_records + 1;
> +
> +	qp->rq.wr_tbl = calloc(qp->rq.depth, sizeof(uint64_t));
> +	if (!qp->rq.wr_tbl)
> +		goto fail;
> +
> +	qp->sq.wr_tbl = calloc(qp->sq.depth, sizeof(uint64_t));
> +	if (!qp->sq.wr_tbl)
> +		goto fail;
> +
> +	return &qp->base_qp;
> +
> +fail:
> +	if (qp->sq.wr_tbl)
> +		free(qp->sq.wr_tbl);
> +
> +	if (qp->rq.wr_tbl)
> +		free(qp->rq.wr_tbl);
> +
> +	ibv_cmd_destroy_qp(&qp->base_qp);
> +
> +error_alloc:
> +	if (db_records)
> +		erdma_dealloc_dbrecords(ctx, db_records);
> +
> +	if (qp->qbuf)
> +		free(qp->qbuf);
> +
> +	free(qp);
> +
> +	return NULL;
> +}
> +
> +int erdma_modify_qp(struct ibv_qp *base_qp, struct ibv_qp_attr *attr,
> +		    int attr_mask)
> +{
> +	struct ibv_modify_qp cmd;
> +	struct erdma_qp *qp = to_eqp(base_qp);
> +	int rv;
> +
> +	memset(&cmd, 0, sizeof(cmd));
> +
> +	pthread_spin_lock(&qp->sq_lock);
> +	pthread_spin_lock(&qp->rq_lock);
> +
> +	rv = ibv_cmd_modify_qp(base_qp, attr, attr_mask, &cmd, sizeof(cmd));
> +
> +	pthread_spin_unlock(&qp->rq_lock);
> +	pthread_spin_unlock(&qp->sq_lock);
> +
> +	return rv;
> +}
> +
> +int erdma_destroy_qp(struct ibv_qp *base_qp)
> +{
> +	struct erdma_qp *qp = to_eqp(base_qp);
> +	struct ibv_context *base_ctx = base_qp->pd->context;
> +	struct erdma_context *ctx = to_ectx(base_ctx);
> +	int rv, tbl_idx, tbl_off;
> +
> +	pthread_spin_lock(&qp->sq_lock);
> +	pthread_spin_lock(&qp->rq_lock);

Why hold these locks across the whole destroy path?
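Note also that on the success path pthread_spin_destroy() is called on
locks this thread still holds, which POSIX leaves undefined. If the
locks are needed at all to fence concurrent posters, a sketch of a
safer ordering (assuming the application does not post to a QP it is
concurrently destroying, which the verbs contract already requires):

	pthread_spin_lock(&qp->sq_lock);
	pthread_spin_lock(&qp->rq_lock);
	/* ... remove the QP from the qp_table ... */
	pthread_spin_unlock(&qp->rq_lock);
	pthread_spin_unlock(&qp->sq_lock);

	rv = ibv_cmd_destroy_qp(base_qp);
	if (rv)
		return rv;

	pthread_spin_destroy(&qp->rq_lock);
	pthread_spin_destroy(&qp->sq_lock);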
> +
> +	pthread_mutex_lock(&ctx->qp_table_mutex);
> +	tbl_idx = qp->id >> ERDMA_QP_TABLE_SHIFT;
> +	tbl_off = qp->id & ERDMA_QP_TABLE_MASK;
> +
> +	ctx->qp_table[tbl_idx].table[tbl_off] = NULL;
> +	ctx->qp_table[tbl_idx].refcnt--;
> +
> +	if (ctx->qp_table[tbl_idx].refcnt == 0) {
> +		free(ctx->qp_table[tbl_idx].table);
> +		ctx->qp_table[tbl_idx].table = NULL;
> +	}
> +
> +	pthread_mutex_unlock(&ctx->qp_table_mutex);
> +
> +	rv = ibv_cmd_destroy_qp(base_qp);
> +	if (rv) {
> +		pthread_spin_unlock(&qp->rq_lock);
> +		pthread_spin_unlock(&qp->sq_lock);
> +		return rv;
> +	}
> +	pthread_spin_destroy(&qp->rq_lock);
> +	pthread_spin_destroy(&qp->sq_lock);
> +
> +	if (qp->db_records)
> +		erdma_dealloc_dbrecords(ctx, qp->db_records);
> +
> +	if (qp->qbuf)
> +		free(qp->qbuf);
> +
> +	free(qp);
> +
> +	return 0;
> +}
> +
> +static int erdma_push_one_sqe(struct erdma_qp *qp, struct ibv_send_wr *wr,
> +			      uint16_t *sq_pi)
> +{
> +	uint16_t tmp_pi = *sq_pi;
> +	void *sqe;
> +	uint64_t sqe_hdr;
> +	struct erdma_write_sqe *write_sqe;
> +	struct erdma_send_sqe *send_sqe;
> +	struct erdma_readreq_sqe *read_sqe;
> +	uint32_t wqe_size = 0;
> +	__le32 *length_field = NULL;
> +	struct erdma_sge *sgl_base = NULL;
> +	uint32_t i, bytes = 0;
> +	uint32_t sgl_off, sgl_idx, wqebb_cnt, opcode;
> +
> +	sqe = get_sq_wqebb(qp, tmp_pi);
> +	/* Clear the first 8 bytes of the wqe hdr. */
> +	*(uint64_t *)sqe = 0;
> +
> +	qp->sq.wr_tbl[tmp_pi & (qp->sq.depth - 1)] = wr->wr_id;
> +
> +	sqe_hdr = FIELD_PREP(ERDMA_SQE_HDR_QPN_MASK, qp->id) |
> +		  FIELD_PREP(ERDMA_SQE_HDR_CE_MASK, wr->send_flags & IBV_SEND_SIGNALED ? 1 : 0) |
> +		  FIELD_PREP(ERDMA_SQE_HDR_CE_MASK, qp->sq_sig_all) |
> +		  FIELD_PREP(ERDMA_SQE_HDR_SE_MASK, wr->send_flags & IBV_SEND_SOLICITED ? 1 : 0) |
> +		  FIELD_PREP(ERDMA_SQE_HDR_FENCE_MASK, wr->send_flags & IBV_SEND_FENCE ? 1 : 0) |
> +		  FIELD_PREP(ERDMA_SQE_HDR_INLINE_MASK, wr->send_flags & IBV_SEND_INLINE ? 1 : 0);
> +
> +	switch (wr->opcode) {
> +	case IBV_WR_RDMA_WRITE:
> +	case IBV_WR_RDMA_WRITE_WITH_IMM:
> +		opcode = wr->opcode == IBV_WR_RDMA_WRITE ?
> +			 ERDMA_OP_WRITE : ERDMA_OP_WRITE_WITH_IMM;
> +		sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, opcode);
> +		write_sqe = (struct erdma_write_sqe *)sqe;
> +		write_sqe->imm_data = wr->imm_data;
> +		write_sqe->sink_stag = htole32(wr->wr.rdma.rkey);
> +		write_sqe->sink_to_low = htole32(wr->wr.rdma.remote_addr & 0xFFFFFFFF);
> +		write_sqe->sink_to_high = htole32((wr->wr.rdma.remote_addr >> 32) & 0xFFFFFFFF);
> +
> +		length_field = &write_sqe->length;
> +		sgl_base = get_sq_wqebb(qp, tmp_pi + 1);
> +		/* sgl is in the next wqebb. */
> +		sgl_off = 0;
> +		sgl_idx = tmp_pi + 1;
> +		wqe_size = sizeof(struct erdma_write_sqe);
> +
> +		break;
> +	case IBV_WR_SEND:
> +	case IBV_WR_SEND_WITH_IMM:
> +		opcode = wr->opcode == IBV_WR_SEND ? ERDMA_OP_SEND : ERDMA_OP_SEND_WITH_IMM;
> +		sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, opcode);
> +		send_sqe = (struct erdma_send_sqe *)sqe;
> +		send_sqe->imm_data = wr->imm_data;
> +
> +		length_field = &send_sqe->length;
> +		sgl_base = (void *)send_sqe;
> +		/* sgl is in the second half of the current wqebb. */
> +		sgl_off = 16;
> +		sgl_idx = tmp_pi;
> +		wqe_size = sizeof(struct erdma_send_sqe);
> +
> +		break;
> +	case IBV_WR_RDMA_READ:
> +		sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_OPCODE_MASK, ERDMA_OP_READ);
> +
> +		read_sqe = (struct erdma_readreq_sqe *)sqe;
> +
> +		read_sqe->sink_to_low = htole32(wr->sg_list->addr & 0xFFFFFFFF);
> +		read_sqe->sink_to_high = htole32((wr->sg_list->addr >> 32) & 0xFFFFFFFF);
> +		read_sqe->sink_stag = htole32(wr->sg_list->lkey);
> +		read_sqe->length = htole32(wr->sg_list->length);
> +
> +		struct erdma_sge *sgl = (struct erdma_sge *)get_sq_wqebb(qp, tmp_pi + 1);
> +
> +		sgl->laddr = htole64(wr->wr.rdma.remote_addr);
> +		sgl->length = htole32(wr->sg_list->length);
> +		sgl->lkey = htole32(wr->wr.rdma.rkey);
> +
> +		wqe_size = sizeof(struct erdma_readreq_sqe);
> +
> +		goto out;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	if (wr->send_flags & IBV_SEND_INLINE) {
> +		char *data = (char *)sgl_base;
> +		uint32_t remain_size;
> +		uint32_t copy_size;
> +		uint32_t data_off;
> +
> +		i = 0;
> +		bytes = 0;
> +
> +		/* Allow more than ERDMA_MAX_SGE, since the content is copied here. */
> +		while (i < wr->num_sge) {
> +			bytes += wr->sg_list[i].length;
> +			if (bytes > (int)ERDMA_MAX_INLINE)
> +				return -EINVAL;
> +
> +			remain_size = wr->sg_list[i].length;
> +			data_off = 0;
> +
> +			while (1) {
> +				copy_size = min(remain_size, SQEBB_SIZE - sgl_off);
> +				memcpy(data + sgl_off,
> +				       (void *)(uintptr_t)wr->sg_list[i].addr + data_off,
> +				       copy_size);
> +				remain_size -= copy_size;
> +
> +				/* Update sgl_offset. */
> +				sgl_idx += ((sgl_off + copy_size) >> SQEBB_SHIFT);
> +				sgl_off = (sgl_off + copy_size) & (SQEBB_SIZE - 1);
> +				data_off += copy_size;
> +				data = get_sq_wqebb(qp, sgl_idx);
> +
> +				if (!remain_size)
> +					break;
> +			}
> +
> +			i++;
> +		}
> +
> +		*length_field = htole32(bytes);
> +		wqe_size += bytes;
> +		sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, bytes);
> +	} else {
> +		char *sgl = (char *)sgl_base;
> +
> +		if (wr->num_sge > ERDMA_MAX_SEND_SGE)
> +			return -EINVAL;
> +
> +		i = 0;
> +		bytes = 0;
> +
> +		while (i < wr->num_sge) {
> +			bytes += wr->sg_list[i].length;
> +			memcpy(sgl + sgl_off, &wr->sg_list[i], sizeof(struct ibv_sge));
> +
> +			if (sgl_off == 0)
> +				*(uint32_t *)(sgl + 28) = qp->id;
> +
> +			sgl_idx += (sgl_off == sizeof(struct ibv_sge) ? 1 : 0);
> +			sgl = get_sq_wqebb(qp, sgl_idx);
> +			sgl_off = sizeof(struct ibv_sge) - sgl_off;
> +
> +			i++;
> +		}
> +
> +		*length_field = htole32(bytes);
> +		sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_SGL_LEN_MASK, wr->num_sge);
> +		wqe_size += wr->num_sge * sizeof(struct ibv_sge);
> +	}
> +
> +out:
> +	wqebb_cnt = SQEBB_COUNT(wqe_size);
> +	assert(wqebb_cnt <= MAX_WQEBB_PER_SQE);
> +	sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_CNT_MASK, wqebb_cnt - 1);
> +	sqe_hdr |= FIELD_PREP(ERDMA_SQE_HDR_WQEBB_INDEX_MASK, tmp_pi + wqebb_cnt);
> +
> +	*(__le64 *)sqe = htole64(sqe_hdr);
> +	*sq_pi = tmp_pi + wqebb_cnt;
> +
> +	return 0;
> +}
> +
> +int erdma_post_send(struct ibv_qp *base_qp, struct ibv_send_wr *wr,
> +		    struct ibv_send_wr **bad_wr)
> +{
> +	struct erdma_qp *qp = to_eqp(base_qp);
> +	uint16_t sq_pi;
> +	int new_sqe = 0, rv = 0;
> +
> +	*bad_wr = NULL;
> +
> +	if (base_qp->state == IBV_QPS_ERR) {

post_send is allowed in the Error state, so this check is redundant.
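I.e., per the verbs contract an application may keep posting to a QP in
IBV_QPS_ERR and reap flush completions; roughly (illustrative only):

	ret = ibv_post_send(qp, &wr, &bad_wr);	/* expected: 0, not -EIO */
	/* ... a later ibv_poll_cq() should return a completion with
	 * wc.status == IBV_WC_WR_FLUSH_ERR for this WR. */

The same applies to the IBV_QPS_ERR check in erdma_post_recv() below.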
> +		*bad_wr = wr;
> +		return -EIO;
> +	}
> +
> +	pthread_spin_lock(&qp->sq_lock);
> +
> +	sq_pi = qp->sq.pi;
> +
> +	while (wr) {
> +		if ((uint16_t)(sq_pi - qp->sq.ci) >= qp->sq.depth) {
> +			rv = -ENOMEM;
> +			*bad_wr = wr;
> +			break;
> +		}
> +
> +		rv = erdma_push_one_sqe(qp, wr, &sq_pi);
> +		if (rv) {
> +			*bad_wr = wr;
> +			break;
> +		}
> +
> +		new_sqe++;
> +		wr = wr->next;
> +	}
> +
> +	if (new_sqe) {
> +		qp->sq.pi = sq_pi;
> +		__kick_sq_db(qp, sq_pi); /* normal doorbell. */
> +	}
> +
> +	pthread_spin_unlock(&qp->sq_lock);
> +
> +	return rv;
> +}
> +
> +static int push_recv_wqe(struct erdma_qp *qp, struct ibv_recv_wr *wr)
> +{
> +	uint16_t rq_pi = qp->rq.pi;
> +	uint16_t idx = rq_pi & (qp->rq.depth - 1);
> +	struct erdma_rqe *rqe = (struct erdma_rqe *)qp->rq.qbuf + idx;
> +
> +	if ((uint16_t)(rq_pi - qp->rq.ci) == qp->rq.depth)
> +		return -ENOMEM;
> +
> +	rqe->qe_idx = htole16(rq_pi + 1);
> +	rqe->qpn = htole32(qp->id);
> +	qp->rq.wr_tbl[idx] = wr->wr_id;
> +
> +	if (wr->num_sge == 0) {
> +		rqe->length = 0;
> +	} else if (wr->num_sge == 1) {
> +		rqe->stag = htole32(wr->sg_list[0].lkey);
> +		rqe->to = htole64(wr->sg_list[0].addr);
> +		rqe->length = htole32(wr->sg_list[0].length);
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	*(__le64 *)qp->rq.db_record = *(__le64 *)rqe;
> +	udma_to_device_barrier();
> +	mmio_write64_le(qp->rq.db, *(__le64 *)rqe);
> +
> +	qp->rq.pi = rq_pi + 1;
> +
> +	return 0;
> +}
> +
> +int erdma_post_recv(struct ibv_qp *base_qp, struct ibv_recv_wr *wr,
> +		    struct ibv_recv_wr **bad_wr)
> +{
> +	struct erdma_qp *qp = to_eqp(base_qp);
> +	int ret = 0;
> +
> +	if (base_qp->state == IBV_QPS_ERR) {
> +		*bad_wr = wr;
> +		return -EIO;
> +	}
> +
> +	pthread_spin_lock(&qp->rq_lock);
> +
> +	while (wr) {
> +		ret = push_recv_wqe(qp, wr);
> +		if (ret) {
> +			*bad_wr = wr;
> +			break;
> +		}
> +
> +		wr = wr->next;
> +	}
> +
> +	pthread_spin_unlock(&qp->rq_lock);
> +
> +	return ret;
> +}
> +
> +void erdma_cq_event(struct ibv_cq *ibcq)
> +{
> +	struct erdma_cq *cq = to_ecq(ibcq);
> +
> +	cq->cmdsn++;
> +}
> +
> +static const struct {
> +	enum erdma_opcode erdma;
> +	enum ibv_wc_opcode base;
> +} map_cqe_opcode[ERDMA_NUM_OPCODES] = {
> +	{ ERDMA_OP_WRITE, IBV_WC_RDMA_WRITE },
> +	{ ERDMA_OP_READ, IBV_WC_RDMA_READ },
> +	{ ERDMA_OP_SEND, IBV_WC_SEND },
> +	{ ERDMA_OP_SEND_WITH_IMM, IBV_WC_SEND },
> +	{ ERDMA_OP_RECEIVE, IBV_WC_RECV },
> +	{ ERDMA_OP_RECV_IMM, IBV_WC_RECV_RDMA_WITH_IMM },
> +	{ ERDMA_OP_RECV_INV, IBV_WC_LOCAL_INV }, /* cannot appear */
> +	{ ERDMA_OP_REQ_ERR, IBV_WC_RECV }, /* cannot appear */
> +	{ ERDNA_OP_READ_RESPONSE, IBV_WC_RECV }, /* cannot appear */
> +	{ ERDMA_OP_WRITE_WITH_IMM, IBV_WC_RDMA_WRITE },
> +	{ ERDMA_OP_RECV_ERR, IBV_WC_RECV_RDMA_WITH_IMM }, /* cannot appear */
> +	{ ERDMA_OP_INVALIDATE, IBV_WC_LOCAL_INV },
> +	{ ERDMA_OP_RSP_SEND_IMM, IBV_WC_RECV },
> +	{ ERDMA_OP_SEND_WITH_INV, IBV_WC_SEND },
> +	{ ERDMA_OP_REG_MR, IBV_WC_RECV }, /* cannot appear */
> +	{ ERDMA_OP_LOCAL_INV, IBV_WC_LOCAL_INV },
> +	{ ERDMA_OP_READ_WITH_INV, IBV_WC_RDMA_READ },
> +};
> +
> +static const struct {
> +	enum erdma_wc_status erdma;
> +	enum ibv_wc_status base;
> +	enum erdma_vendor_err vendor;
> +} map_cqe_status[ERDMA_NUM_WC_STATUS] = {
> +	{ ERDMA_WC_SUCCESS, IBV_WC_SUCCESS, ERDMA_WC_VENDOR_NO_ERR },
> +	{ ERDMA_WC_GENERAL_ERR, IBV_WC_GENERAL_ERR, ERDMA_WC_VENDOR_NO_ERR },
> +	{ ERDMA_WC_RECV_WQE_FORMAT_ERR, IBV_WC_GENERAL_ERR, ERDMA_WC_VENDOR_INVALID_RQE },
> +	{ ERDMA_WC_RECV_STAG_INVALID_ERR,
> +	  IBV_WC_REM_ACCESS_ERR, ERDMA_WC_VENDOR_RQE_INVALID_STAG },
> +	{ ERDMA_WC_RECV_ADDR_VIOLATION_ERR,
> +	  IBV_WC_REM_ACCESS_ERR, ERDMA_WC_VENDOR_RQE_ADDR_VIOLATION },
> +	{ ERDMA_WC_RECV_RIGHT_VIOLATION_ERR,
> +	  IBV_WC_REM_ACCESS_ERR, ERDMA_WC_VENDOR_RQE_ACCESS_RIGHT_ERR },
> +	{ ERDMA_WC_RECV_PDID_ERR, IBV_WC_REM_ACCESS_ERR, ERDMA_WC_VENDOR_RQE_INVALID_PD },
> +	{ ERDMA_WC_RECV_WARRPING_ERR, IBV_WC_REM_ACCESS_ERR, ERDMA_WC_VENDOR_RQE_WRAP_ERR },
> +	{ ERDMA_WC_SEND_WQE_FORMAT_ERR, IBV_WC_LOC_QP_OP_ERR, ERDMA_WC_VENDOR_INVALID_SQE },
> +	{ ERDMA_WC_SEND_WQE_ORD_EXCEED, IBV_WC_GENERAL_ERR, ERDMA_WC_VENDOR_ZERO_ORD },
> +	{ ERDMA_WC_SEND_STAG_INVALID_ERR,
> +	  IBV_WC_LOC_ACCESS_ERR, ERDMA_WC_VENDOR_SQE_INVALID_STAG },
> +	{ ERDMA_WC_SEND_ADDR_VIOLATION_ERR,
> +	  IBV_WC_LOC_ACCESS_ERR, ERDMA_WC_VENDOR_SQE_ADDR_VIOLATION },
> +	{ ERDMA_WC_SEND_RIGHT_VIOLATION_ERR,
> +	  IBV_WC_LOC_ACCESS_ERR, ERDMA_WC_VENDOR_SQE_ACCESS_ERR },
> +	{ ERDMA_WC_SEND_PDID_ERR, IBV_WC_LOC_ACCESS_ERR, ERDMA_WC_VENDOR_SQE_INVALID_PD },
> +	{ ERDMA_WC_SEND_WARRPING_ERR, IBV_WC_LOC_ACCESS_ERR, ERDMA_WC_VENDOR_SQE_WARP_ERR },
> +	{ ERDMA_WC_FLUSH_ERR, IBV_WC_WR_FLUSH_ERR, ERDMA_WC_VENDOR_NO_ERR },
> +	{ ERDMA_WC_RETRY_EXC_ERR, IBV_WC_RETRY_EXC_ERR, ERDMA_WC_VENDOR_NO_ERR },
> +};
> +
> +#define ERDMA_POLLCQ_NO_QP (-1)
> +#define ERDMA_POLLCQ_DUP_COMP (-2)
> +#define ERDMA_POLLCQ_WRONG_IDX (-3)
> +
> +static int __erdma_poll_one_cqe(struct erdma_context *ctx, struct erdma_cq *cq,
> +				struct erdma_cqe *cqe, uint32_t cqe_hdr,
> +				struct ibv_wc *wc)
> +{
> +	struct erdma_qp *qp;
> +	uint64_t *qeidx2wrid = NULL;
> +	uint32_t qpn = be32toh(cqe->qpn);
> +	uint16_t depth = 0;
> +	uint64_t *sqe_hdr;
> +	uint16_t wqe_idx = be32toh(cqe->qe_idx);
> +	uint16_t old_ci, new_ci;
> +	uint32_t opcode = FIELD_GET(ERDMA_CQE_HDR_OPCODE_MASK, cqe_hdr);
> +	uint32_t syndrome = FIELD_GET(ERDMA_CQE_HDR_SYNDROME_MASK, cqe_hdr);
> +
> +	int tbl_idx = qpn >> ERDMA_QP_TABLE_SHIFT;
> +	int tbl_off = qpn & ERDMA_QP_TABLE_MASK;
> +
> +	if (!ctx->qp_table[tbl_idx].table || !ctx->qp_table[tbl_idx].table[tbl_off])
> +		return ERDMA_POLLCQ_NO_QP;
> +
> +	qp = ctx->qp_table[tbl_idx].table[tbl_off];
> +
> +	if (FIELD_GET(ERDMA_CQE_HDR_QTYPE_MASK, cqe_hdr) == ERDMA_CQE_QTYPE_SQ) {
> +		qeidx2wrid = qp->sq.wr_tbl;
> +		depth = qp->sq.depth;
> +		sqe_hdr = (uint64_t *)get_sq_wqebb(qp, wqe_idx);
> +		old_ci = qp->sq.ci;
> +		new_ci = wqe_idx + FIELD_GET(ERDMA_SQE_HDR_WQEBB_CNT_MASK, *sqe_hdr) + 1;
> +
> +		if ((uint16_t)(new_ci - old_ci) > depth)
> +			return ERDMA_POLLCQ_WRONG_IDX;
> +		else if (new_ci == old_ci)
> +			return ERDMA_POLLCQ_DUP_COMP;
> +
> +		qp->sq.ci = new_ci;
> +	} else {
> +		qeidx2wrid = qp->rq.wr_tbl;
> +		depth = qp->rq.depth;
> +		qp->rq.ci++;
> +	}
> +
> +	wc->wr_id = qeidx2wrid[wqe_idx & (depth - 1)];
> +	wc->byte_len = be32toh(cqe->size);
> +	wc->wc_flags = 0;
> +
> +	if (opcode == ERDMA_OP_RECV_IMM) {
> +		wc->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
> +		wc->imm_data = htobe32(le32toh(cqe->imm_data));
> +		wc->wc_flags |= IBV_WC_WITH_IMM;
> +	} else if (opcode == ERDMA_OP_RSP_SEND_IMM) {
> +		wc->opcode = IBV_WC_RECV;
> +		wc->imm_data = htobe32(le32toh(cqe->imm_data));
> +		wc->wc_flags |= IBV_WC_WITH_IMM;
> +	} else {
> +		wc->opcode = map_cqe_opcode[opcode].base;
> +	}
> +
> +	if (syndrome >= ERDMA_NUM_WC_STATUS)
> +		syndrome = ERDMA_WC_GENERAL_ERR;
> +
> +	wc->status = map_cqe_status[syndrome].base;
> +	wc->vendor_err = map_cqe_status[syndrome].vendor;
> +	wc->qp_num = qpn;
> +
> +	return 0;
> +}
> +
> +int erdma_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc)
> +{
> +	struct erdma_cq *cq = to_ecq(ibcq);
> +	struct erdma_context *ctx = to_ectx(ibcq->context);
> +	int new = 0;
> +	struct erdma_cqe *cqe;
> +	int owner;
> +	uint32_t ci;
> +	uint32_t depth_mask = cq->depth - 1;
> +	uint32_t hdr;
> +	int i, ret;
> +
> +	pthread_spin_lock(&cq->lock);
> +
> +	owner = cq->owner;
> +	ci = cq->ci;
> +
> +	for (i = 0; i < num_entries; i++) {
> +		cqe = &cq->queue[ci & depth_mask];
> +		hdr = be32toh(cqe->hdr);
> +
> +		if (FIELD_GET(ERDMA_CQE_HDR_OWNER_MASK, hdr) != owner)
> +			break;
> +
> +		udma_from_device_barrier();
> +
> +		ret = __erdma_poll_one_cqe(ctx, cq, cqe, hdr, wc);
> +
> +		ci++;
> +		if ((ci & depth_mask) == 0)
> +			owner = !owner;
> +
> +		if (ret)
> +			continue;
> +
> +		wc++;
> +		new++;
> +	}
> +
> +	cq->owner = owner;
> +	cq->ci = ci;
> +
> +	pthread_spin_unlock(&cq->lock);
> +
> +	return new;
> +}
> +
> +void erdma_free_context(struct ibv_context *ibv_ctx)
> +{
> +	struct erdma_context *ctx = to_ectx(ibv_ctx);
> +	int i;
> +
> +	munmap(ctx->sdb, ERDMA_PAGE_SIZE);
> +	munmap(ctx->rdb, ERDMA_PAGE_SIZE);
> +	munmap(ctx->cdb, ERDMA_PAGE_SIZE);
> +
> +	pthread_mutex_lock(&ctx->qp_table_mutex);
> +	for (i = 0; i < ERDMA_QP_TABLE_SIZE; ++i) {
> +		if (ctx->qp_table[i].refcnt)
> +			free(ctx->qp_table[i].table);
> +	}
> +
> +	pthread_mutex_unlock(&ctx->qp_table_mutex);
> +	pthread_mutex_destroy(&ctx->qp_table_mutex);
> +
> +	verbs_uninit_context(&ctx->ibv_ctx);
> +	free(ctx);
> +}
> --
> 2.27.0