Implement device supported userspace verb APIs. Signed-off-by: Tatyana Nikolova <tatyana.e.nikolova@xxxxxxxxx> --- providers/irdma/uverbs.c | 1990 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1990 insertions(+) create mode 100644 providers/irdma/uverbs.c diff --git a/providers/irdma/uverbs.c b/providers/irdma/uverbs.c new file mode 100644 index 0000000..3cf0eee --- /dev/null +++ b/providers/irdma/uverbs.c @@ -0,0 +1,1990 @@ +// SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB +/* Copyright (C) 2019 - 2020 Intel Corporation */ +#include <config.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <signal.h> +#include <errno.h> +#include <malloc.h> +#include <sys/param.h> +#include <sys/mman.h> +#include <netinet/in.h> +#include <linux/if_ether.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include "umain.h" +#include "abi.h" + +static inline void print_fw_ver(uint64_t fw_ver, char *str, size_t len) +{ + uint16_t major, minor; + + major = fw_ver >> 32 & 0xffff; + minor = fw_ver & 0xffff; + + snprintf(str, len, "%d.%d", major, minor); +} + +/** + * irdma_uquery_device_ex - query device attributes including extended properties + * @context: user context for the device + * @input: extensible input struct for ibv_query_device_ex verb + * @attr: extended device attribute struct + * @attr_size: size of extended device attribute struct + **/ +int irdma_uquery_device_ex(struct ibv_context *context, + const struct ibv_query_device_ex_input *input, + struct ibv_device_attr_ex *attr, size_t attr_size) +{ + struct ib_uverbs_ex_query_device_resp resp = {}; + size_t resp_size = sizeof(resp); + uint64_t fw_ver; + int ret; + + ret = ibv_cmd_query_device_any(context, input, attr, attr_size, + &resp, &resp_size); + if (ret) { + fprintf(stderr, PFX "%s: query device failed with status code: %d\n", + __func__, ret); + return ret; + } + + fw_ver = resp.base.fw_ver; + print_fw_ver(fw_ver, attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver)); + + return 0; +} + +/** + * irdma_uquery_port - get port attributes (msg size, lnk, mtu...) 
+ * @context: user context of the device + * @port: port for the attributes + * @attr: to return port attributes + **/ +int irdma_uquery_port(struct ibv_context *context, uint8_t port, + struct ibv_port_attr *attr) +{ + struct ibv_query_port cmd; + + return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd)); +} + +/** + * irdma_ualloc_pd - allocates protection domain and returns pd ptr + * @context: user context of the device + **/ +struct ibv_pd *irdma_ualloc_pd(struct ibv_context *context) +{ + struct ibv_alloc_pd cmd; + struct irdma_ualloc_pd_resp resp = {}; + struct irdma_upd *iwupd; + + iwupd = malloc(sizeof(*iwupd)); + if (!iwupd) + return NULL; + + if (ibv_cmd_alloc_pd(context, &iwupd->ibv_pd, &cmd, sizeof(cmd), + &resp.ibv_resp, sizeof(resp))) + goto err_free; + + iwupd->pd_id = resp.pd_id; + + return &iwupd->ibv_pd; + +err_free: + free(iwupd); + return NULL; +} + +/** + * irdma_ufree_pd - free pd resources + * @pd: pd to free resources + */ +int irdma_ufree_pd(struct ibv_pd *pd) +{ + struct irdma_upd *iwupd; + int ret; + + iwupd = to_irdma_upd(pd); + ret = ibv_cmd_dealloc_pd(pd); + if (ret) + return ret; + + free(iwupd); + + return 0; +} + +/** + * irdma_ureg_mr - register user memory region + * @pd: pd for the mr + * @addr: user address of the memory region + * @length: length of the memory + * @hca_va: device virtual address of the mr (typically same as @addr) + * @access: access allowed on this mr + */ +struct ibv_mr *irdma_ureg_mr(struct ibv_pd *pd, void *addr, size_t length, + uint64_t hca_va, int access) +{ + struct irdma_umr *umr; + struct irdma_ureg_mr cmd; + struct ib_uverbs_reg_mr_resp resp; + + umr = malloc(sizeof(*umr)); + if (!umr) + return NULL; + + cmd.reg_type = IW_MEMREG_TYPE_MEM; + if (ibv_cmd_reg_mr(pd, addr, length, hca_va, access, &umr->vmr, &cmd.ibv_cmd, + sizeof(cmd), &resp, sizeof(resp))) { + free(umr); + return NULL; + } + umr->acc_flags = access; + + return &umr->vmr.ibv_mr; +} + +/** + * irdma_udereg_mr - deregister memory region + * @vmr: mr that was allocated + */ +int irdma_udereg_mr(struct verbs_mr *vmr) +{ + int ret; + + ret = ibv_cmd_dereg_mr(vmr); + if (ret) + return ret; + + free(vmr); + + return 0; +} + +/** + * irdma_ualloc_mw - allocate memory window + * @pd: protection domain + * @type: memory window type + */ +struct ibv_mw *irdma_ualloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) +{ + struct ibv_mw *mw; + struct ibv_alloc_mw cmd; + struct ib_uverbs_alloc_mw_resp resp; + + mw = calloc(1, sizeof(*mw)); + if (!mw) + return NULL; + + if (ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp, + sizeof(resp))) { + fprintf(stderr, PFX "%s: Failed to alloc memory window\n", + __func__); + free(mw); + return NULL; + } + + return mw; +} + +/** + * irdma_ubind_mw - bind a memory window + * @qp: qp to post WR + * @mw: memory window to bind + * @mw_bind: bind info + */ +int irdma_ubind_mw(struct ibv_qp *qp, struct ibv_mw *mw, + struct ibv_mw_bind *mw_bind) +{ + struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info; + struct verbs_mr *vmr = verbs_get_mr(bind_info->mr); + struct irdma_umr *umr = container_of(vmr, struct irdma_umr, vmr); + + struct ibv_send_wr wr = {}; + struct ibv_send_wr *bad_wr; + int err; + + if (vmr->mr_type != IBV_MR_TYPE_MR) + return ENOTSUP; + + if (umr->acc_flags & IBV_ACCESS_ZERO_BASED) + return EINVAL; + + wr.opcode = IBV_WR_BIND_MW; + wr.bind_mw.bind_info = mw_bind->bind_info; + wr.bind_mw.mw = mw; + wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey); + + wr.wr_id = mw_bind->wr_id; + wr.send_flags = mw_bind->send_flags; + + err = irdma_upost_send(qp, &wr, &bad_wr); + if
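/*
 * [Editor's illustration, not part of this patch] A rough sketch of how an
 * application reaches the PD/MR/MW verbs above through the generic libibverbs
 * entry points: ibv_alloc_pd() lands in irdma_ualloc_pd(), ibv_reg_mr() in
 * irdma_ureg_mr(), ibv_alloc_mw()/ibv_bind_mw() in irdma_ualloc_mw()/
 * irdma_ubind_mw().  The helper name, buffer size and access flags are
 * arbitrary assumptions; error handling is abbreviated and the QP is assumed
 * to already be connected.
 */
#include <stdint.h>
#include <stdlib.h>
#include <infiniband/verbs.h>

static int example_pd_mr_mw(struct ibv_context *ctx, struct ibv_qp *qp)
{
	struct ibv_pd *pd = ibv_alloc_pd(ctx);
	void *buf = calloc(1, 4096);
	struct ibv_mr *mr;
	struct ibv_mw *mw;
	struct ibv_mw_bind mw_bind = {};
	int err;

	if (!pd || !buf)
		return -1;

	/* MW_BIND access so the region can later back a memory window. */
	mr = ibv_reg_mr(pd, buf, 4096,
			IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
			IBV_ACCESS_MW_BIND);
	mw = ibv_alloc_mw(pd, IBV_MW_TYPE_1);
	if (!mr || !mw)
		return -1;

	/* Type-1 bind goes through ibv_bind_mw(), which the provider serves
	 * by posting an IBV_WR_BIND_MW work request in irdma_ubind_mw(). */
	mw_bind.wr_id = 1;
	mw_bind.send_flags = IBV_SEND_SIGNALED;
	mw_bind.bind_info.mr = mr;
	mw_bind.bind_info.addr = (uintptr_t)buf;
	mw_bind.bind_info.length = 4096;
	mw_bind.bind_info.mw_access_flags = IBV_ACCESS_REMOTE_WRITE;
	err = ibv_bind_mw(qp, mw, &mw_bind);

	ibv_dealloc_mw(mw);
	ibv_dereg_mr(mr);
	ibv_dealloc_pd(pd);
	free(buf);
	return err;
}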
(!err) + mw->rkey = wr.bind_mw.rkey; + + return err; +} + +/** + * irdma_udealloc_mw - deallocate memory window + * @mw: memory window to dealloc + */ +int irdma_udealloc_mw(struct ibv_mw *mw) +{ + int ret; + + ret = ibv_cmd_dealloc_mw(mw); + if (ret) + return ret; + free(mw); + + return 0; +} + +/** + * get_cq_size - returns actual cqe needed by HW + * @ncqe: minimum cqes requested by application + * @hw_rev: HW generation + */ +static inline int get_cq_size(int ncqe, __u8 hw_rev) +{ + ncqe++; + + /* Completions with immediate require 1 extra entry */ + if (hw_rev > IRDMA_GEN_1) + ncqe *= 2; + + if (ncqe < IRDMA_U_MINCQ_SIZE) + ncqe = IRDMA_U_MINCQ_SIZE; + + return ncqe; +} + +/** + * ucreate_cq - irdma util function to create a CQ + * @context: ibv context + * @attr_ex: CQ init attributes + * @ext_cq: flag to create an extendable or normal CQ + */ +static struct ibv_cq_ex *ucreate_cq(struct ibv_context *context, + struct ibv_cq_init_attr_ex *attr_ex, + bool ext_cq) +{ + struct irdma_ucq *iwucq; + struct irdma_cq_uk_init_info info = {}; + struct irdma_ureg_mr reg_mr_cmd = {}; + struct ib_uverbs_reg_mr_resp reg_mr_resp = {}; + struct irdma_ureg_mr reg_mr_shadow_cmd = {}; + struct ib_uverbs_reg_mr_resp reg_mr_shadow_resp = {}; + struct irdma_uvcontext *iwvctx = to_irdma_uctx(context); + struct irdma_uk_attrs *uk_attrs = &iwvctx->uk_attrs; + __u8 hw_rev = uk_attrs->hw_rev; + __u32 cqe_struct_size; + __u32 totalsize; + __u32 cq_pages; + int ret, ncqe; + + if (ext_cq && hw_rev == IRDMA_GEN_1) { + errno = EOPNOTSUPP; + return NULL; + } + + if (attr_ex->cqe > uk_attrs->max_hw_cq_size) + return NULL; + + /* save the cqe requested by application */ + ncqe = attr_ex->cqe; + + iwucq = calloc(1, sizeof(*iwucq)); + if (!iwucq) + return NULL; + + if (pthread_spin_init(&iwucq->lock, PTHREAD_PROCESS_PRIVATE)) { + free(iwucq); + return NULL; + } + + info.cq_size = get_cq_size(attr_ex->cqe, hw_rev); + iwucq->comp_vector = attr_ex->comp_vector; + list_head_init(&iwucq->resize_list); + cqe_struct_size = sizeof(struct irdma_cqe); + totalsize = roundup(info.cq_size * cqe_struct_size, IRDMA_HW_PAGE_SIZE); + cq_pages = totalsize >> IRDMA_HW_PAGE_SHIFT; + + if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE)) + totalsize = (cq_pages << IRDMA_HW_PAGE_SHIFT) + IRDMA_DB_SHADOW_AREA_SIZE; + + info.cq_base = memalign(IRDMA_HW_PAGE_SIZE, totalsize); + if (!info.cq_base) + goto err; + + memset(info.cq_base, 0, totalsize); + reg_mr_cmd.reg_type = IW_MEMREG_TYPE_CQ; + reg_mr_cmd.cq_pages = cq_pages; + + ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.cq_base, + totalsize, (uintptr_t)info.cq_base, + IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr, + &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd), + &reg_mr_resp, sizeof(reg_mr_resp)); + if (ret) { + fprintf(stderr, PFX "%s: failed to pin memory for CQ\n", + __func__); + goto err; + } + iwucq->vmr.ibv_mr.pd = &iwvctx->iwupd->ibv_pd; + + if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) { + info.shadow_area = memalign(IRDMA_HW_PAGE_SIZE, + IRDMA_DB_SHADOW_AREA_SIZE); + if (!info.shadow_area) + goto err_dereg_mr; + + memset(info.shadow_area, 0, IRDMA_DB_SHADOW_AREA_SIZE); + reg_mr_shadow_cmd.reg_type = IW_MEMREG_TYPE_CQ; + reg_mr_shadow_cmd.cq_pages = 1; + + ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.shadow_area, + IRDMA_DB_SHADOW_AREA_SIZE, (uintptr_t)info.shadow_area, + IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr_shadow_area, + &reg_mr_shadow_cmd.ibv_cmd, sizeof(reg_mr_shadow_cmd), + &reg_mr_shadow_resp, sizeof(reg_mr_shadow_resp)); + if (ret) { + fprintf(stderr, PFX "%s: failed to pin memory
for CQ shadow\n", + __func__); + goto err_dereg_mr; + } + iwucq->vmr_shadow_area.ibv_mr.pd = &iwvctx->iwupd->ibv_pd; + + } else { + info.shadow_area = (__le64 *)((__u8 *)info.cq_base + (cq_pages << IRDMA_HW_PAGE_SHIFT)); + } + + attr_ex->cqe = info.cq_size; + if (ext_cq) { + struct irdma_ucreate_cq_ex cmd = {}; + struct irdma_ucreate_cq_ex_resp resp = {}; + + cmd.user_cq_buf = (__u64)((uintptr_t)info.cq_base); + cmd.user_shadow_area = (__u64)((uintptr_t)info.shadow_area); + + ret = ibv_cmd_create_cq_ex(context, attr_ex, &iwucq->verbs_cq, + &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, + sizeof(resp)); + if (!ret) { + irdma_ibvcq_ex_fill_priv_funcs(iwucq, attr_ex); + info.cq_id = resp.cq_id; + /* Do not report the cqe's burned by HW */ + iwucq->verbs_cq.cq.cqe = ncqe; + } + } else { + struct irdma_ucreate_cq cmd = {}; + struct irdma_ucreate_cq_resp resp = {}; + + cmd.user_cq_buf = (__u64)((uintptr_t)info.cq_base); + cmd.user_shadow_area = (__u64)((uintptr_t)info.shadow_area); + + ret = ibv_cmd_create_cq(context, attr_ex->cqe, attr_ex->channel, + attr_ex->comp_vector, &iwucq->verbs_cq.cq, + &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, + sizeof(resp)); + if (!ret) { + info.cq_id = resp.cq_id; + /* Do not report the cqe's burned by HW */ + iwucq->verbs_cq.cq.cqe = ncqe; + } + } + + if (ret) { + fprintf(stderr, PFX "%s: failed to create CQ\n", __func__); + goto err_dereg_mr; + } + + info.cqe_alloc_db = (__u32 *)((__u8 *)iwvctx->db + IRDMA_DB_CQ_OFFSET); + ret = iwvctx->dev.ops_uk.iw_cq_uk_init(&iwucq->cq, &info); + if (!ret) + return &iwucq->verbs_cq.cq_ex; + +err_dereg_mr: + fprintf(stderr, PFX "%s: failed to initialize CQ, status %d\n", + __func__, ret); + ibv_cmd_dereg_mr(&iwucq->vmr); + if (iwucq->vmr_shadow_area.ibv_mr.handle) + ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area); +err: + if (info.cq_base) + free(info.cq_base); + pthread_spin_destroy(&iwucq->lock); + + free(iwucq); + + return NULL; +} + +struct ibv_cq *irdma_ucreate_cq(struct ibv_context *context, int cqe, + struct ibv_comp_channel *channel, + int comp_vector) +{ + struct ibv_cq_init_attr_ex attr_ex = { + .cqe = cqe, + .channel = channel, + .comp_vector = comp_vector, + }; + struct ibv_cq_ex *ibvcq_ex; + + ibvcq_ex = ucreate_cq(context, &attr_ex, false); + + return ibvcq_ex ? 
ibv_cq_ex_to_cq(ibvcq_ex) : NULL; +} + +struct ibv_cq_ex *irdma_ucreate_cq_ex(struct ibv_context *context, + struct ibv_cq_init_attr_ex *attr_ex) +{ + if (attr_ex->wc_flags & ~IRDMA_CQ_SUPPORTED_WC_FLAGS) { + errno = EOPNOTSUPP; + return NULL; + } + + return ucreate_cq(context, attr_ex, true); +} + +/** + * irdma_free_cq_buf - free memory for cq buffer + * @cq_buf: cq buf to free + */ +static void irdma_free_cq_buf(struct irdma_cq_buf *cq_buf) +{ + ibv_cmd_dereg_mr(&cq_buf->vmr); + free(cq_buf->cq.cq_base); + free(cq_buf); +} + +/** + * irdma_process_resize_list - process the cq list to remove buffers + * @iwucq: cq which owns the list + * @lcqe_buf: cq buf where the last cqe is found + */ +static int irdma_process_resize_list(struct irdma_ucq *iwucq, + struct irdma_cq_buf *lcqe_buf) +{ + struct irdma_cq_buf *cq_buf, *next; + int cq_cnt = 0; + + list_for_each_safe(&iwucq->resize_list, cq_buf, next, list) { + if (cq_buf == lcqe_buf) + return cq_cnt; + + list_del(&cq_buf->list); + irdma_free_cq_buf(cq_buf); + cq_cnt++; + } + + return cq_cnt; +} + +/** + * irdma_udestroy_cq - destroys cq + * @cq: ptr to cq to be destroyed + */ +int irdma_udestroy_cq(struct ibv_cq *cq) +{ + struct irdma_ucq *iwucq = to_irdma_ucq(cq); + struct irdma_uvcontext *iwvctx = to_irdma_uctx(cq->context); + struct irdma_uk_attrs *uk_attrs = &iwvctx->uk_attrs; + int ret; + + ret = pthread_spin_destroy(&iwucq->lock); + if (ret) + goto err; + + irdma_process_resize_list(iwucq, NULL); + ret = ibv_cmd_destroy_cq(cq); + if (ret) + goto err; + + ibv_cmd_dereg_mr(&iwucq->vmr); + free(iwucq->cq.cq_base); + + if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) { + ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area); + free(iwucq->cq.shadow_area); + } + free(iwucq); + return 0; + +err: + fprintf(stderr, PFX "%s: failed to destroy CQ, status %d\n", + __func__, ret); + + return ret; +} + +static enum ibv_wc_status to_ibv_wc_status(enum irdma_cmpl_status status) +{ + switch (status) { + case IRDMA_COMPL_STATUS_SUCCESS: + return IBV_WC_SUCCESS; + case IRDMA_COMPL_STATUS_FLUSHED: + return IBV_WC_WR_FLUSH_ERR; + case IRDMA_COMPL_STATUS_INVALID_LEN: + return IBV_WC_LOC_LEN_ERR; + default: + return IBV_WC_GENERAL_ERR; + } +} + +/** + * irdma_process_cqe_ext - process current cqe for extended CQ + * @cur_cqe - current cqe info + */ +static void irdma_process_cqe_ext(struct irdma_cq_poll_info *cur_cqe) +{ + struct irdma_ucq *iwucq = container_of(cur_cqe, struct irdma_ucq, cur_cqe); + struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex; + + ibvcq_ex->wr_id = cur_cqe->wr_id; + ibvcq_ex->status = to_ibv_wc_status(cur_cqe->comp_status); +} + +/** + * irdma_process_cqe - process current cqe info + * @entry - ibv_wc object to fill in for non-extended CQ + * @cur_cqe - current cqe info + */ +static void irdma_process_cqe(struct ibv_wc *entry, struct irdma_cq_poll_info *cur_cqe) +{ + struct irdma_qp_uk *qp; + struct ibv_qp *ib_qp; + + entry->wc_flags = 0; + entry->wr_id = cur_cqe->wr_id; + entry->qp_num = cur_cqe->qp_id; + qp = cur_cqe->qp_handle; + ib_qp = qp->back_qp; + + entry->status = to_ibv_wc_status(cur_cqe->comp_status); + if (entry->status) { + entry->vendor_err = cur_cqe->major_err << 16 | + cur_cqe->minor_err; + /* + * In case of an error, there is no need to populate + * remaining fields of wc structure. 
+ */ + return; + } + + if (cur_cqe->imm_valid) { + entry->imm_data = htonl(cur_cqe->imm_data); + entry->wc_flags |= IBV_WC_WITH_IMM; + } + + switch (cur_cqe->op_type) { + case IRDMA_OP_TYPE_RDMA_WRITE: + case IRDMA_OP_TYPE_RDMA_WRITE_SOL: + entry->opcode = IBV_WC_RDMA_WRITE; + break; + case IRDMA_OP_TYPE_RDMA_READ: + entry->opcode = IBV_WC_RDMA_READ; + break; + case IRDMA_OP_TYPE_SEND_SOL: + case IRDMA_OP_TYPE_SEND_SOL_INV: + case IRDMA_OP_TYPE_SEND_INV: + case IRDMA_OP_TYPE_SEND: + entry->opcode = IBV_WC_SEND; + break; + case IRDMA_OP_TYPE_BIND_MW: + entry->opcode = IBV_WC_BIND_MW; + break; + case IRDMA_OP_TYPE_REC: + entry->opcode = IBV_WC_RECV; + if (ib_qp->qp_type != IBV_QPT_UD && + cur_cqe->stag_invalid_set) { + entry->invalidated_rkey = cur_cqe->inv_stag; + entry->wc_flags |= IBV_WC_WITH_INV; + } + break; + case IRDMA_OP_TYPE_REC_IMM: + entry->opcode = IBV_WC_RECV_RDMA_WITH_IMM; + if (ib_qp->qp_type != IBV_QPT_UD && + cur_cqe->stag_invalid_set) { + entry->invalidated_rkey = cur_cqe->inv_stag; + entry->wc_flags |= IBV_WC_WITH_INV; + } + break; + case IRDMA_OP_TYPE_INV_STAG: + entry->opcode = IBV_WC_LOCAL_INV; + break; + default: + entry->status = IBV_WC_GENERAL_ERR; + fprintf(stderr, PFX "%s: Invalid opcode = %d in CQE\n", + __func__, cur_cqe->op_type); + return; + } + + + if (ib_qp->qp_type == IBV_QPT_UD) { + entry->src_qp = cur_cqe->ud_src_qpn; + entry->wc_flags |= IBV_WC_GRH; + } else { + entry->src_qp = cur_cqe->qp_id; + } + entry->byte_len = cur_cqe->bytes_xfered; +} + +/** + * irdma_poll_one - poll one entry of the CQ + * @ukcq: ukcq to poll + * @cur_cqe: current CQE info to be filled in + * @entry: ibv_wc object to be filled for non-extended CQ or NULL for extended CQ + * + * Returns the internal irdma device error code or 0 on success + */ +static int irdma_poll_one(struct irdma_cq_uk *ukcq, struct irdma_cq_poll_info *cur_cqe, + struct ibv_wc *entry) +{ + int ret = ukcq->ops.iw_cq_poll_cmpl(ukcq, cur_cqe); + + if (ret) + return ret; + + if (!entry) + irdma_process_cqe_ext(cur_cqe); + else + irdma_process_cqe(entry, cur_cqe); + + return 0; +} + +/** + * __irdma_upoll_cq - irdma util function to poll device CQ + * @iwucq: irdma cq to poll + * @num_entries: max cq entries to poll + * @entry: pointer to array of ibv_wc objects to be filled in for each completion or NULL if ext CQ + * + * Returns non-negative value equal to the number of completions + * found. On failure, -EINVAL + */ +static int __irdma_upoll_cq(struct irdma_ucq *iwucq, int num_entries, + struct ibv_wc *entry) +{ + struct irdma_cq_buf *cq_buf, *next; + struct irdma_cq_buf *last_buf = NULL; + struct irdma_cq_poll_info *cur_cqe = &iwucq->cur_cqe; + bool cq_new_cqe = false; + int resized_bufs = 0; + int npolled = 0; + int ret; + + /* go through the list of previously resized CQ buffers */ + list_for_each_safe(&iwucq->resize_list, cq_buf, next, list) { + while (npolled < num_entries) { + ret = irdma_poll_one(&cq_buf->cq, cur_cqe, + entry ? entry + npolled : NULL); + if (!ret) { + ++npolled; + cq_new_cqe = true; + continue; + } + if (ret == IRDMA_ERR_Q_EMPTY) + break; + /* QP using the CQ is destroyed. Skip reporting this CQE */ + if (ret == IRDMA_ERR_Q_DESTROYED) { + cq_new_cqe = true; + continue; + } + goto error; + } + + /* save the resized CQ buffer which received the last cqe */ + if (cq_new_cqe) + last_buf = cq_buf; + cq_new_cqe = false; + } + + /* check the current CQ for new cqes */ + while (npolled < num_entries) { + ret = irdma_poll_one(&iwucq->cq, cur_cqe, + entry ? 
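/*
 * [Editor's illustration, not part of this patch] How the poll path above is
 * driven from an application: ibv_poll_cq() dispatches to irdma_upoll_cq(),
 * which drains any retired resize buffers before the live CQ ring.  A small
 * drain loop with abbreviated error handling (the helper name and batch size
 * are arbitrary):
 */
#include <stdio.h>
#include <infiniband/verbs.h>

static int example_drain_cq(struct ibv_cq *cq)
{
	struct ibv_wc wc[16];
	int n, i;

	do {
		n = ibv_poll_cq(cq, 16, wc);
		if (n < 0)
			return n;
		for (i = 0; i < n; i++) {
			if (wc[i].status != IBV_WC_SUCCESS)
				fprintf(stderr, "wr_id %llu failed: %s\n",
					(unsigned long long)wc[i].wr_id,
					ibv_wc_status_str(wc[i].status));
		}
	} while (n == 16);

	return 0;
}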
entry + npolled : NULL); + if (!ret) { + ++npolled; + cq_new_cqe = true; + continue; + } + if (ret == IRDMA_ERR_Q_EMPTY) + break; + /* QP using the CQ is destroyed. Skip reporting this CQE */ + if (ret == IRDMA_ERR_Q_DESTROYED) { + cq_new_cqe = true; + continue; + } + goto error; + } + + if (cq_new_cqe) + /* all previous CQ resizes are complete */ + resized_bufs = irdma_process_resize_list(iwucq, NULL); + else if (last_buf) + /* only CQ resizes up to the last_buf are complete */ + resized_bufs = irdma_process_resize_list(iwucq, last_buf); + if (resized_bufs) + /* report to the HW the number of complete CQ resizes */ + iwucq->cq.ops.iw_cq_set_resized_cnt(&iwucq->cq, resized_bufs); + + return npolled; + +error: + fprintf(stderr, PFX "%s: Error polling CQ, irdma_err: %d\n", __func__, ret); + + return -EINVAL; +} + +/** + * irdma_upoll_cq - verb API callback to poll device CQ + * @cq: ibv_cq to poll + * @num_entries: max cq entries to poll + * @entry: pointer to array of ibv_wc objects to be filled in for each completion + * + * Returns non-negative value equal to the number of completions + * found and a negative error code on failure + */ +int irdma_upoll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *entry) +{ + struct irdma_ucq *iwucq = to_irdma_ucq(cq); + int ret; + + ret = pthread_spin_lock(&iwucq->lock); + if (ret) + return -ret; + + ret = __irdma_upoll_cq(iwucq, num_entries, entry); + + pthread_spin_unlock(&iwucq->lock); + + return ret; +} + +/** + * irdma_start_poll - verb_ex API callback to poll batch of WC's + * @ibvcq_ex: ibv extended CQ + * @attr: attributes (not used) + * + * Start polling batch of work completions. Return 0 on success, ENOENT when + * no completions are available on CQ. And an error code on errors + */ +static int irdma_start_poll(struct ibv_cq_ex *ibvcq_ex, struct ibv_poll_cq_attr *attr) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + int ret; + + ret = pthread_spin_lock(&iwucq->lock); + if (ret) + return ret; + + ret = __irdma_upoll_cq(iwucq, 1, NULL); + if (ret == 1) + return 0; + + /* No Completions on CQ */ + if (!ret) + ret = ENOENT; + + pthread_spin_unlock(&iwucq->lock); + + return ret; +} + +/** + * irdma_next_poll - verb_ex API callback to get next WC + * @ibvcq_ex: ibv extended CQ + * + * Return 0 on success, ENOENT when no completions are available on CQ.
+ * And an error code on errors + */ +static int irdma_next_poll(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + int ret; + + ret = __irdma_upoll_cq(iwucq, 1, NULL); + if (ret == 1) + return 0; + + /* No Completions on CQ */ + if (!ret) + ret = ENOENT; + + return ret; +} + +/** + * irdma_end_poll - verb_ex API callback to end polling of WC's + * @ibvcq_ex: ibv extended CQ + */ +static void irdma_end_poll(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + + pthread_spin_unlock(&iwucq->lock); +} + +/** + * irdma_wc_read_completion_ts - Get completion timestamp + * @ibvcq_ex: ibv extended CQ + * + * Get completion timestamp in HCA clock units + */ +static uint64_t irdma_wc_read_completion_ts(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); +#define HCA_CORE_CLOCK_800_MHZ 800 + + return iwucq->cur_cqe.tcp_seq_num_rtt / HCA_CORE_CLOCK_800_MHZ; +} + +/** + * irdma_wc_read_completion_wallclock_ns - Get completion timestamp in ns + * @ibvcq_ex: ibv extended CQ + * + * Get completion timestamp from current completion in wall clock nanoseconds + */ +static uint64_t irdma_wc_read_completion_wallclock_ns(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + + /* RTT is in usec */ + return iwucq->cur_cqe.tcp_seq_num_rtt * 1000; +} + +static enum ibv_wc_opcode irdma_wc_read_opcode(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + + switch (iwucq->cur_cqe.op_type) { + case IRDMA_OP_TYPE_RDMA_WRITE: + case IRDMA_OP_TYPE_RDMA_WRITE_SOL: + return IBV_WC_RDMA_WRITE; + case IRDMA_OP_TYPE_RDMA_READ: + return IBV_WC_RDMA_READ; + case IRDMA_OP_TYPE_SEND_SOL: + case IRDMA_OP_TYPE_SEND_SOL_INV: + case IRDMA_OP_TYPE_SEND_INV: + case IRDMA_OP_TYPE_SEND: + return IBV_WC_SEND; + case IRDMA_OP_TYPE_BIND_MW: + return IBV_WC_BIND_MW; + case IRDMA_OP_TYPE_REC: + return IBV_WC_RECV; + case IRDMA_OP_TYPE_REC_IMM: + return IBV_WC_RECV_RDMA_WITH_IMM; + case IRDMA_OP_TYPE_INV_STAG: + return IBV_WC_LOCAL_INV; + } + + fprintf(stderr, PFX "%s: Invalid opcode = %d in CQE\n", __func__, + iwucq->cur_cqe.op_type); + + return 0; +} + +static uint32_t irdma_wc_read_vendor_err(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + struct irdma_cq_poll_info *cur_cqe = &iwucq->cur_cqe; + + return cur_cqe->error ? cur_cqe->major_err << 16 | cur_cqe->minor_err : 0; +} + +static unsigned int irdma_wc_read_wc_flags(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + struct irdma_cq_poll_info *cur_cqe = &iwucq->cur_cqe; + struct irdma_qp_uk *qp = cur_cqe->qp_handle; + struct ibv_qp *ib_qp = qp->back_qp; + unsigned int wc_flags = 0; + + if (cur_cqe->imm_valid) + wc_flags |= IBV_WC_WITH_IMM; + + if (ib_qp->qp_type == IBV_QPT_UD) { + wc_flags |= IBV_WC_GRH; + } else { + if (cur_cqe->stag_invalid_set) { + switch (cur_cqe->op_type) { + case IRDMA_OP_TYPE_REC: + wc_flags |= IBV_WC_WITH_INV; + break; + case IRDMA_OP_TYPE_REC_IMM: + wc_flags |= IBV_WC_WITH_INV; + break; + } + } + } + + return wc_flags; +} + +static uint32_t irdma_wc_read_byte_len(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + + return iwucq->cur_cqe.bytes_xfered; +} + +static __be32 irdma_wc_read_imm_data(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + struct irdma_cq_poll_info *cur_cqe = &iwucq->cur_cqe; + + return cur_cqe->imm_valid ? 
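/*
 * [Editor's illustration, not part of this patch] The start_poll/next_poll/
 * end_poll callbacks above implement the extended-CQ polling API.  A consumer
 * loop looks roughly like this; the byte_len/timestamp reads are only valid
 * when the CQ was created with the matching IBV_WC_EX_* flags, and the helper
 * name is an assumption.
 */
#include <errno.h>
#include <stdint.h>
#include <infiniband/verbs.h>

static int example_poll_cq_ex(struct ibv_cq_ex *cq_ex)
{
	struct ibv_poll_cq_attr attr = {};
	int ret = ibv_start_poll(cq_ex, &attr);	/* irdma_start_poll() */

	if (ret)
		return ret == ENOENT ? 0 : ret;	/* empty CQ is not an error */

	do {
		if (cq_ex->status == IBV_WC_SUCCESS) {
			enum ibv_wc_opcode op = ibv_wc_read_opcode(cq_ex);
			uint32_t len = ibv_wc_read_byte_len(cq_ex);
			uint64_t ts = ibv_wc_read_completion_ts(cq_ex);

			(void)op; (void)len; (void)ts;
		}
		ret = ibv_next_poll(cq_ex);	/* irdma_next_poll() */
	} while (!ret);

	ibv_end_poll(cq_ex);			/* irdma_end_poll() drops the CQ lock */

	return ret == ENOENT ? 0 : ret;
}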
htonl(cur_cqe->imm_data) : 0; +} + +static uint32_t irdma_wc_read_qp_num(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + + return iwucq->cur_cqe.qp_id; +} + +static uint32_t irdma_wc_read_src_qp(struct ibv_cq_ex *ibvcq_ex) +{ + struct irdma_ucq *iwucq = to_irdma_ucq_ex(ibvcq_ex); + struct irdma_cq_poll_info *cur_cqe = &iwucq->cur_cqe; + struct irdma_qp_uk *qp = cur_cqe->qp_handle; + struct ibv_qp *ib_qp = qp->back_qp; + + return ib_qp->qp_type == IBV_QPT_UD ? cur_cqe->ud_src_qpn : cur_cqe->qp_id; +} + +static uint32_t irdma_wc_read_slid(struct ibv_cq_ex *ibvcq_ex) +{ + return 0; +} + +static uint8_t irdma_wc_read_sl(struct ibv_cq_ex *ibvcq_ex) +{ + return 0; +} + +static uint8_t irdma_wc_read_dlid_path_bits(struct ibv_cq_ex *ibvcq_ex) +{ + return 0; +} + +void irdma_ibvcq_ex_fill_priv_funcs(struct irdma_ucq *iwucq, + struct ibv_cq_init_attr_ex *attr_ex) +{ + struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex; + + ibvcq_ex->start_poll = irdma_start_poll; + ibvcq_ex->end_poll = irdma_end_poll; + ibvcq_ex->next_poll = irdma_next_poll; + + if (attr_ex->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP) { + ibvcq_ex->read_completion_ts = irdma_wc_read_completion_ts; + iwucq->report_rtt = true; + } + if (attr_ex->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK) { + ibvcq_ex->read_completion_wallclock_ns = irdma_wc_read_completion_wallclock_ns; + iwucq->report_rtt = true; + } + + ibvcq_ex->read_opcode = irdma_wc_read_opcode; + ibvcq_ex->read_vendor_err = irdma_wc_read_vendor_err; + ibvcq_ex->read_wc_flags = irdma_wc_read_wc_flags; + + if (attr_ex->wc_flags & IBV_WC_EX_WITH_BYTE_LEN) + ibvcq_ex->read_byte_len = irdma_wc_read_byte_len; + if (attr_ex->wc_flags & IBV_WC_EX_WITH_IMM) + ibvcq_ex->read_imm_data = irdma_wc_read_imm_data; + if (attr_ex->wc_flags & IBV_WC_EX_WITH_QP_NUM) + ibvcq_ex->read_qp_num = irdma_wc_read_qp_num; + if (attr_ex->wc_flags & IBV_WC_EX_WITH_SRC_QP) + ibvcq_ex->read_src_qp = irdma_wc_read_src_qp; + if (attr_ex->wc_flags & IBV_WC_EX_WITH_SLID) + ibvcq_ex->read_slid = irdma_wc_read_slid; + if (attr_ex->wc_flags & IBV_WC_EX_WITH_SL) + ibvcq_ex->read_sl = irdma_wc_read_sl; + if (attr_ex->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) + ibvcq_ex->read_dlid_path_bits = irdma_wc_read_dlid_path_bits; +} + +/** + * irdma_arm_cq - arm of cq + * @iwucq: cq to which arm + * @cq_notify: notification params + */ +static void irdma_arm_cq(struct irdma_ucq *iwucq, + enum irdma_cmpl_notify cq_notify) +{ + iwucq->is_armed = true; + iwucq->arm_sol = true; + iwucq->skip_arm = false; + iwucq->skip_sol = true; + iwucq->cq.ops.iw_cq_request_notification(&iwucq->cq, cq_notify); +} + +/** + * irdma_uarm_cq - callback for arm of cq + * @cq: cq to arm + * @solicited: to get notify params + */ +int irdma_uarm_cq(struct ibv_cq *cq, int solicited) +{ + struct irdma_ucq *iwucq; + enum irdma_cmpl_notify cq_notify = IRDMA_CQ_COMPL_EVENT; + int ret; + + iwucq = to_irdma_ucq(cq); + if (solicited) + cq_notify = IRDMA_CQ_COMPL_SOLICITED; + + ret = pthread_spin_lock(&iwucq->lock); + if (ret) + return ret; + + if (iwucq->is_armed) { + if (iwucq->arm_sol && !solicited) { + irdma_arm_cq(iwucq, cq_notify); + } else { + iwucq->skip_arm = true; + iwucq->skip_sol = solicited ? 
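/*
 * [Editor's illustration, not part of this patch] The arm/event helpers in
 * this area back ibv_req_notify_cq() (irdma_uarm_cq) and the completion
 * channel flow.  A typical wait-for-event sequence, with abbreviated error
 * handling and a hypothetical helper name:
 */
#include <infiniband/verbs.h>

static int example_wait_for_completion(struct ibv_comp_channel *chan,
					struct ibv_cq *cq)
{
	struct ibv_cq *ev_cq;
	void *ev_ctx;

	/* Arm the CQ; 0 = notify on any completion, 1 = solicited only. */
	if (ibv_req_notify_cq(cq, 0))
		return -1;

	/* Block until the kernel signals the channel, then acknowledge. */
	if (ibv_get_cq_event(chan, &ev_cq, &ev_ctx))
		return -1;
	ibv_ack_cq_events(ev_cq, 1);

	/* Re-arm before draining so no completion window is missed. */
	if (ibv_req_notify_cq(ev_cq, 0))
		return -1;

	return 0;
}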
true : false; + } + } else { + irdma_arm_cq(iwucq, cq_notify); + } + + pthread_spin_unlock(&iwucq->lock); + + return 0; +} + +/** + * irdma_cq_event - cq to do completion event + * @cq: cq to arm + */ +void irdma_cq_event(struct ibv_cq *cq) +{ + struct irdma_ucq *iwucq; + + iwucq = to_irdma_ucq(cq); + if (pthread_spin_lock(&iwucq->lock)) + return; + + if (iwucq->skip_arm) + irdma_arm_cq(iwucq, IRDMA_CQ_COMPL_EVENT); + else + iwucq->is_armed = false; + + pthread_spin_unlock(&iwucq->lock); +} + +/** + * irdma_destroy_vmapped_qp - destroy resources for qp + * @iwuqp: qp struct for resources + */ +static int irdma_destroy_vmapped_qp(struct irdma_uqp *iwuqp) +{ + int ret; + + ret = ibv_cmd_destroy_qp(&iwuqp->ibv_qp); + if (ret) + return ret; + + if (iwuqp->qp.push_db) + munmap(iwuqp->qp.push_db, IRDMA_HW_PAGE_SIZE); + if (iwuqp->qp.push_wqe) + munmap(iwuqp->qp.push_wqe, IRDMA_HW_PAGE_SIZE); + + ibv_cmd_dereg_mr(&iwuqp->vmr); + + return 0; +} + +/** + * irdma_vmapped_qp - create resources for qp + * @iwuqp: qp struct for resources + * @pd: pd for the qp + * @attr: attributes of qp passed + * @resp: response back from create qp + * @sqdepth: depth of sq + * @rqdepth: depth of rq + * @info: info for initializing user level qp + * @abi_ver: abi version of the create qp command + */ +static int irdma_vmapped_qp(struct irdma_uqp *iwuqp, struct ibv_pd *pd, + struct ibv_qp_init_attr *attr, int sqdepth, + int rqdepth, struct irdma_qp_uk_init_info *info, + int abi_ver) +{ + struct irdma_ucreate_qp cmd = {}; + struct i40iw_ucreate_qp cmd_legacy = {}; + int sqsize, rqsize, totalqpsize; + struct irdma_ucreate_qp_resp resp = {}; + struct i40iw_ucreate_qp_resp resp_legacy = {}; + struct irdma_ureg_mr reg_mr_cmd = {}; + struct ib_uverbs_reg_mr_resp reg_mr_resp = {}; + int ret; + + sqsize = roundup(sqdepth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE); + rqsize = roundup(rqdepth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE); + totalqpsize = rqsize + sqsize + IRDMA_DB_SHADOW_AREA_SIZE; + info->sq = memalign(IRDMA_HW_PAGE_SIZE, totalqpsize); + + if (!info->sq) { + fprintf(stderr, PFX "%s: failed to allocate memory for SQ\n", + __func__); + return ENOMEM; + } + + memset(info->sq, 0, totalqpsize); + info->rq = &info->sq[sqsize / IRDMA_QP_WQE_MIN_SIZE]; + info->shadow_area = info->rq[rqsize / IRDMA_QP_WQE_MIN_SIZE].elem; + + reg_mr_cmd.reg_type = IW_MEMREG_TYPE_QP; + reg_mr_cmd.sq_pages = sqsize >> IRDMA_HW_PAGE_SHIFT; + reg_mr_cmd.rq_pages = rqsize >> IRDMA_HW_PAGE_SHIFT; + + ret = ibv_cmd_reg_mr(pd, info->sq, totalqpsize, + (uintptr_t)info->sq, IBV_ACCESS_LOCAL_WRITE, + &iwuqp->vmr, &reg_mr_cmd.ibv_cmd, + sizeof(reg_mr_cmd), &reg_mr_resp, + sizeof(reg_mr_resp)); + if (ret) { + fprintf(stderr, PFX "%s: failed to pin memory for SQ\n", + __func__); + free(info->sq); + return ret; + } + + /* GEN_1 legacy support with i40iw */ + if (abi_ver <= 5) { + cmd_legacy.user_wqe_bufs = (__u64)((uintptr_t)info->sq); + cmd_legacy.user_compl_ctx = (__u64)(uintptr_t)&iwuqp->qp; + ret = ibv_cmd_create_qp(pd, &iwuqp->ibv_qp, attr, + &cmd_legacy.ibv_cmd, sizeof(cmd_legacy), + &resp_legacy.ibv_resp, + sizeof(struct i40iw_ucreate_qp_resp)); + if (ret) + goto error; + info->sq_size = resp_legacy.actual_sq_size; + info->rq_size = resp_legacy.actual_rq_size; + info->first_sq_wq = 1; + info->qp_caps = 0; + info->qp_id = resp_legacy.qp_id; + iwuqp->irdma_drv_opt = resp_legacy.i40iw_drv_opt; + iwuqp->ibv_qp.qp_num = resp_legacy.qp_id; + } else { + cmd.user_wqe_bufs = (__u64)((uintptr_t)info->sq); + cmd.user_compl_ctx = (__u64)(uintptr_t)&iwuqp->qp; +
ret = ibv_cmd_create_qp(pd, &iwuqp->ibv_qp, attr, &cmd.ibv_cmd, + sizeof(cmd), &resp.ibv_resp, + sizeof(struct irdma_ucreate_qp_resp)); + if (ret) + goto error; + info->sq_size = resp.actual_sq_size; + info->rq_size = resp.actual_rq_size; + info->first_sq_wq = resp.lsmm; + info->qp_caps = resp.qp_caps; + info->qp_id = resp.qp_id; + iwuqp->irdma_drv_opt = resp.irdma_drv_opt; + iwuqp->ibv_qp.qp_num = resp.qp_id; + } + + iwuqp->send_cq = to_irdma_ucq(attr->send_cq); + iwuqp->recv_cq = to_irdma_ucq(attr->recv_cq); + iwuqp->send_cq->uqp = iwuqp; + iwuqp->recv_cq->uqp = iwuqp; + + return 0; +error: + fprintf(stderr, PFX "%s: failed to create QP, status %d\n", __func__, ret); + ibv_cmd_dereg_mr(&iwuqp->vmr); + free(info->sq); + return ret; +} + +/** + * irdma_ucreate_qp - create qp on user app + * @pd: pd for the qp + * @attr: attributes of the qp to be created (sizes, sge, cq) + */ +struct ibv_qp *irdma_ucreate_qp(struct ibv_pd *pd, + struct ibv_qp_init_attr *attr) +{ + struct irdma_uvcontext *iwvctx = to_irdma_uctx(pd->context); + struct irdma_uk_attrs *uk_attrs = &iwvctx->uk_attrs; + struct irdma_qp_uk_init_info info = {}; + struct irdma_uqp *iwuqp; + __u32 sqdepth, rqdepth; + __u8 sqshift, rqshift; + int status; + + if (attr->qp_type != IBV_QPT_RC && attr->qp_type != IBV_QPT_UD) { + fprintf(stderr, PFX "%s: failed to create QP, unsupported QP type: 0x%x\n", + __func__, attr->qp_type); + return NULL; + } + + if (attr->cap.max_send_sge > uk_attrs->max_hw_wq_frags || + attr->cap.max_recv_sge > uk_attrs->max_hw_wq_frags || + attr->cap.max_inline_data > uk_attrs->max_hw_inline) + return NULL; + + irdma_get_wqe_shift(uk_attrs, + uk_attrs->hw_rev > IRDMA_GEN_1 ? attr->cap.max_send_sge + 1 : + attr->cap.max_send_sge, + attr->cap.max_inline_data, &sqshift); + status = irdma_get_sqdepth(uk_attrs, attr->cap.max_send_wr, sqshift, + &sqdepth); + if (status) { + fprintf(stderr, PFX "%s: invalid SQ attributes, max_send_wr=%d max_send_sge=%d max_inline=%d\n", + __func__, attr->cap.max_send_wr, attr->cap.max_send_sge, + attr->cap.max_inline_data); + return NULL; + } + + if (uk_attrs->hw_rev == IRDMA_GEN_1 && iwvctx->abi_ver > 4) + rqshift = IRDMA_MAX_RQ_WQE_SHIFT_GEN1; + else + irdma_get_wqe_shift(uk_attrs, attr->cap.max_recv_sge, 0, + &rqshift); + + status = irdma_get_rqdepth(uk_attrs, attr->cap.max_recv_wr, rqshift, + &rqdepth); + if (status) { + fprintf(stderr, PFX "%s: invalid RQ attributes, recv_wr=%d recv_sge=%d\n", + __func__, attr->cap.max_recv_wr, attr->cap.max_recv_sge); + return NULL; + } + + iwuqp = memalign(1024, sizeof(*iwuqp)); + if (!iwuqp) + return NULL; + + memset(iwuqp, 0, sizeof(*iwuqp)); + + if (pthread_spin_init(&iwuqp->lock, PTHREAD_PROCESS_PRIVATE)) + goto err_free_qp; + + info.sq_size = sqdepth >> sqshift; + info.rq_size = rqdepth >> rqshift; + attr->cap.max_send_wr = info.sq_size; + attr->cap.max_recv_wr = info.rq_size; + + info.uk_attrs = uk_attrs; + info.max_sq_frag_cnt = attr->cap.max_send_sge; + info.max_rq_frag_cnt = attr->cap.max_recv_sge; + iwuqp->recv_sges = calloc(attr->cap.max_recv_sge, sizeof(*iwuqp->recv_sges)); + if (!iwuqp->recv_sges) + goto err_destroy_lock; + + info.wqe_alloc_db = (__u32 *)iwvctx->db; + info.abi_ver = iwvctx->abi_ver; + info.sq_wrtrk_array = calloc(sqdepth, sizeof(*info.sq_wrtrk_array)); + if (!info.sq_wrtrk_array) { + fprintf(stderr, PFX "%s: failed to allocate memory for SQ work array\n", + __func__); + goto err_free_rsges; + } + + info.rq_wrid_array = calloc(rqdepth, sizeof(*info.rq_wrid_array)); + if (!info.rq_wrid_array) { + fprintf(stderr, PFX 
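/*
 * [Editor's illustration, not part of this patch] Sketch of the application
 * call that reaches irdma_ucreate_qp().  The capability values are example
 * assumptions; on success the provider writes the actual (rounded) queue
 * depths back into attr.cap, as the code above does before returning.
 */
#include <infiniband/verbs.h>

static struct ibv_qp *example_create_rc_qp(struct ibv_pd *pd,
					   struct ibv_cq *send_cq,
					   struct ibv_cq *recv_cq)
{
	struct ibv_qp_init_attr attr = {
		.send_cq = send_cq,
		.recv_cq = recv_cq,
		.qp_type = IBV_QPT_RC,
		.sq_sig_all = 0,
		.cap = {
			.max_send_wr = 64,
			.max_recv_wr = 64,
			.max_send_sge = 2,
			.max_recv_sge = 2,
			.max_inline_data = 64,
		},
	};

	return ibv_create_qp(pd, &attr);
}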
"%s: failed to allocate memory for RQ work array\n", + __func__); + goto err_free_sq_wrtrk; + } + + iwuqp->sq_sig_all = attr->sq_sig_all; + iwuqp->qp_type = attr->qp_type; + status = irdma_vmapped_qp(iwuqp, pd, attr, sqdepth, rqdepth, &info, + iwvctx->abi_ver); + if (status) { + fprintf(stderr, PFX "%s: failed to map QP\n", __func__); + goto err_free_rq_wrid; + } + + iwuqp->qp.back_qp = iwuqp; + iwuqp->qp.lock = &iwuqp->lock; + + info.max_sq_frag_cnt = attr->cap.max_send_sge; + info.max_rq_frag_cnt = attr->cap.max_recv_sge; + info.max_inline_data = attr->cap.max_inline_data; + status = iwvctx->dev.ops_uk.iw_qp_uk_init(&iwuqp->qp, &info); + if (!status) { + attr->cap.max_send_wr = (sqdepth - IRDMA_SQ_RSVD) >> sqshift; + attr->cap.max_recv_wr = (rqdepth - IRDMA_RQ_RSVD) >> rqshift; + return &iwuqp->ibv_qp; + } + + irdma_destroy_vmapped_qp(iwuqp); + free(info.sq); +err_free_rq_wrid: + free(info.rq_wrid_array); +err_free_sq_wrtrk: + free(info.sq_wrtrk_array); +err_free_rsges: + free(iwuqp->recv_sges); +err_destroy_lock: + pthread_spin_destroy(&iwuqp->lock); +err_free_qp: + free(iwuqp); + + return NULL; +} + +/** + * irdma_uquery_qp - query qp for some attribute + * @qp: qp for the attributes query + * @attr: to return the attributes + * @attr_mask: mask of what is query for + * @init_attr: initial attributes during create_qp + */ +int irdma_uquery_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask, + struct ibv_qp_init_attr *init_attr) +{ + struct ibv_query_qp cmd; + + return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd, + sizeof(cmd)); +} + +/** + * irdma_umodify_qp - send qp modify to driver + * @qp: qp to modify + * @attr: attribute to modify + * @attr_mask: mask of the attribute + */ +int irdma_umodify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) +{ + struct irdma_umodify_qp_resp resp = {}; + struct ibv_modify_qp cmd = {}; + struct irdma_modify_qp_cmd cmd_ex = {}; + struct irdma_uqp *iwuqp = to_irdma_uqp(qp); + struct irdma_uvcontext *iwctx = to_irdma_uctx(qp->context); + + iwuqp->attr_mask = attr_mask; + memcpy(&iwuqp->attr, attr, sizeof(iwuqp->attr)); + + if (iwctx->uk_attrs.hw_rev > IRDMA_GEN_1 && attr_mask & IBV_QP_STATE) { + __u64 offset; + void *map; + int ret; + + ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex.ibv_cmd, + sizeof(cmd_ex), &resp.ibv_resp, sizeof(resp)); + if (ret || !resp.push_valid) + return ret; + + if (iwuqp->qp.push_wqe) + return ret; + + offset = resp.push_wqe_mmap_key; + map = mmap(NULL, IRDMA_HW_PAGE_SIZE, PROT_WRITE | PROT_READ, + MAP_SHARED, qp->context->cmd_fd, offset); + if (map == MAP_FAILED) { + fprintf(stderr, PFX "failed to map push page, errno %d\n", errno); + } else { + iwuqp->qp.push_wqe = map; + + offset = resp.push_db_mmap_key; + map = mmap(NULL, IRDMA_HW_PAGE_SIZE, + PROT_WRITE | PROT_READ, MAP_SHARED, + qp->context->cmd_fd, offset); + if (map == MAP_FAILED) { + fprintf(stderr, PFX "failed to map push doorbell, errno %d\n", + errno); + munmap(iwuqp->qp.push_wqe, IRDMA_HW_PAGE_SIZE); + iwuqp->qp.push_wqe = NULL; + } else { + iwuqp->qp.push_wqe += resp.push_offset; + iwuqp->qp.push_db = map + resp.push_offset; + } + } + + return ret; + } else { + return ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd)); + } + +} + +static void irdma_issue_flush(struct ibv_qp *qp, bool sq_flush, bool rq_flush) +{ + struct ib_uverbs_ex_modify_qp_resp resp = {}; + struct irdma_modify_qp_cmd cmd_ex = {}; + struct irdma_uqp *iwuqp = to_irdma_uqp(qp); + int ret; + + cmd_ex.sq_flush = sq_flush; + cmd_ex.rq_flush = 
rq_flush; + + ret = ibv_cmd_modify_qp_ex(qp, &iwuqp->attr, iwuqp->attr_mask, + &cmd_ex.ibv_cmd, sizeof(cmd_ex), + &resp, sizeof(resp)); + if (ret) + fprintf(stderr, PFX "failed to flush SQ [%d] RQ [%d], errno %d\n", + sq_flush, rq_flush, ret); +} + +/** + * irdma_clean_cqes - clean cq entries for qp + * @qp: qp for which completions are cleaned + * @iwcq: cq to be cleaned + */ +static void irdma_clean_cqes(struct irdma_qp_uk *qp, struct irdma_ucq *iwucq) +{ + struct irdma_cq_uk *ukcq = &iwucq->cq; + int ret; + + ret = pthread_spin_lock(&iwucq->lock); + if (ret) { + fprintf(stderr, "irdma: Unable to clean cqes\n"); + return; + } + ukcq->ops.iw_cq_clean(qp, ukcq); + pthread_spin_unlock(&iwucq->lock); +} + +/** + * irdma_udestroy_qp - destroy qp + * @qp: qp to destroy + */ +int irdma_udestroy_qp(struct ibv_qp *qp) +{ + struct irdma_uqp *iwuqp = to_irdma_uqp(qp); + int ret; + + ret = pthread_spin_destroy(&iwuqp->lock); + if (ret) + goto err; + + ret = irdma_destroy_vmapped_qp(iwuqp); + if (ret) + goto err; + + /* Clean any pending completions from the cq(s) */ + if (iwuqp->send_cq) + irdma_clean_cqes(&iwuqp->qp, iwuqp->send_cq); + + if (iwuqp->recv_cq && iwuqp->recv_cq != iwuqp->send_cq) + irdma_clean_cqes(&iwuqp->qp, iwuqp->recv_cq); + + if (iwuqp->qp.sq_wrtrk_array) + free(iwuqp->qp.sq_wrtrk_array); + if (iwuqp->qp.rq_wrid_array) + free(iwuqp->qp.rq_wrid_array); + + free(iwuqp->qp.sq_base); + free(iwuqp->recv_sges); + free(iwuqp); + return 0; + +err: + fprintf(stderr, PFX "%s: failed to destroy QP, status %d\n", + __func__, ret); + + return ret; +} + +/** + * irdma_copy_sg_list - copy sg list for qp + * @sg_list: copied into sg_list + * @sgl: copy from sgl + * @num_sges: count of sg entries + * @max_sges: count of max supported sg entries + */ +static void irdma_copy_sg_list(struct irdma_sge *sg_list, struct ibv_sge *sgl, + int num_sges) +{ + int i; + + for (i = 0; i < num_sges; i++) { + sg_list[i].tag_off = sgl[i].addr; + sg_list[i].len = sgl[i].length; + sg_list[i].stag = sgl[i].lkey; + } +} + +/** + * calc_type2_mw_stag - calculate type 2 MW stag + * @rkey: desired rkey of the MW + * @mw_rkey: type2 memory window rkey + * + * compute type2 memory window stag by taking lower 8 bits + * of the desired rkey and leaving 24 bits if mw->rkey unchanged + */ +static inline __u32 calc_type2_mw_stag(__u32 rkey, __u32 mw_rkey) +{ + const __u32 mask = 0xff; + + return (rkey & mask) | (mw_rkey & ~mask); +} + +/** + * irdma_post_send - post send wr for user application + * @ib_qp: qp to post wr + * @ib_wr: work request ptr + * @bad_wr: return of bad wr if err + */ +int irdma_upost_send(struct ibv_qp *ib_qp, struct ibv_send_wr *ib_wr, + struct ibv_send_wr **bad_wr) +{ + struct irdma_uqp *iwuqp; + struct irdma_post_sq_info info; + struct irdma_uvcontext *iwvctx = to_irdma_uctx(ib_qp->context); + struct irdma_uk_attrs *uk_attrs = &iwvctx->uk_attrs; + enum irdma_status_code ret = 0; + bool reflush = false; + int err = 0; + + iwuqp = (struct irdma_uqp *)ib_qp; + + err = pthread_spin_lock(&iwuqp->lock); + if (err) + return err; + + if (!IRDMA_RING_MORE_WORK(iwuqp->qp.sq_ring) && + ib_qp->state == IBV_QPS_ERR) + reflush = true; + + while (ib_wr) { + memset(&info, 0, sizeof(info)); + info.wr_id = (__u64)(ib_wr->wr_id); + if ((ib_wr->send_flags & IBV_SEND_SIGNALED) || + iwuqp->sq_sig_all) + info.signaled = true; + if (ib_wr->send_flags & IBV_SEND_FENCE) + info.read_fence = true; + if (iwuqp->send_cq->report_rtt) + info.report_rtt = true; + + switch (ib_wr->opcode) { + case IBV_WR_SEND_WITH_IMM: + if 
(iwuqp->qp.qp_caps & IRDMA_SEND_WITH_IMM) { + info.imm_data_valid = true; + info.imm_data = ntohl(ib_wr->imm_data); + } else { + err = EINVAL; + break; + } + SWITCH_FALLTHROUGH; + case IBV_WR_SEND: + case IBV_WR_SEND_WITH_INV: + if (ib_wr->opcode == IBV_WR_SEND || + ib_wr->opcode == IBV_WR_SEND_WITH_IMM) { + if (ib_wr->send_flags & IBV_SEND_SOLICITED) + info.op_type = IRDMA_OP_TYPE_SEND_SOL; + else + info.op_type = IRDMA_OP_TYPE_SEND; + } else { + if (ib_wr->send_flags & IBV_SEND_SOLICITED) + info.op_type = IRDMA_OP_TYPE_SEND_SOL_INV; + else + info.op_type = IRDMA_OP_TYPE_SEND_INV; + info.stag_to_inv = ib_wr->invalidate_rkey; + } + if (ib_wr->send_flags & IBV_SEND_INLINE) { + info.op.inline_send.data = (void *)(uintptr_t)ib_wr->sg_list[0].addr; + info.op.inline_send.len = ib_wr->sg_list[0].length; + if (ib_qp->qp_type == IBV_QPT_UD) { + struct irdma_uah *ah = to_irdma_uah(ib_wr->wr.ud.ah); + + info.op.inline_send.ah_id = ah->ah_id; + info.op.inline_send.qkey = ib_wr->wr.ud.remote_qkey; + info.op.inline_send.dest_qp = ib_wr->wr.ud.remote_qpn; + ret = iwuqp->qp.qp_ops.iw_inline_send(&iwuqp->qp, &info, false); + } else { + ret = iwuqp->qp.qp_ops.iw_inline_send( + &iwuqp->qp, &info, false); + } + } else { + info.op.send.num_sges = ib_wr->num_sge; + info.op.send.sg_list = (struct irdma_sge *)ib_wr->sg_list; + if (ib_qp->qp_type == IBV_QPT_UD) { + struct irdma_uah *ah = to_irdma_uah(ib_wr->wr.ud.ah); + + info.op.inline_send.ah_id = ah->ah_id; + info.op.inline_send.qkey = ib_wr->wr.ud.remote_qkey; + info.op.inline_send.dest_qp = ib_wr->wr.ud.remote_qpn; + ret = iwuqp->qp.qp_ops.iw_send(&iwuqp->qp, &info, false); + } else { + ret = iwuqp->qp.qp_ops.iw_send( + &iwuqp->qp, &info, false); + } + } + if (ret) { + if (ret == IRDMA_ERR_QP_TOOMANY_WRS_POSTED) + err = ENOMEM; + else + err = EINVAL; + } + break; + case IBV_WR_RDMA_WRITE_WITH_IMM: + if (iwuqp->qp.qp_caps & IRDMA_WRITE_WITH_IMM) { + info.imm_data_valid = true; + info.imm_data = ntohl(ib_wr->imm_data); + } else { + err = EINVAL; + break; + } + SWITCH_FALLTHROUGH; + case IBV_WR_RDMA_WRITE: + if (ib_wr->send_flags & IBV_SEND_SOLICITED) + info.op_type = IRDMA_OP_TYPE_RDMA_WRITE_SOL; + else + info.op_type = IRDMA_OP_TYPE_RDMA_WRITE; + + if (ib_wr->send_flags & IBV_SEND_INLINE) { + info.op.inline_rdma_write.data = (void *)(uintptr_t)ib_wr->sg_list[0].addr; + info.op.inline_rdma_write.len = ib_wr->sg_list[0].length; + info.op.inline_rdma_write.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr; + info.op.inline_rdma_write.rem_addr.stag = ib_wr->wr.rdma.rkey; + ret = iwuqp->qp.qp_ops.iw_inline_rdma_write(&iwuqp->qp, &info, false); + } else { + info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list; + info.op.rdma_write.num_lo_sges = ib_wr->num_sge; + info.op.rdma_write.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr; + info.op.rdma_write.rem_addr.stag = ib_wr->wr.rdma.rkey; + ret = iwuqp->qp.qp_ops.iw_rdma_write(&iwuqp->qp, &info, false); + } + if (ret) { + if (ret == IRDMA_ERR_QP_TOOMANY_WRS_POSTED) + err = ENOMEM; + else + err = EINVAL; + } + break; + case IBV_WR_RDMA_READ: + if (ib_wr->num_sge > uk_attrs->max_hw_read_sges) { + err = EINVAL; + break; + } + info.op_type = IRDMA_OP_TYPE_RDMA_READ; + info.op.rdma_read.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr; + info.op.rdma_read.rem_addr.stag = ib_wr->wr.rdma.rkey; + + info.op.rdma_read.lo_sg_list = (void *)ib_wr->sg_list; + info.op.rdma_read.num_lo_sges = ib_wr->num_sge; + ret = iwuqp->qp.qp_ops.iw_rdma_read(&iwuqp->qp, &info, + false, false); + if (ret) { + if (ret == IRDMA_ERR_QP_TOOMANY_WRS_POSTED) 
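/*
 * [Editor's illustration, not part of this patch] The opcode handling above
 * is exercised by posts like the following: an RDMA WRITE built from one SGE
 * and dispatched through ibv_post_send() into irdma_upost_send().  The
 * remote_addr/rkey come from the peer; wr_id and the helper name are
 * placeholders.
 */
#include <stdint.h>
#include <infiniband/verbs.h>

static int example_post_rdma_write(struct ibv_qp *qp, struct ibv_mr *mr,
				   void *buf, uint32_t len,
				   uint64_t remote_addr, uint32_t rkey)
{
	struct ibv_sge sge = {
		.addr = (uintptr_t)buf,
		.length = len,
		.lkey = mr->lkey,
	};
	struct ibv_send_wr wr = {
		.wr_id = 0x1234,
		.sg_list = &sge,
		.num_sge = 1,
		.opcode = IBV_WR_RDMA_WRITE,
		.send_flags = IBV_SEND_SIGNALED,
		.wr.rdma.remote_addr = remote_addr,
		.wr.rdma.rkey = rkey,
	};
	struct ibv_send_wr *bad_wr;

	return ibv_post_send(qp, &wr, &bad_wr);
}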
+ err = ENOMEM; + else + err = EINVAL; + } + break; + case IBV_WR_BIND_MW: + if (ib_qp->qp_type != IBV_QPT_RC) { + err = EINVAL; + break; + } + info.op_type = IRDMA_OP_TYPE_BIND_MW; + info.op.bind_window.mr_stag = ib_wr->bind_mw.bind_info.mr->rkey; + if (ib_wr->bind_mw.mw->type == IBV_MW_TYPE_1) { + info.op.bind_window.mem_window_type_1 = true; + info.op.bind_window.mw_stag = ib_wr->bind_mw.rkey; + } else { + struct verbs_mr *vmr = verbs_get_mr(ib_wr->bind_mw.bind_info.mr); + struct irdma_umr *umr = container_of(vmr, struct irdma_umr, vmr); + + if (umr->acc_flags & IBV_ACCESS_ZERO_BASED) { + err = EINVAL; + break; + } + info.op.bind_window.mw_stag = + calc_type2_mw_stag(ib_wr->bind_mw.rkey, ib_wr->bind_mw.mw->rkey); + ib_wr->bind_mw.mw->rkey = info.op.bind_window.mw_stag; + + } + + if (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_ZERO_BASED) { + info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_ZERO_BASED; + info.op.bind_window.va = (void *)(uintptr_t)0; + } else { + info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_VA_BASED; + info.op.bind_window.va = (void *)(uintptr_t)ib_wr->bind_mw.bind_info.addr; + } + info.op.bind_window.bind_len = ib_wr->bind_mw.bind_info.length; + info.op.bind_window.ena_reads = + (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_READ) ? 1 : 0; + info.op.bind_window.ena_writes = + (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_WRITE) ? 1 : 0; + + ret = iwuqp->qp.qp_ops.iw_mw_bind(&iwuqp->qp, &info, false); + if (ret) { + if (ret == IRDMA_ERR_QP_TOOMANY_WRS_POSTED) + err = ENOMEM; + else + err = EINVAL; + } + break; + case IBV_WR_LOCAL_INV: + info.op_type = IRDMA_OP_TYPE_INV_STAG; + info.op.inv_local_stag.target_stag = ib_wr->invalidate_rkey; + ret = iwuqp->qp.qp_ops.iw_stag_local_invalidate(&iwuqp->qp, &info, true); + if (ret) + err = ENOMEM; + break; + default: + /* error */ + err = EINVAL; + fprintf(stderr, PFX "%s: post work request failed, invalid opcode: 0x%x\n", + __func__, ib_wr->opcode); + break; + } + if (err) + break; + + ib_wr = ib_wr->next; + } + + if (err) + *bad_wr = ib_wr; + + iwuqp->qp.qp_ops.iw_qp_post_wr(&iwuqp->qp); + if (reflush) + irdma_issue_flush(ib_qp, 1, 0); + + pthread_spin_unlock(&iwuqp->lock); + + return err; +} + +/** + * irdma_post_recv - post receive wr for user application + * @ib_wr: work request for receive + * @bad_wr: bad wr caused an error + */ +int irdma_upost_recv(struct ibv_qp *ib_qp, struct ibv_recv_wr *ib_wr, + struct ibv_recv_wr **bad_wr) +{ + struct irdma_uqp *iwuqp = to_irdma_uqp(ib_qp); + enum irdma_status_code ret = 0; + struct irdma_post_rq_info post_recv = {}; + struct irdma_sge *sg_list; + bool reflush = false; + int err = 0; + + sg_list = iwuqp->recv_sges; + + err = pthread_spin_lock(&iwuqp->lock); + if (err) + return err; + + if (!IRDMA_RING_MORE_WORK(iwuqp->qp.rq_ring) && + ib_qp->state == IBV_QPS_ERR) + reflush = true; + + while (ib_wr) { + if (ib_wr->num_sge > iwuqp->qp.max_rq_frag_cnt) { + *bad_wr = ib_wr; + err = EINVAL; + goto error; + } + post_recv.num_sges = ib_wr->num_sge; + post_recv.wr_id = ib_wr->wr_id; + irdma_copy_sg_list(sg_list, ib_wr->sg_list, ib_wr->num_sge); + post_recv.sg_list = sg_list; + ret = iwuqp->qp.qp_ops.iw_post_receive(&iwuqp->qp, &post_recv); + if (ret) { + if (ret == IRDMA_ERR_QP_TOOMANY_WRS_POSTED) + err = ENOMEM; + else + err = EINVAL; + *bad_wr = ib_wr; + goto error; + } + + if (reflush) + irdma_issue_flush(ib_qp, 0, 1); + + ib_wr = ib_wr->next; + } +error: + pthread_spin_unlock(&iwuqp->lock); + + return err; +} + +/** + * 
irdma_ucreate_ah - create address handle associated with a pd + * @ibpd: pd for the address handle + * @attr: attributes of address handle + */ +struct ibv_ah *irdma_ucreate_ah(struct ibv_pd *ibpd, struct ibv_ah_attr *attr) +{ + struct irdma_uah *ah; + union ibv_gid sgid; + struct irdma_ucreate_ah_resp resp; + int err; + + err = ibv_query_gid(ibpd->context, attr->port_num, attr->grh.sgid_index, + &sgid); + if (err) { + fprintf(stderr, "irdma: Error from ibv_query_gid.\n"); + return NULL; + } + + ah = calloc(1, sizeof(*ah)); + if (!ah) + return NULL; + + if (ibv_cmd_create_ah(ibpd, &ah->ibv_ah, attr, &resp.ibv_resp, + sizeof(resp))) { + free(ah); + return NULL; + } + + ah->ah_id = resp.ah_id; + + return &ah->ibv_ah; +} + +/** + * irdma_udestroy_ah - destroy the address handle + * @ibah: address handle + */ +int irdma_udestroy_ah(struct ibv_ah *ibah) +{ + struct irdma_uah *ah; + int ret; + + ah = to_irdma_uah(ibah); + + ret = ibv_cmd_destroy_ah(ibah); + if (ret) + return ret; + + free(ah); + + return 0; +} + +/** + * irdma_uattach_mcast - Attach qp to multicast group + * @qp: The queue pair + * @gid: The Global ID for multicast group + * @lid: The Local ID + */ +int irdma_uattach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid) +{ + return ibv_cmd_attach_mcast(qp, gid, lid); +} + +/** + * irdma_udetach_mcast - Detach qp from multicast group + * @qp: The queue pair + * @gid: The Global ID for multicast group + * @lid: The Local ID + */ +int irdma_udetach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, + uint16_t lid) +{ + return ibv_cmd_detach_mcast(qp, gid, lid); +} + +/** + * irdma_uresize_cq - resizes a cq + * @cq: cq to resize + * @cqe: the number of cqes of the new cq + */ +int irdma_uresize_cq(struct ibv_cq *cq, int cqe) +{ + struct irdma_uvcontext *iwvctx; + struct irdma_uk_attrs *uk_attrs; + struct irdma_uresize_cq cmd = {}; + struct ib_uverbs_resize_cq_resp resp = {}; + struct irdma_ureg_mr reg_mr_cmd = {}; + struct ib_uverbs_reg_mr_resp reg_mr_resp = {}; + struct irdma_ucq *iwucq = to_irdma_ucq(cq); + struct irdma_cq_buf *cq_buf = NULL; + struct irdma_cqe *cq_base = NULL; + struct verbs_mr new_mr = {}; + __u32 cq_size; + __u32 cq_pages; + int cqe_needed; + int ret = 0; + + iwvctx = to_irdma_uctx(cq->context); + uk_attrs = &iwvctx->uk_attrs; + + if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE)) + return EOPNOTSUPP; + + if (cqe > IRDMA_MAX_CQ_SIZE) + return EINVAL; + + cqe_needed = cqe + 1; + if (uk_attrs->hw_rev > IRDMA_GEN_1) + cqe_needed *= 2; + + if (cqe_needed < IRDMA_U_MINCQ_SIZE) + cqe_needed = IRDMA_U_MINCQ_SIZE; + + if (cqe_needed == iwucq->cq.cq_size) + return 0; + + cq_size = roundup(cqe_needed * sizeof(struct irdma_cqe), IRDMA_HW_PAGE_SIZE); + cq_pages = cq_size >> IRDMA_HW_PAGE_SHIFT; + cq_base = memalign(IRDMA_HW_PAGE_SIZE, cq_size); + if (!cq_base) + goto err; + + memset(cq_base, 0, cq_size); + + cq_buf = malloc(sizeof(*cq_buf)); + if (!cq_buf) + goto err; + + new_mr.ibv_mr.pd = iwucq->vmr.ibv_mr.pd; + reg_mr_cmd.reg_type = IW_MEMREG_TYPE_CQ; + reg_mr_cmd.cq_pages = cq_pages; + + ret = ibv_cmd_reg_mr(new_mr.ibv_mr.pd, cq_base, cq_size, + (uintptr_t)cq_base, IBV_ACCESS_LOCAL_WRITE, + &new_mr, &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd), + &reg_mr_resp, sizeof(reg_mr_resp)); + if (ret) { + fprintf(stderr, "failed to pin memory for CQ ret = %d\n", ret); + goto err; + } + + ret = pthread_spin_lock(&iwucq->lock); + if (ret) { + ibv_cmd_dereg_mr(&new_mr); + goto err; + } + cmd.user_cq_buffer = (__u64)((uintptr_t)cq_base); + ret =
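/*
 * [Editor's illustration, not part of this patch] irdma_uresize_cq() above
 * backs ibv_resize_cq().  Resize is only offered when the device reports
 * IRDMA_FEATURE_CQ_RESIZE; the old ring is parked on iwucq->resize_list and
 * freed lazily as later polls drain it, so resizing with completions still
 * outstanding is safe.  Minimal sketch (helper name is an assumption):
 */
#include <infiniband/verbs.h>

static int example_grow_cq(struct ibv_cq *cq, int new_depth)
{
	int ret = ibv_resize_cq(cq, new_depth);

	if (!ret) {
		/* cq->cqe now reflects the requested depth. */
		return 0;
	}

	return ret;
}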
ibv_cmd_resize_cq(&iwucq->verbs_cq.cq, cqe_needed, &cmd.ibv_cmd, + sizeof(cmd), &resp, sizeof(resp)); + if (ret) { + pthread_spin_unlock(&iwucq->lock); + ibv_cmd_dereg_mr(&new_mr); + fprintf(stderr, "failed to resize CQ ret = %d\n", ret); + goto err; + } + + memcpy(&cq_buf->cq, &iwucq->cq, sizeof(cq_buf->cq)); + cq_buf->vmr = iwucq->vmr; + iwucq->vmr = new_mr; + iwucq->cq.ops.iw_cq_resize(&iwucq->cq, cq_base, + cqe_needed); + iwucq->verbs_cq.cq.cqe = cqe; + list_add_tail(&iwucq->resize_list, &cq_buf->list); + + pthread_spin_unlock(&iwucq->lock); + + return ret; +err: + if (cq_buf) + free(cq_buf); + if (cq_base) + free(cq_base); + return ret; +} -- 1.8.3.1