This is the simplest rdma (RoCE) loopback driver. It implements an rdma device on top of the 'lo' netdevice. Since data doesn't leave the system, it doesn't emulate any transport, network or link layers. It implements a fully functional verbs layer backed by a data copy engine. Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx> --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/sw/Makefile | 1 + drivers/infiniband/sw/loopback/Kconfig | 14 + drivers/infiniband/sw/loopback/Makefile | 4 + drivers/infiniband/sw/loopback/loopback.c | 1603 +++++++++++++++++++++++++++++ include/uapi/rdma/rdma_user_ioctl_cmds.h | 1 + 6 files changed, 1624 insertions(+) create mode 100644 drivers/infiniband/sw/loopback/Kconfig create mode 100644 drivers/infiniband/sw/loopback/Makefile create mode 100644 drivers/infiniband/sw/loopback/loopback.c diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index a1fb840d..1715ead 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -107,6 +107,7 @@ source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/qedr/Kconfig" source "drivers/infiniband/sw/rdmavt/Kconfig" source "drivers/infiniband/sw/rxe/Kconfig" +source "drivers/infiniband/sw/loopback/Kconfig" endif source "drivers/infiniband/ulp/ipoib/Kconfig" diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile index 8b095b2..f2e95f9 100644 --- a/drivers/infiniband/sw/Makefile +++ b/drivers/infiniband/sw/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/ obj-$(CONFIG_RDMA_RXE) += rxe/ +obj-$(CONFIG_RDMA_LOOPBACK) += loopback/ diff --git a/drivers/infiniband/sw/loopback/Kconfig b/drivers/infiniband/sw/loopback/Kconfig new file mode 100644 index 0000000..4aadcaa --- /dev/null +++ b/drivers/infiniband/sw/loopback/Kconfig @@ -0,0 +1,14 @@ +config RDMA_LOOPBACK + tristate "loopback (RoCE) driver" + depends on INET && INFINIBAND + depends on ARCH_DMA_ADDR_T_64BIT + select DMA_VIRT_OPS + help + This driver
implements the InfiniBand RDMA transport over + the Linux network lo netdevice. It enables a system to + use a standard lo (loopback) netdevice to emulate a completely + software RDMA driver. It doesn't implement any transport + layers. It implements only a data copier and verbs layer as + it works only on top of the local lo device. It follows the standard + InfiniBand specification version 1.3 and RoCE annex. This is a + zero configuration driver. diff --git a/drivers/infiniband/sw/loopback/Makefile b/drivers/infiniband/sw/loopback/Makefile new file mode 100644 index 0000000..84e530d --- /dev/null +++ b/drivers/infiniband/sw/loopback/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_RDMA_LOOPBACK) += rdma_loopback.o + +rdma_loopback-y := loopback.o helper.o diff --git a/drivers/infiniband/sw/loopback/loopback.c b/drivers/infiniband/sw/loopback/loopback.c new file mode 100644 index 0000000..238ecfb --- /dev/null +++ b/drivers/infiniband/sw/loopback/loopback.c @@ -0,0 +1,1603 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <net/addrconf.h> +#include <rdma/ib_verbs.h> +#include <rdma/ib_umem.h> +#include <rdma/ib_mad.h> + +#include "loopback_helper.h" + +enum { + LOOPBACK_MAX_MR = (1 << 24) - 1, + LOOPBACK_MAX_QP = (1 << 24) - 1, + LOOPBACK_MAX_CQ = LOOPBACK_MAX_QP, + LOOPBACK_MAX_AH = INT_MAX, + LOOPBACK_MAX_PD = INT_MAX, +}; + +struct loopback_uctx { + struct ib_ucontext ibuctx; +}; + +struct loopback_pd { + struct ib_pd ibpd; +}; + +struct loopback_cq { + /* resource entry must be first */ + struct loopback_resource res; + struct ib_cq ibcq; + struct loopback_fifo cqes; + enum ib_cq_notify_flags armed; +}; + +enum { + LOOPBACK_MR_TYPE_PHY = 0, /* access physical sges of kernel */ + LOOPBACK_MR_TYPE_USER = 1, /* userspace MR */ + LOOPBACK_MR_TYPE_DMA = 2, /* DMA MR of kernel */ + LOOPBACK_MR_TYPE_FRMR = 3, /* FRMR */ +}; + +struct loopback_usr_mr { + struct ib_umem *umem; + /* array of pages for this mr to access in datapath */ + struct page *pages; + size_t pages_alloc_size; +}; + +struct loopback_frmr { + int pg_iter; +}; + +struct loopback_mr { + struct loopback_resource res; + struct ib_mr ibmr; + int type; + int access; + + u32 fbo; + u64 *pg_tbl; + /* we store the page shift to make common use for frmr and user mr */ + u32 page_shift; + union { + struct loopback_usr_mr umr; + struct loopback_frmr frmr; + } u; +}; + +static u32 mr_id_to_mkey(u32 id) +{ + return id << 8; +} + +struct loopback_qp { + struct loopback_resource res; + struct ib_qp ibqp; + enum ib_qp_state state; + struct loopback_fifo rqes; + + gfp_t cqe_alloc_flags; + gfp_t rqe_alloc_flags; + bool user_qp; + struct ib_qp_attr attr; + struct ib_qp_init_attr init_attr; + +}; + +struct loopback_ah { + struct loopback_resource res; + struct ib_ah ibah; + struct rdma_ah_attr attr; +}; + +struct rdma_loopdev { + struct ib_device dev; + + struct loopback_resource_table mr_tbl; + struct loopback_resource_table cq_tbl; + 
struct loopback_resource_table qp_tbl; + struct loopback_resource_table ah_tbl; + + struct ib_port_attr port_attr; + struct loopback_mr zero_mr; +}; + +static inline struct rdma_loopdev *ib_to_loopdev(struct ib_device *dev) +{ + return container_of(dev, struct rdma_loopdev, dev); +} + +static inline struct loopback_mr *ib_to_loop_mr(struct ib_mr *ibmr) +{ + return container_of(ibmr, struct loopback_mr, ibmr); +} + +static inline struct loopback_cq *ib_to_loop_cq(struct ib_cq *ibcq) +{ + return container_of(ibcq, struct loopback_cq, ibcq); +} + +static inline struct loopback_ah *ib_to_loop_ah(struct ib_ah *ibah) +{ + return container_of(ibah, struct loopback_ah, ibah); +} + +static inline struct loopback_qp *ib_to_loop_qp(struct ib_qp *ibqp) +{ + return container_of(ibqp, struct loopback_qp, ibqp); +} + +struct loopback_cqe { + struct list_head list; + struct ib_wc wc; +}; + +struct loopback_rqe { + struct list_head list; + + /* copy of recv wr */ + struct ib_recv_wr wr; + struct ib_sge sges[0]; +}; + +static struct rdma_loopdev *loopdev; +static struct net_device *lo; + +static void init_loopdev_tables(struct rdma_loopdev *ld) +{ + init_table(&ld->ah_tbl, 0, LOOPBACK_MAX_AH, UINT_MAX, 0); + init_table(&ld->cq_tbl, 0, LOOPBACK_MAX_CQ, UINT_MAX, 0); + init_table(&ld->mr_tbl, 0, LOOPBACK_MAX_MR, 0xffffff00, 8); + init_table(&ld->qp_tbl, 1, LOOPBACK_MAX_QP, UINT_MAX, 0); +} + +static int loopback_query_device(struct ib_device *dev, + struct ib_device_attr *attr, + struct ib_udata *uhw) +{ + memset(attr, 0, sizeof(*attr)); + + attr->sys_image_guid = dev->node_guid; + attr->max_mr_size = ULONG_MAX; + attr->page_size_cap = 0; + attr->vendor_id = 0; + attr->vendor_part_id = 0; + attr->hw_ver = 0; + attr->max_qp = LOOPBACK_MAX_QP; + attr->max_qp_wr = 65536; + attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; + attr->max_send_sge = 16; + attr->max_recv_sge = 16; + attr->max_sge_rd = 16; + attr->max_cq = LOOPBACK_MAX_CQ; + attr->max_cqe = 65536; + attr->max_mr = 
LOOPBACK_MAX_MR; + attr->max_pd = LOOPBACK_MAX_PD; + attr->max_qp_rd_atom = 64; + attr->max_ee_rd_atom = 0; + attr->max_res_rd_atom = 64; + attr->max_qp_init_rd_atom = 64; + attr->max_ee_init_rd_atom = 64; + attr->atomic_cap = 0; + attr->masked_atomic_cap = 0; + attr->max_ee = 0; + attr->max_rdd = 0; + attr->max_mw = 0; + attr->max_raw_ipv6_qp = 0; + attr->max_raw_ethy_qp = 0; + attr->max_mcast_grp = 0; + attr->max_mcast_qp_attach = 0; + attr->max_total_mcast_qp_attach = 0; + attr->max_ah = LOOPBACK_MAX_AH; + attr->max_srq = 0; + attr->max_srq_wr = 0; + attr->max_srq_sge = 0; + attr->max_fast_reg_page_list_len = 4; + attr->max_pkeys = 1; + attr->local_ca_ack_delay = 16; + attr->sig_prot_cap = 0; + attr->sig_guard_cap = 0; + attr->timestamp_mask = 0; + attr->hca_core_clock = 0; /* TODO */ + return 0; +} + +static int loopback_query_port(struct ib_device *dev, u8 port_num, + struct ib_port_attr *attr) +{ + struct rdma_loopdev *ld = ib_to_loopdev(dev); + + *attr = ld->port_attr; + attr->max_mtu = IB_MTU_4096; + attr->active_mtu = ib_mtu_int_to_enum(lo->mtu); + attr->ip_gids = 1; + attr->max_msg_sz = 1 << 24; + attr->active_width = 2; + attr->active_speed = IB_SPEED_HDR; + attr->max_vl_num = 1; + attr->phys_state = 5; /* TODO */ + attr->port_cap_flags = IB_PORT_CM_SUP; + if (dev_get_flags(lo) & IFF_UP) + attr->state = IB_PORT_ACTIVE; + else + attr->state = IB_PORT_DOWN; + return 0; +} + +static struct net_device * +loopback_get_netdev(struct ib_device *device, u8 port_num) +{ + dev_hold(lo); + return lo; +} + +static int loopback_query_pkey(struct ib_device *device, + u8 port_num, u16 index, u16 *pkey) +{ + *pkey = 0xffff; + return 0; +} + +static enum rdma_link_layer +loopback_get_link_layer(struct ib_device *dev, u8 port_num) +{ + return IB_LINK_LAYER_ETHERNET; +} + +static int loopback_port_immutable(struct ib_device *dev, u8 port_num, + struct ib_port_immutable *immutable) +{ + struct rdma_loopdev *ld = ib_to_loopdev(dev); + + immutable->core_cap_flags = 
RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + immutable->pkey_tbl_len = ld->port_attr.pkey_tbl_len; + immutable->gid_tbl_len = ld->port_attr.gid_tbl_len; + immutable->max_mad_size = IB_MGMT_MAD_SIZE; + return 0; +} + +static int +loopback_alloc_ucontext(struct ib_ucontext *ibuctx, struct ib_udata *udata) +{ + return 0; +} + +static void loopback_dealloc_ucontext(struct ib_ucontext *ibuctx) +{ +} + +static int +loopback_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context, + struct ib_udata *udata) +{ + /* Yes, we can further enhance core to not need null routines. */ + return 0; +} + +static void loopback_dealloc_pd(struct ib_pd *ibpd) +{ +} + +static int loopback_poll_cq(struct ib_cq *ibcq, int num_entries, + struct ib_wc *wc) +{ + struct loopback_cq *cq = ib_to_loop_cq(ibcq); + struct loopback_cqe *cqe; + struct list_head *entry; + int wc_count = 0; + + while (num_entries) { + entry = pop_from_fifo(&cq->cqes); + if (!entry) + break; + cqe = container_of(entry, struct loopback_cqe, list); + memcpy(&wc[wc_count], &cqe->wc, sizeof(cqe->wc)); + num_entries--; + wc_count++; + kfree(cqe); + } + return wc_count; +} + +static void attempt_notify_cq(struct loopback_cq *cq) +{ + unsigned long flags; + u64 entries; + + spin_lock_irqsave(&cq->cqes.lock, flags); + entries = get_fifo_entries(&cq->cqes); + if (cq->armed && entries && cq->ibcq.comp_handler) + (*cq->ibcq.comp_handler)(&cq->ibcq, cq->ibcq.cq_context); + spin_unlock_irqrestore(&cq->cqes.lock, flags); +} + +static int loopback_req_notify_cq(struct ib_cq *ibcq, + enum ib_cq_notify_flags arm) +{ + struct loopback_cq *cq = ib_to_loop_cq(ibcq); + unsigned long flags; + u64 entries; + + spin_lock_irqsave(&cq->cqes.lock, flags); + cq->armed = arm; + entries = get_fifo_entries(&cq->cqes); + if (cq->armed && entries && cq->ibcq.comp_handler) + (*cq->ibcq.comp_handler)(&cq->ibcq, cq->ibcq.cq_context); + spin_unlock_irqrestore(&cq->cqes.lock, flags); + return 0; +} + +static struct ib_cq * +loopback_create_cq(struct ib_device *dev, 
+ const struct ib_cq_init_attr *attr, + struct ib_ucontext *context, + struct ib_udata *udata) +{ + struct rdma_loopdev *ld = ib_to_loopdev(dev); + struct loopback_cq *cq; + int ret; + + cq = kzalloc(sizeof(*cq), GFP_KERNEL); + if (!cq) + return ERR_PTR(-ENOMEM); + init_fifo(&cq->cqes); + ret = attach_table_id(&ld->cq_tbl, &cq->res); + if (ret) { + kfree(cq); + return ERR_PTR(ret); + } + return &cq->ibcq; +} + +static int loopback_destroy_cq(struct ib_cq *ibcq) +{ + struct loopback_cq *cq = ib_to_loop_cq(ibcq); + struct rdma_loopdev *ld = ib_to_loopdev(ibcq->device); + + detach_table_id(&ld->cq_tbl, &cq->res); + kfree(cq); + return 0; +} + +static struct ib_mr *loopback_get_dma_mr(struct ib_pd *ibpd, int access) +{ + struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device); + struct loopback_mr *mr; + int ret; + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + mr->type = LOOPBACK_MR_TYPE_DMA; + mr->ibmr.length = ULONG_MAX; + mr->access = access; + ret = attach_table_id(&ld->mr_tbl, &mr->res); + if (ret) { + kfree(mr); + return ERR_PTR(ret); + } + mr->ibmr.lkey = mr_id_to_mkey(mr->res.id); + mr->ibmr.rkey = mr->ibmr.lkey; + return &mr->ibmr; +} + +static size_t mr_pages_store_size(struct ib_umem *umem) +{ + return ib_umem_page_count(umem) * sizeof(struct page *); +} + +static void fill_pg_table(struct loopback_mr *mr, struct ib_umem *umem) +{ + int page_size = BIT(mr->u.umr.umem->page_shift); + struct scatterlist *sg; + int pg_iter = 0; + void *map_va; + int pg_idx; + int pages; + int i; + + for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) { + pages = sg_dma_len(sg) >> mr->u.umr.umem->page_shift; + map_va = page_address(sg_page(sg)); + + for (pg_idx = 0; pg_idx < pages; pg_idx++, pg_iter++) + mr->pg_tbl[pg_iter] = + (u64)map_va + (pg_idx * page_size); + } +} + +static void free_mr(struct loopback_mr *mr) +{ + kfree(mr->pg_tbl); + if (mr->type == LOOPBACK_MR_TYPE_USER) { + if (mr->u.umr.umem) + ib_umem_release(mr->u.umr.umem); + } + 
kfree(mr);
+}
+
+/*
+ * loopback_dereg_mr - deregister a memory region.
+ * @ibmr: MR to tear down
+ *
+ * Removes the MR from the device MR table and then frees it.
+ * Returns 0 unconditionally.
+ */
+static int loopback_dereg_mr(struct ib_mr *ibmr)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibmr->device);
+	struct loopback_mr *mr = ib_to_loop_mr(ibmr);
+
+	/* First we must drop the reference, so nothing new starts on this
+	 * mr, followed by wait for any ongoing operations.
+	 * after that free the umem etc. This is done through table callback.
+	 */
+	detach_table_id(&ld->mr_tbl, &mr->res);
+	free_mr(mr);
+	return 0;
+}
+
+/*
+ * loopback_reg_user_mr - register a userspace memory region.
+ * @ibpd:   protection domain the MR is created on
+ * @start:  userspace virtual address of the region
+ * @length: length of the region in bytes
+ * @iova:   address that work requests use to refer to the region
+ * @access: access flags requested for the region
+ * @udata:  user data, handed to ib_umem_get()
+ *
+ * Maps the user memory with ib_umem_get(), records the kernel address of
+ * each page in mr->pg_tbl via fill_pg_table() and adds the MR to the device
+ * MR table. lkey and rkey are both derived from the table id.
+ *
+ * Returns the new ib_mr on success or an ERR_PTR() on failure.
+ */
+static struct ib_mr *
+loopback_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova,
+		     int access, struct ib_udata *udata)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device);
+	struct loopback_mr *mr;
+	struct ib_umem *umem;
+	size_t alloc_size;
+	int ret;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+	mr->type = LOOPBACK_MR_TYPE_USER;
+
+	/* ib_umem_get() reports failure with ERR_PTR(), never with NULL */
+	umem = ib_umem_get(udata, start, length, access, 0);
+	if (IS_ERR(umem)) {
+		ret = PTR_ERR(umem);
+		goto err;
+	}
+	mr->u.umr.umem = umem;
+
+	alloc_size = mr_pages_store_size(umem);
+	mr->pg_tbl = kzalloc(alloc_size, GFP_KERNEL);
+	if (!mr->pg_tbl) {
+		/* TODO: Remove 2GB registration limit */
+		pr_err("%s alloc_size = %zu, page cnt = %d\n", __func__,
+		       alloc_size, ib_umem_page_count(umem));
+		ret = -ENOMEM;
+		goto err;
+	}
+	/*
+	 * mr->u.umr.pages stays NULL from kzalloc(); the datapath resolves
+	 * addresses through pg_tbl only.
+	 */
+	mr->u.umr.pages_alloc_size = alloc_size;
+	mr->fbo = ib_umem_offset(umem);
+	mr->page_shift = umem->page_shift;
+	mr->access = access;
+	mr->ibmr.iova = iova;
+	mr->ibmr.length = length;
+	fill_pg_table(mr, umem);
+
+	ret = attach_table_id(&ld->mr_tbl, &mr->res);
+	if (ret)
+		goto err;
+	mr->ibmr.lkey = mr_id_to_mkey(mr->res.id);
+	mr->ibmr.rkey = mr->ibmr.lkey;
+	pr_debug("%s mr=0x%x fbo=0x%x, len=%lld pg_sz=%d pg_cnt=%d\n",
+		 __func__, mr->ibmr.lkey, mr->fbo,
+		 mr->ibmr.length, mr->ibmr.page_size,
+		 ib_umem_page_count(umem));
+	return &mr->ibmr;
+
+err:
+	/* free_mr() copes with a missing pg_tbl and a missing umem */
+	free_mr(mr);
+	return ERR_PTR(ret);
+}
+
+static void *get_dma_mr_va(const struct
loopback_mr *mr, u64 va, + u32 cpy_len, u32 *ret_len) +{ + *ret_len = cpy_len; + return ((void *)(uintptr_t)va); +} + +static void *get_virt_mr_va(const struct loopback_mr *mr, u64 va, + u32 cpy_len, u32 *ret_len) +{ + u64 zero_based_offset; + u32 in_pg_offset; + u32 byte_offset; + u64 pg_addr; + void *vaddr; + int pg_idx; + + /* zero_based_offset accounts for fbo; due to which it can be + * offset by one page. + */ + zero_based_offset = (va - mr->ibmr.iova) + mr->fbo; + pg_idx = zero_based_offset / BIT(mr->page_shift); + + byte_offset = va - mr->ibmr.iova; + + if (mr->fbo) { + int bytes_in_first_page = 0; + + bytes_in_first_page = BIT(mr->page_shift) - mr->fbo; + in_pg_offset = byte_offset - + ((BIT(mr->page_shift) * (pg_idx - 1)) + bytes_in_first_page); + } else { + in_pg_offset = byte_offset - (BIT(mr->page_shift) * pg_idx); + } + pg_addr = mr->pg_tbl[pg_idx]; + pg_addr += in_pg_offset; + vaddr = ((void *)(uintptr_t)pg_addr); + *ret_len = min_t(u32, BIT(mr->page_shift) - in_pg_offset, cpy_len); + return vaddr; +} + +static void *get_mr_va(const struct loopback_mr *mr, u64 va, + u32 cpy_len, u32 *ret_len) +{ + switch (mr->type) { + case LOOPBACK_MR_TYPE_PHY: + case LOOPBACK_MR_TYPE_DMA: + return get_dma_mr_va(mr, va, cpy_len, ret_len); + case LOOPBACK_MR_TYPE_USER: + case LOOPBACK_MR_TYPE_FRMR: + return get_virt_mr_va(mr, va, cpy_len, ret_len); + } + return NULL; +} + +static void generate_rc_rq_cqe(struct loopback_qp *dqp, + struct loopback_rqe *rqe, + u32 recv_len, + enum ib_wc_status rqe_status, + u32 inv_key, u32 wc_flags) +{ + struct loopback_cq *recv_cq = ib_to_loop_cq(dqp->ibqp.recv_cq); + struct loopback_cqe *rq_cqe; + + /* Generate receive completion on best effort basis */ + rq_cqe = kzalloc(sizeof(*rq_cqe), dqp->cqe_alloc_flags); + if (!rq_cqe) + return; + + rq_cqe->wc.qp = &dqp->ibqp; + rq_cqe->wc.wr_cqe = rqe->wr.wr_cqe; + rq_cqe->wc.status = rqe_status; + rq_cqe->wc.byte_len = recv_len; + rq_cqe->wc.opcode = IB_WC_RECV; + rq_cqe->wc.qp = &dqp->ibqp; 
+ rq_cqe->wc.port_num = 1; + rq_cqe->wc.wc_flags = wc_flags; + rq_cqe->wc.ex.invalidate_rkey = inv_key; + push_to_fifo(&recv_cq->cqes, &rq_cqe->list); + attempt_notify_cq(recv_cq); +} + +static void generate_ud_rq_cqe(const struct loopback_qp *sqp, + struct loopback_qp *dqp, + struct loopback_rqe *rqe, + u32 recv_len, + u8 network_hdr_type, int wc_flags, + enum ib_wc_status rqe_status) +{ + struct loopback_cq *recv_cq = ib_to_loop_cq(dqp->ibqp.recv_cq); + struct loopback_cqe *rq_cqe; + + /* Generate receive completion on best effort basis */ + rq_cqe = kzalloc(sizeof(*rq_cqe), dqp->cqe_alloc_flags); + if (!rq_cqe) + return; + + rq_cqe->wc.qp = &dqp->ibqp; + rq_cqe->wc.wr_cqe = rqe->wr.wr_cqe; + rq_cqe->wc.status = rqe_status; + rq_cqe->wc.byte_len = recv_len; + rq_cqe->wc.opcode = IB_WC_RECV; + rq_cqe->wc.qp = &dqp->ibqp; + rq_cqe->wc.src_qp = sqp->ibqp.qp_num; + rq_cqe->wc.port_num = 1; + rq_cqe->wc.network_hdr_type = network_hdr_type; + rq_cqe->wc.wc_flags = wc_flags; + push_to_fifo(&recv_cq->cqes, &rq_cqe->list); + attempt_notify_cq(recv_cq); +} + +static bool is_qp_supported(enum ib_qp_type type) +{ + return (type == IB_QPT_GSI || type == IB_QPT_RC); +} + +static struct ib_qp * +loopback_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device); + struct loopback_qp *qp; + int ret; + + if (!is_qp_supported(attr->qp_type)) + return ERR_PTR(-EINVAL); + + qp = kzalloc(sizeof(*qp), GFP_KERNEL); + if (!qp) + return ERR_PTR(-ENOMEM); + + qp->init_attr = *attr; + /* Since we generate cqes under rcu read lock, + * cqe allocations are atomic. + */ + qp->cqe_alloc_flags = udata ? GFP_KERNEL : GFP_ATOMIC; + qp->rqe_alloc_flags = udata ? GFP_KERNEL : GFP_ATOMIC; + qp->user_qp = udata ? 
true : false; + init_fifo(&qp->rqes); + if (attr->qp_type == IB_QPT_GSI) + ret = attach_table_id_for_id(&ld->qp_tbl, &qp->res, 1); + else + ret = attach_table_id(&ld->qp_tbl, &qp->res); + if (ret) { + kfree(qp); + return ERR_PTR(ret); + } + qp->ibqp.qp_num = qp->res.id; + return &qp->ibqp; +} + +static void loopbak_flush_rq(struct loopback_qp *qp, bool gen_cqe) +{ + struct loopback_rqe *rqe; + struct list_head *entry; + + while (1) { + entry = pop_from_fifo(&qp->rqes); + if (!entry) + break; + rqe = container_of(entry, struct loopback_rqe, list); + if (gen_cqe) + generate_rc_rq_cqe(qp, rqe, 0, IB_WC_WR_FLUSH_ERR, + 0, 0); + kfree(rqe); + } +} + +static int loopback_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int mask, struct ib_udata *udata) +{ + struct loopback_qp *qp = ib_to_loop_qp(ibqp); + + if (mask & IB_QP_ACCESS_FLAGS) + qp->attr.qp_access_flags = attr->qp_access_flags; + if (mask & IB_QP_DEST_QPN) + qp->attr.dest_qp_num = attr->dest_qp_num; + if (mask & IB_QP_STATE) { + WRITE_ONCE(qp->state, attr->qp_state); + + put_table_entry(&qp->res); + /* Wait for all datapath operations to stop */ + wait_for_completion(&qp->res.completion); + + if (attr->qp_state == IB_QPS_ERR || + attr->qp_state == IB_QPS_RESET) { + loopbak_flush_rq(qp, true); + } + /* Reinit the refcount so that new data path ops can start + * after a new state. There is extremely rare corner case + * where RTR->RTS transition time, sender is sending the + * data, which this driver doesn't support currently. 
+		 */
+		refcount_set(&qp->res.refcount, 1);
+	}
+	return 0;
+}
+
+/*
+ * loopback_query_qp - report the attributes cached for a QP.
+ * @ibqp:      QP to query
+ * @attr:      filled with the attributes stored by modify_qp
+ * @mask:      requested attribute mask (unused, everything is returned)
+ * @init_attr: filled with the attributes the QP was created with
+ *
+ * Returns 0 unconditionally.
+ */
+static int loopback_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+			     int mask, struct ib_qp_init_attr *init_attr)
+{
+	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+
+	*attr = qp->attr;
+	/* The live state is tracked in qp->state, not in the cached attr */
+	attr->qp_state = READ_ONCE(qp->state);
+	attr->cur_qp_state = attr->qp_state;
+	*init_attr = qp->init_attr;
+	return 0;
+}
+
+/*
+ * loopback_destroy_qp - remove a QP from the QP table and free it.
+ *
+ * Receive WQEs still queued are dropped without generating completions.
+ * Returns 0 unconditionally.
+ */
+static int loopback_destroy_qp(struct ib_qp *ibqp)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibqp->device);
+	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+
+	detach_table_id(&ld->qp_tbl, &qp->res);
+	loopbak_flush_rq(qp, false);
+	kfree(qp);
+	return 0;
+}
+
+/*
+ * Map a send work request opcode to the opcode reported in its completion.
+ * Opcodes without a dedicated mapping are reported as IB_WC_SEND.
+ */
+static enum ib_wc_opcode sq_opcode_to_wc_opcode(enum ib_wr_opcode opcode)
+{
+	switch (opcode) {
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return IB_WC_RDMA_WRITE;
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+		return IB_WC_SEND;
+	case IB_WR_RDMA_READ:
+		return IB_WC_RDMA_READ;
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		return IB_WC_COMP_SWAP;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		return IB_WC_FETCH_ADD;
+	case IB_WR_LSO:
+		return IB_WC_LSO;
+	case IB_WR_LOCAL_INV:
+		return IB_WC_LOCAL_INV;
+	case IB_WR_REG_MR:
+	case IB_WR_REG_SIG_MR:
+		return IB_WC_REG_MR;
+	case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+		return IB_WC_MASKED_COMP_SWAP;
+	case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+		return IB_WC_MASKED_FETCH_ADD;
+	default:
+		/* TODO: no better default value */
+		return IB_WC_SEND;
+	}
+}
+
+/* Sum of the lengths of the first @num_sges entries of @sg_list. */
+static u64 get_sges_len(const struct ib_sge *sg_list, int num_sges)
+{
+	u64 size = 0;
+	int i;
+
+	for (i = 0; i < num_sges; i++)
+		size += sg_list[i].length;
+
+	return size;
+}
+
+/*
+ * Number of payload bytes described by a send work request. Opcodes that
+ * are not listed below are treated as carrying no data.
+ */
+static u64 get_send_wqe_len(const struct ib_send_wr *wr)
+{
+	u64 send_len = 0;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_INV:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_WRITE:
+		send_len = get_sges_len(wr->sg_list, wr->num_sge);
+		break;
+	default:
+		break;
+	}
+	return send_len;
+}
+
+static u64 get_rqe_len(struct loopback_rqe *rqe)
+{
+	return get_sges_len(&rqe->sges[0],
rqe->wr.num_sge);
+}
+
+/* Release an MR reference; NULL and ERR_PTR() values are ignored. */
+static void put_mr(struct loopback_mr *mr)
+{
+	if (!IS_ERR(mr) && mr)
+		put_table_entry(&mr->res);
+}
+
+/*
+ * Look up the MR stored in the device MR table under @key.
+ *
+ * Returns the MR, or ERR_PTR(-EINVAL) when the table has no entry for @key.
+ * Callers release the result with put_mr(), so the lookup presumably takes
+ * a reference - confirm against get_table_entry_by_id().
+ */
+static struct loopback_mr *get_mr_for_key(struct rdma_loopdev *ld, u32 key)
+{
+	struct loopback_resource *mr_entry;
+	struct loopback_mr *mr;
+
+	mr_entry = get_table_entry_by_id(&ld->mr_tbl, key);
+	if (!mr_entry)
+		return ERR_PTR(-EINVAL);
+	mr = container_of(mr_entry, struct loopback_mr, res);
+	return mr;
+}
+
+/*
+ * validate_mr_access - check that a QP may touch [@addr, @addr + @len) of @mr.
+ * @mr:   memory region being accessed
+ * @qp:   QP on whose behalf the access happens
+ * @opc:  opcode of the work request that triggers the access
+ * @addr: first byte accessed
+ * @len:  number of bytes accessed
+ *
+ * Returns 0 when the access is allowed, -EACCES otherwise.
+ */
+static int validate_mr_access(const struct loopback_mr *mr,
+			      const struct loopback_qp *qp,
+			      enum ib_wr_opcode opc, u64 addr, u32 len)
+{
+	u64 offset;
+
+	/* user QPs may only use MRs registered from userspace */
+	if (len > mr->ibmr.length ||
+	    (qp->user_qp && mr->type != LOOPBACK_MR_TYPE_USER))
+		return -EACCES;
+
+	if ((opc == IB_WR_RDMA_WRITE ||
+	     opc == IB_WR_RDMA_WRITE_WITH_IMM ||
+	     opc == IB_WR_RDMA_READ ||
+	     opc == IB_WR_ATOMIC_CMP_AND_SWP ||
+	     opc == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+	    ((qp->attr.qp_access_flags & mr->access) == 0))
+		return -EACCES;
+
+	/* MR length and iova checks are applicable to FRMR and user type */
+	if (mr->type == LOOPBACK_MR_TYPE_PHY ||
+	    mr->type == LOOPBACK_MR_TYPE_DMA)
+		return 0;
+
+	/*
+	 * address within range check; compare offsets instead of end
+	 * addresses so that neither iova + length nor addr + len can wrap.
+	 */
+	if (addr < mr->ibmr.iova)
+		return -EACCES;
+	offset = addr - mr->ibmr.iova;
+	if (offset > mr->ibmr.length || len > mr->ibmr.length - offset)
+		return -EACCES;
+	return 0;
+}
+
+/*
+ * Resolve the MR named by @sge->lkey and validate the access described by
+ * @sge on behalf of @qp.
+ *
+ * Returns the MR (to be released with put_mr()) or an ERR_PTR().
+ */
+static struct loopback_mr *
+get_mr_for_wr_sge(struct rdma_loopdev *ld, const struct loopback_qp *qp,
+		  enum ib_wr_opcode opc, const struct ib_sge *sge)
+{
+	struct loopback_mr *mr;
+	int ret;
+
+	mr = get_mr_for_key(ld, sge->lkey);
+	if (IS_ERR(mr))
+		return mr;
+
+	ret = validate_mr_access(mr, qp, opc, sge->addr, sge->length);
+	if (ret) {
+		put_mr(mr);
+		mr = ERR_PTR(ret);
+	}
+	return mr;
+}
+
+static struct loopback_mr *
+get_mr_for_rkey(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		enum ib_wr_opcode opc, u64 addr, u32 len, u32 mkey)
+{
+	struct loopback_mr *mr;
+	int ret;
+
+	mr = get_mr_for_key(ld, mkey);
+	if (IS_ERR(mr))
+		return mr;
+
+	ret = validate_mr_access(mr, qp, opc, addr,
len);
+	if (ret) {
+		put_mr(mr);
+		mr = ERR_PTR(ret);
+	}
+	return mr;
+}
+
+/*
+ * copy_data_wqe_to_rqe - copy the payload of a send WR into a receive WQE.
+ * @ld:       loopback device owning the MR table
+ * @sqp:      sending QP; source SGEs are validated against it
+ * @dqp:      receiving QP; destination SGEs are validated against it
+ * @wr:       send work request supplying the source SGEs
+ * @rqe:      receive WQE supplying the destination SGEs
+ * @send_len: number of bytes to copy
+ * @dst_fbo:  bytes to skip at the start of the first destination SGE
+ *
+ * Walks both scatter/gather lists in parallel and copies in chunks bounded
+ * by the remaining bytes of the current source SGE, the current destination
+ * SGE and whatever contiguous span get_mr_va() reports for each side.
+ *
+ * Returns 0 on success or a negative errno when an SGE cannot be resolved
+ * or validated.
+ */
+static int
+copy_data_wqe_to_rqe(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+		     struct loopback_qp *dqp,
+		     const struct ib_send_wr *wr, struct loopback_rqe *rqe,
+		     u32 send_len, u32 dst_fbo)
+{
+	struct loopback_mr *src_mr = NULL;
+	struct loopback_mr *dst_mr = NULL;
+	u32 src_sge_len = 0;
+	u32 dst_sge_len = 0;
+	u64 src_sge_va = 0;
+	u64 dst_sge_va = 0;
+	u32 ret_len = 0;
+	u32 cpy_len = 0;
+	void *src_addr;
+	void *dst_addr;
+	int s_idx = 0;
+	int d_idx = 0;
+	int ret = 0;
+
+	while (send_len) {
+		if (!src_mr) {
+			/* never walk past the SGEs the sender posted */
+			if (s_idx >= wr->num_sge) {
+				ret = -EINVAL;
+				goto err;
+			}
+			src_mr = get_mr_for_wr_sge(ld, sqp, wr->opcode,
+						   &wr->sg_list[s_idx]);
+			if (IS_ERR(src_mr)) {
+				ret = PTR_ERR(src_mr);
+				goto err;
+			}
+			src_sge_len = wr->sg_list[s_idx].length;
+			src_sge_va = wr->sg_list[s_idx].addr;
+		}
+		if (!dst_mr) {
+			/* never walk past the SGEs the receiver posted */
+			if (d_idx >= rqe->wr.num_sge) {
+				ret = -EINVAL;
+				goto err;
+			}
+			/* receive buffers belong to the destination QP */
+			dst_mr = get_mr_for_wr_sge(ld, dqp, wr->opcode,
+						   &rqe->sges[d_idx]);
+			if (IS_ERR(dst_mr)) {
+				ret = PTR_ERR(dst_mr);
+				goto err;
+			}
+			if (dst_fbo > rqe->sges[d_idx].length) {
+				ret = -EINVAL;
+				goto err;
+			}
+			/*
+			 * Skipping dst_fbo bytes also shrinks what is left of
+			 * this SGE; otherwise the copy would run dst_fbo
+			 * bytes past its end.
+			 */
+			dst_sge_len = rqe->sges[d_idx].length - dst_fbo;
+			dst_sge_va = rqe->sges[d_idx].addr + dst_fbo;
+		}
+
+		/* copy data of minimum length between and src and dst sge */
+		cpy_len = min_t(u32, src_sge_len, dst_sge_len);
+
+		src_addr = get_mr_va(src_mr, src_sge_va, cpy_len, &ret_len);
+		cpy_len = min_t(u32, cpy_len, ret_len);
+
+		dst_addr = get_mr_va(dst_mr, dst_sge_va, cpy_len, &ret_len);
+		cpy_len = min_t(u32, cpy_len, ret_len);
+
+		memcpy(dst_addr, src_addr, cpy_len);
+
+		src_sge_len -= cpy_len;
+		dst_sge_len -= cpy_len;
+		src_sge_va += cpy_len;
+		dst_sge_va += cpy_len;
+		send_len -= cpy_len;
+
+		if (!src_sge_len) {
+			s_idx++;
+			put_mr(src_mr);
+			src_mr = NULL;
+		}
+		if (!dst_sge_len) {
+			d_idx++;
+			put_mr(dst_mr);
+			dst_mr = NULL;
+		}
+		/* the offset applies to the first destination SGE only */
+		dst_fbo = 0;
+	}
+err:
+	/* put_mr() ignores NULL and ERR_PTR(), so both exits share this */
+	put_mr(dst_mr);
+	put_mr(src_mr);
+	return ret;
+}
+
+/*
+ * Copy between the local SGEs of an RDMA read/write WR and the remote
+ * region named by its rkey.
+ */
+static int copy_data_wqe_rkey(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+			      struct loopback_qp *dqp,
+			      const struct ib_send_wr *wr)
+{
+	const struct ib_rdma_wr *rdmawr = rdma_wr(wr);
+	u64
rdma_len = get_send_wqe_len(wr); + struct loopback_mr *wr_mr = NULL; + struct loopback_mr *rkey_mr; + u32 wr_sge_len = 0; + u64 wr_sge_va = 0; + void *wr_sge_addr; + u64 rkey_va = 0; + u32 ret_len = 0; + u32 cpy_len = 0; + void *rkey_addr; + int sge_idx = 0; + int ret = 0; + + rkey_va = rdmawr->remote_addr; + rkey_mr = get_mr_for_rkey(ld, dqp, wr->opcode, rkey_va, + rdma_len, rdmawr->rkey); + if (IS_ERR(rkey_mr)) + return PTR_ERR(rkey_mr); + + while (rdma_len) { + if (!wr_mr) { + wr_mr = get_mr_for_wr_sge(ld, sqp, wr->opcode, + &wr->sg_list[sge_idx]); + if (IS_ERR(wr_mr)) { + ret = PTR_ERR(wr_mr); + goto err; + } + wr_sge_len = wr->sg_list[sge_idx].length; + wr_sge_va = wr->sg_list[sge_idx].addr; + } + + cpy_len = wr_sge_len; + + wr_sge_addr = get_mr_va(wr_mr, wr_sge_va, cpy_len, &ret_len); + cpy_len = min_t(u32, cpy_len, ret_len); + + rkey_addr = get_mr_va(rkey_mr, rkey_va, cpy_len, &ret_len); + cpy_len = min_t(u32, cpy_len, ret_len); + + if (wr->opcode == IB_WR_RDMA_READ) { + /* rdma read => read from remote rkey to local sges */ + memcpy(wr_sge_addr, rkey_addr, cpy_len); + } else { + /* rdma write => local sges to remote rkey */ + memcpy(rkey_addr, wr_sge_addr, cpy_len); + } + + wr_sge_len -= cpy_len; + wr_sge_va += cpy_len; + rkey_va += cpy_len; + rdma_len -= cpy_len; + + if (!wr_sge_len) { + sge_idx++; + put_mr(wr_mr); + wr_mr = NULL; + } + } +err: + put_mr(wr_mr); + put_mr(rkey_mr); + return ret; +} + +static int +write_ud_grh_hdr(struct rdma_loopdev *ld, struct loopback_qp *dqp, + const struct ib_send_wr *wr, struct loopback_rqe *rqe) +{ + union rdma_network_hdr hdr = {}; + struct loopback_mr *dst_mr; + u32 dst_sge_len; + void *dst_addr; + u64 dst_sge_va; + u32 ret_len = 0; + int ret = 0; + + /* Even though spec allows to split first 40 bytes header in 40 sges, + * there isn't good usecase, so this expect minimum 40 bytes sge. 
+ */ + dst_sge_va = rqe->sges[0].addr; + dst_sge_len = rqe->sges[0].length; + if (dst_sge_len < sizeof(hdr)) + return -EINVAL; + + dst_mr = get_mr_for_wr_sge(ld, dqp, wr->opcode, &rqe->sges[0]); + if (IS_ERR(dst_mr)) + return PTR_ERR(dst_mr); + dst_addr = get_mr_va(dst_mr, dst_sge_va, sizeof(hdr), &ret_len); + if (ret_len != sizeof(hdr)) { + ret = -EINVAL; + goto done; + } + /* TODO: consider ipv6 */ + hdr.roce4grh.saddr = htonl(0x7f000001); + hdr.roce4grh.daddr = htonl(0x7f000001); + hdr.roce4grh.ttl = 1; + memcpy(dst_addr, &hdr, sizeof(hdr)); +done: + put_mr(dst_mr); + return ret; +} + +static void +post_one_ud_send_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp, + struct loopback_qp *dqp, + const struct ib_send_wr *wr, + struct loopback_cqe *wq_cqe) +{ + u32 send_len = get_send_wqe_len(wr); + struct loopback_rqe *rqe = NULL; + enum ib_wc_status recv_status; + struct list_head *rqe_entry; + u32 recv_len; + int ret; + + rqe_entry = pop_from_fifo(&dqp->rqes); + if (!rqe_entry) { + wq_cqe->wc.status = IB_WC_GENERAL_ERR; + return; + } + rqe = container_of(rqe_entry, struct loopback_rqe, list); + recv_len = get_rqe_len(rqe); + + if (send_len + sizeof(union rdma_network_hdr) > recv_len) { + recv_len = 0; + recv_status = IB_WC_GENERAL_ERR; + wq_cqe->wc.status = IB_WC_GENERAL_ERR; + } else { + /* copy minimum data + grh of what is sent and rqe size */ + recv_len = min_t(u32, send_len + + sizeof(union rdma_network_hdr), recv_len); + + ret = write_ud_grh_hdr(ld, dqp, wr, rqe); + if (ret) { + recv_status = IB_WC_GENERAL_ERR; + wq_cqe->wc.status = IB_WC_GENERAL_ERR; + goto done; + } + ret = copy_data_wqe_to_rqe(ld, qp, dqp, wr, rqe, send_len, + sizeof(union rdma_network_hdr)); + if (ret) { + recv_status = IB_WC_GENERAL_ERR; + wq_cqe->wc.status = IB_WC_GENERAL_ERR; + } else { + recv_status = IB_WC_SUCCESS; + wq_cqe->wc.status = IB_WC_SUCCESS; + } + } +done: + generate_ud_rq_cqe(qp, dqp, rqe, recv_len, RDMA_NETWORK_IPV4, + IB_WC_WITH_NETWORK_HDR_TYPE | + IB_WC_GRH, 
recv_status); + kfree(rqe); +} + +static struct loopback_qp * +get_qp_by_qpn(struct rdma_loopdev *ld, u32 qpn) +{ + struct loopback_resource *entry; + struct loopback_qp *qp; + + entry = get_table_entry_by_id(&ld->qp_tbl, qpn); + if (!entry) + return ERR_PTR(-EINVAL); + qp = container_of(entry, struct loopback_qp, res); + return qp; +} + +static struct loopback_qp *get_qp(struct loopback_qp *qp) +{ + return refcount_inc_not_zero(&qp->res.refcount) ? qp : ERR_PTR(-EINVAL); +} + +static void put_qp(struct loopback_qp *qp) +{ + if (!IS_ERR(qp) && qp) + put_table_entry(&qp->res); +} + +static void +post_one_ud_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp, + const struct ib_send_wr *wr, + struct loopback_cqe *wq_cqe) +{ + const struct ib_ud_wr *ud_wqe = ud_wr(wr); + struct loopback_qp *dqp; + + dqp = get_qp_by_qpn(ld, ud_wqe->remote_qpn); + if (IS_ERR(dqp)) + goto done; + + switch (wr->opcode) { + case IB_WR_SEND: + post_one_ud_send_wqe(ld, qp, dqp, wr, wq_cqe); + break; + default: + wq_cqe->wc.status = IB_WC_GENERAL_ERR; + break; + } +done: + wq_cqe->wc.src_qp = qp->ibqp.qp_num; + put_qp(dqp); +} + +static int invalidate_rkey(struct rdma_loopdev *ld, u32 inv_key) +{ + struct loopback_mr *mr; + + mr = get_mr_for_key(ld, inv_key); + if (IS_ERR(mr)) + return PTR_ERR(mr); + /* Fail invalidation if there are active users for now, as this is + * extremely rare scenario and not well known use case. 
+	 */
+	if (refcount_read(&mr->res.refcount) > 2) {
+		/* give back the lookup reference before bailing out */
+		put_mr(mr);
+		return -EINVAL;
+	}
+	xa_clear_mark(&ld->mr_tbl.ids, mr->res.id,
+		      LOOPBACK_RESOURCE_STATE_VALID);
+	put_mr(mr);
+	return 0;
+}
+
+/*
+ * Execute a local-invalidate WR and record the outcome in @wq_cqe:
+ * IB_WC_SUCCESS when the key was invalidated, IB_WC_GENERAL_ERR otherwise.
+ */
+static void
+process_one_rc_linv(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		    const struct ib_send_wr *wr,
+		    struct loopback_cqe *wq_cqe)
+{
+	u32 inv_key = wr->ex.invalidate_rkey;
+	int ret;
+
+	ret = invalidate_rkey(ld, inv_key);
+	if (ret)
+		wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+	else
+		wq_cqe->wc.status = IB_WC_SUCCESS;
+}
+
+/*
+ * process_one_rc_send_wqe - deliver an RC SEND / SEND_WITH_INV.
+ * @ld:     loopback device
+ * @qp:     sending QP
+ * @dqp:    destination QP whose receive queue is consumed
+ * @wr:     send work request
+ * @wq_cqe: send completion to fill in with the resulting status
+ *
+ * Pops one receive WQE from @dqp, copies the payload into it, performs the
+ * remote invalidate for SEND_WITH_INV and generates the receive completion.
+ * With no receive WQE posted only the send status is set
+ * (IB_WC_RNR_RETRY_EXC_ERR).
+ */
+static void
+process_one_rc_send_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp,
+			struct loopback_qp *dqp,
+			const struct ib_send_wr *wr,
+			struct loopback_cqe *wq_cqe)
+{
+	enum ib_wc_status recv_status;
+	struct loopback_rqe *rqe;
+	struct list_head *entry;
+	u32 rqe_wc_flags = 0;
+	u32 inv_key = 0;
+	u32 send_len;
+	u32 recv_len;
+	int ret;
+
+	entry = pop_from_fifo(&dqp->rqes);
+	if (!entry) {
+		/*
+		 * No rcu_read_unlock() here: nothing in this call chain
+		 * takes the RCU read lock, so unlocking would be unbalanced.
+		 */
+		wq_cqe->wc.status = IB_WC_RNR_RETRY_EXC_ERR;
+		return;
+	}
+	rqe = container_of(entry, struct loopback_rqe, list);
+	send_len = get_send_wqe_len(wr);
+	recv_len = get_rqe_len(rqe);
+	if (send_len > recv_len) {
+		/*
+		 * Record the failure in ret as well, so that the
+		 * SEND_WITH_INV step below cannot run and overwrite the
+		 * error statuses with success.
+		 */
+		ret = -EINVAL;
+		recv_len = 0;
+		recv_status = IB_WC_GENERAL_ERR;
+		wq_cqe->wc.status = IB_WC_REM_INV_REQ_ERR;
+	} else {
+		/* copy minimum data of what is sent and rqe size */
+		recv_len = min_t(u32, send_len, recv_len);
+		ret = copy_data_wqe_to_rqe(ld, qp, dqp, wr,
+					   rqe, send_len, 0);
+		if (ret) {
+			recv_status = IB_WC_LOC_LEN_ERR;
+			wq_cqe->wc.status = IB_WC_REM_INV_REQ_ERR;
+		} else {
+			recv_status = IB_WC_SUCCESS;
+			wq_cqe->wc.status = IB_WC_SUCCESS;
+		}
+	}
+	/* the invalidate is only attempted after a successful copy */
+	if (!ret && wr->opcode == IB_WR_SEND_WITH_INV) {
+		ret = invalidate_rkey(ld, wr->ex.invalidate_rkey);
+		if (ret) {
+			recv_status = IB_WC_LOC_LEN_ERR;
+			wq_cqe->wc.status = IB_WC_REM_INV_REQ_ERR;
+		} else {
+			inv_key = wr->ex.invalidate_rkey;
+			rqe_wc_flags = IB_WC_WITH_INVALIDATE;
+			recv_status = IB_WC_SUCCESS;
+			wq_cqe->wc.status = IB_WC_SUCCESS;
+		}
+	}
+
+	generate_rc_rq_cqe(dqp,
rqe, recv_len, recv_status, + inv_key, rqe_wc_flags); + kfree(rqe); +} + +static void +process_one_rc_rw_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp, + struct loopback_qp *dqp, + const struct ib_send_wr *wr, + struct loopback_cqe *wq_cqe) +{ + int ret; + + ret = copy_data_wqe_rkey(ld, qp, dqp, wr); + if (ret) + wq_cqe->wc.status = IB_WC_REM_ACCESS_ERR; + else + wq_cqe->wc.status = IB_WC_SUCCESS; +} + +static void +process_one_rc_wqe(struct rdma_loopdev *ld, struct loopback_qp *sqp, + struct loopback_qp *dqp, + const struct ib_send_wr *wr, + struct loopback_cqe *wq_cqe) +{ + switch (wr->opcode) { + case IB_WR_SEND: + case IB_WR_SEND_WITH_INV: + process_one_rc_send_wqe(ld, sqp, dqp, wr, wq_cqe); + break; + case IB_WR_RDMA_READ: + case IB_WR_RDMA_WRITE: + process_one_rc_rw_wqe(ld, sqp, dqp, wr, wq_cqe); + break; + case IB_WR_LOCAL_INV: + process_one_rc_linv(ld, sqp, wr, wq_cqe); + break; + default: + wq_cqe->wc.status = IB_WC_GENERAL_ERR; + break; + } +} + +static void +post_one_rc_wqe(struct rdma_loopdev *ld, struct loopback_qp *sqp, + const struct ib_send_wr *wr, struct loopback_cqe *wq_cqe) +{ + struct loopback_qp *dqp; + + dqp = get_qp_by_qpn(ld, sqp->attr.dest_qp_num); + if (IS_ERR(dqp)) { + wq_cqe->wc.status = IB_WC_RETRY_EXC_ERR; + goto done; + } + process_one_rc_wqe(ld, sqp, dqp, wr, wq_cqe); + +done: + wq_cqe->wc.src_qp = sqp->ibqp.qp_num; + put_qp(dqp); +} + +static int post_one_send(struct loopback_qp *qp, const struct ib_send_wr *wr) +{ + struct rdma_loopdev *ld = ib_to_loopdev(qp->ibqp.device); + struct loopback_cq *send_cq = ib_to_loop_cq(qp->ibqp.send_cq); + struct loopback_cqe *cqe; + struct loopback_qp *sqp; + int ret = 0; + + sqp = get_qp(qp); + if (IS_ERR(sqp)) + return -EINVAL; + + cqe = kzalloc(sizeof(*cqe), qp->cqe_alloc_flags); + if (!cqe) { + ret = -ENOMEM; + goto alloc_err; + } + + if (qp->state != IB_QPS_RTS) { + cqe->wc.status = IB_WC_WR_FLUSH_ERR; + goto done; + } + + switch (qp->ibqp.qp_type) { + case IB_QPT_GSI: + 
post_one_ud_wqe(ld, qp, wr, cqe); + break; + case IB_QPT_RC: + post_one_rc_wqe(ld, qp, wr, cqe); + break; + default: + break; + } + +done: + cqe->wc.opcode = sq_opcode_to_wc_opcode(wr->opcode); + cqe->wc.wr_cqe = wr->wr_cqe; + cqe->wc.qp = &sqp->ibqp; + cqe->wc.port_num = 1; + if (wr->send_flags & IB_SEND_SIGNALED || + qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR || + cqe->wc.status != IB_WC_SUCCESS) { + push_to_fifo(&send_cq->cqes, &cqe->list); + attempt_notify_cq(send_cq); + } else { + kfree(cqe); + } +alloc_err: + put_qp(sqp); + return ret; +} + +static int loopback_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, + const struct ib_send_wr **bad_wr) +{ + struct loopback_qp *qp = ib_to_loop_qp(ibqp); + int err = 0; + + while (wr) { + err = post_one_send(qp, wr); + if (unlikely(err)) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + return err; +} + +static int post_one_recv(struct loopback_qp *qp, const struct ib_recv_wr *wr) +{ + struct loopback_rqe *rqe; + struct loopback_qp *sqp; + enum ib_qp_state state; + int ret = 0; + + sqp = get_qp(qp); + if (IS_ERR(sqp)) + return -EINVAL; + + rqe = kzalloc(struct_size(rqe, sges, wr->num_sge), qp->rqe_alloc_flags); + if (!rqe) { + ret = -ENOMEM; + goto alloc_err; + } + + rqe->wr = *wr; + memcpy(&rqe->sges[0], wr->sg_list, wr->num_sge * sizeof(rqe->sges[0])); + + state = qp->state; + if (state == IB_QPS_INIT || state == IB_QPS_RTR || state == IB_QPS_RTS) + push_to_fifo(&qp->rqes, &rqe->list); + else + ret = -EINVAL; + + if (ret) { + generate_rc_rq_cqe(qp, rqe, 0, IB_WC_WR_FLUSH_ERR, 0, 0); + kfree(rqe); + ret = 0; + } +alloc_err: + put_qp(sqp); + return ret; +} + +static int loopback_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, + const struct ib_recv_wr **bad_wr) +{ + struct loopback_qp *qp = ib_to_loop_qp(ibqp); + int err = 0; + + while (wr) { + err = post_one_recv(qp, wr); + if (unlikely(err)) { + *bad_wr = wr; + break; + } + wr = wr->next; + } + return err; +} + +static struct ib_ah * 
+loopback_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr, + u32 flags, struct ib_udata *udata) +{ + struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device); + struct loopback_ah *ah; + int ret; + + ah = kzalloc(sizeof(*ah), GFP_KERNEL); + if (!ah) + return ERR_PTR(-ENOMEM); + ah->attr = *attr; + ret = attach_table_id(&ld->ah_tbl, &ah->res); + if (ret) { + kfree(ah); + return ERR_PTR(ret); + } + return &ah->ibah; +} + +static int loopback_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) +{ + struct loopback_ah *ah = ib_to_loop_ah(ibah); + + memset(attr, 0, sizeof(*attr)); + *attr = ah->attr; + attr->type = ibah->type; + return 0; +} + +static int loopback_destroy_ah(struct ib_ah *ibah, u32 flags) +{ + struct rdma_loopdev *ld = ib_to_loopdev(ibah->device); + struct loopback_ah *ah = ib_to_loop_ah(ibah); + + detach_table_id(&ld->ah_tbl, &ah->res); + kfree(ah); + return 0; +} + +static const struct ib_device_ops rdma_loopdev_ops = { + .alloc_pd = loopback_alloc_pd, + .alloc_ucontext = loopback_alloc_ucontext, + .create_ah = loopback_create_ah, + .create_cq = loopback_create_cq, + .create_qp = loopback_create_qp, + .dealloc_pd = loopback_dealloc_pd, + .dealloc_ucontext = loopback_dealloc_ucontext, + .dereg_mr = loopback_dereg_mr, + .destroy_ah = loopback_destroy_ah, + .destroy_cq = loopback_destroy_cq, + .destroy_qp = loopback_destroy_qp, + .get_dma_mr = loopback_get_dma_mr, + .get_link_layer = loopback_get_link_layer, + .get_netdev = loopback_get_netdev, + .get_port_immutable = loopback_port_immutable, + .modify_qp = loopback_modify_qp, + .poll_cq = loopback_poll_cq, + .post_recv = loopback_post_recv, + .post_send = loopback_post_send, + .query_ah = loopback_query_ah, + .query_device = loopback_query_device, + .query_pkey = loopback_query_pkey, + .query_port = loopback_query_port, + .query_qp = loopback_query_qp, + .reg_user_mr = loopback_reg_user_mr, + .req_notify_cq = loopback_req_notify_cq, + INIT_RDMA_OBJ_SIZE(ib_pd, loopback_pd, ibpd), + 
INIT_RDMA_OBJ_SIZE(ib_ucontext, loopback_uctx, ibuctx), +}; + +static void init_rdma_loopdev(struct rdma_loopdev *ld) +{ + struct ib_device *dev = &ld->dev; + + strlcpy(dev->node_desc, "lo", sizeof(dev->node_desc)); + + dev->owner = THIS_MODULE; + dev->node_type = RDMA_NODE_IB_CA; + dev->phys_port_cnt = 1; + dev->num_comp_vectors = num_possible_cpus(); + dev->dev.parent = &lo->dev; + dev->local_dma_lkey = 0; + dev->dev.dma_ops = &dma_virt_ops; + dev->node_guid = 0x7f0001; + dma_coerce_mask_and_coherent(&dev->dev, + dma_get_required_mask(&dev->dev)); + + ld->port_attr.pkey_tbl_len = 1; + /* deault, 127.0.0.1 and ::1 */ + ld->port_attr.gid_tbl_len = 3; + + dev->uverbs_abi_ver = 2; + dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) + | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) + | BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) + | BIT_ULL(IB_USER_VERBS_CMD_POST_RECV) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) + | BIT_ULL(IB_USER_VERBS_CMD_REG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) + | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) + | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH) + | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH) + ; + + ib_set_device_ops(dev, &rdma_loopdev_ops); + + dev->driver_id = RDMA_DRIVER_LOOPBACK; + + init_loopdev_tables(ld); +} + +static void cleanup_zero_lkey_mr(struct rdma_loopdev *ld) +{ + detach_table_id(&ld->mr_tbl, &ld->zero_mr.res); +} + +static int init_zero_lkey_mr(struct rdma_loopdev *ld) +{ + ld->zero_mr.type = LOOPBACK_MR_TYPE_PHY; + ld->zero_mr.ibmr.length = 
ULONG_MAX; + ld->zero_mr.access = IB_ACCESS_LOCAL_WRITE; + return attach_table_id_for_id(&ld->mr_tbl, &ld->zero_mr.res, 0); +} + +static int loopback_init(void) +{ + int ret; + + lo = dev_get_by_name(&init_net, "lo"); + if (!lo) + return -ENODEV; + + loopdev = ib_alloc_device(rdma_loopdev, dev); + if (!loopdev) { + ret = -ENOMEM; + goto alloc_err; + } + init_rdma_loopdev(loopdev); + + ret = init_zero_lkey_mr(loopdev); + if (ret) + goto mr_err; + + ret = ib_register_device(&loopdev->dev, "lo"); + if (ret) + goto reg_err; + return 0; + +reg_err: + cleanup_zero_lkey_mr(loopdev); +mr_err: + ib_dealloc_device(&loopdev->dev); +alloc_err: + dev_put(lo); + return ret; +} + +static void loopback_cleanup(void) +{ + ib_unregister_device(&loopdev->dev); + cleanup_zero_lkey_mr(loopdev); + ib_dealloc_device(&loopdev->dev); + dev_put(lo); +} + +module_init(loopback_init); +module_exit(loopback_cleanup); +MODULE_LICENSE("GPL"); diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 06c34d9..f9756a2 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -102,6 +102,7 @@ enum rdma_driver_id { RDMA_DRIVER_RXE, RDMA_DRIVER_HFI1, RDMA_DRIVER_QIB, + RDMA_DRIVER_LOOPBACK, }; #endif -- 1.8.3.1