[EXPERIMENTAL 2/3] RDMA/loopback: Loopback rdma (RoCE) driver

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is the simplest rdma (RoCE) loopback driver.
It implements an rdma device on top of the 'lo' netdevice.

Since data doesn't leave the system, it doesn't emulate any transport,
network or link layers.

It implements a fully functional verbs layer, backed by a data copy
engine.

Signed-off-by: Parav Pandit <parav@xxxxxxxxxxxx>
---
 drivers/infiniband/Kconfig                |    1 +
 drivers/infiniband/sw/Makefile            |    1 +
 drivers/infiniband/sw/loopback/Kconfig    |   14 +
 drivers/infiniband/sw/loopback/Makefile   |    4 +
 drivers/infiniband/sw/loopback/loopback.c | 1603 +++++++++++++++++++++++++++++
 include/uapi/rdma/rdma_user_ioctl_cmds.h  |    1 +
 6 files changed, 1624 insertions(+)
 create mode 100644 drivers/infiniband/sw/loopback/Kconfig
 create mode 100644 drivers/infiniband/sw/loopback/Makefile
 create mode 100644 drivers/infiniband/sw/loopback/loopback.c

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index a1fb840d..1715ead 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -107,6 +107,7 @@ source "drivers/infiniband/hw/hfi1/Kconfig"
 source "drivers/infiniband/hw/qedr/Kconfig"
 source "drivers/infiniband/sw/rdmavt/Kconfig"
 source "drivers/infiniband/sw/rxe/Kconfig"
+source "drivers/infiniband/sw/loopback/Kconfig"
 endif
 
 source "drivers/infiniband/ulp/ipoib/Kconfig"
diff --git a/drivers/infiniband/sw/Makefile b/drivers/infiniband/sw/Makefile
index 8b095b2..f2e95f9 100644
--- a/drivers/infiniband/sw/Makefile
+++ b/drivers/infiniband/sw/Makefile
@@ -1,2 +1,3 @@
 obj-$(CONFIG_INFINIBAND_RDMAVT)		+= rdmavt/
 obj-$(CONFIG_RDMA_RXE)			+= rxe/
+obj-$(CONFIG_RDMA_LOOPBACK)		+= loopback/
diff --git a/drivers/infiniband/sw/loopback/Kconfig b/drivers/infiniband/sw/loopback/Kconfig
new file mode 100644
index 0000000..4aadcaa
--- /dev/null
+++ b/drivers/infiniband/sw/loopback/Kconfig
@@ -0,0 +1,14 @@
+config RDMA_LOOPBACK
+	tristate "loopback (RoCE) driver"
+	depends on INET && INFINIBAND
+	depends on ARCH_DMA_ADDR_T_64BIT
+	select DMA_VIRT_OPS
+	help
+	This driver implements the InfiniBand RDMA transport over
+	the Linux network lo netdevice. It enables a system to
+	use the standard lo (loopback) netdevice to emulate a completely
+	software RDMA device. It doesn't implement any transport
+	layers. It implements only a data copier and the verbs layer, as
+	it works only on top of the local lo device. It follows the standard
+	InfiniBand specification version 1.3 and the RoCE annex. This is
+	a zero configuration driver.
diff --git a/drivers/infiniband/sw/loopback/Makefile b/drivers/infiniband/sw/loopback/Makefile
new file mode 100644
index 0000000..84e530d
--- /dev/null
+++ b/drivers/infiniband/sw/loopback/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_RDMA_LOOPBACK) += rdma_loopback.o
+
+rdma_loopback-y := loopback.o helper.o
diff --git a/drivers/infiniband/sw/loopback/loopback.c b/drivers/infiniband/sw/loopback/loopback.c
new file mode 100644
index 0000000..238ecfb
--- /dev/null
+++ b/drivers/infiniband/sw/loopback/loopback.c
@@ -0,0 +1,1603 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_mad.h>
+
+#include "loopback_helper.h"
+
+/* Per-device object limits. They are reported by loopback_query_device();
+ * the MR/QP/CQ/AH limits are also handed to init_table() when the resource
+ * tables are set up.
+ */
+enum {
+	LOOPBACK_MAX_MR	= (1 << 24) - 1,
+	LOOPBACK_MAX_QP	= (1 << 24) - 1,
+	LOOPBACK_MAX_CQ	= LOOPBACK_MAX_QP,
+	LOOPBACK_MAX_AH = INT_MAX,
+	LOOPBACK_MAX_PD = INT_MAX,
+};
+
+/* Driver user context; carries no state beyond the core ib_ucontext. */
+struct loopback_uctx {
+	struct ib_ucontext ibuctx;
+};
+
+/* Driver protection domain; carries no state beyond the core ib_pd. */
+struct loopback_pd {
+	struct ib_pd ibpd;
+};
+
+/* Completion queue: a fifo of struct loopback_cqe entries plus the last
+ * notification request recorded by loopback_req_notify_cq().
+ */
+struct loopback_cq {
+	/* resource entry must be first */
+	struct loopback_resource res;
+	struct ib_cq ibcq;
+	/* pending completions, consumed by loopback_poll_cq() */
+	struct loopback_fifo cqes;
+	/* non-zero when the consumer asked for a completion event */
+	enum ib_cq_notify_flags armed;
+};
+
+/* Values of loopback_mr.type; get_mr_va() selects the address translation
+ * routine based on this.
+ */
+enum {
+	LOOPBACK_MR_TYPE_PHY	= 0,	/* access physical sges of kernel */
+	LOOPBACK_MR_TYPE_USER	= 1,	/* userspace MR */
+	LOOPBACK_MR_TYPE_DMA	= 2,	/* DMA MR of kernel */
+	LOOPBACK_MR_TYPE_FRMR	= 3,	/* FRMR */
+};
+
+/* State specific to a userspace memory region (LOOPBACK_MR_TYPE_USER). */
+struct loopback_usr_mr {
+	/* pinned user memory; released by free_mr() */
+	struct ib_umem	*umem;
+	/* array of pages for this mr to access in datapath */
+	struct page *pages;
+	/* byte size computed by mr_pages_store_size() at registration time */
+	size_t pages_alloc_size;
+};
+
+/* State specific to a fast-registration MR (LOOPBACK_MR_TYPE_FRMR).
+ * NOTE(review): pg_iter is presumably the fill cursor into pg_tbl; its
+ * users are not visible in this part of the file.
+ */
+struct loopback_frmr {
+	int pg_iter;
+};
+
+/* Memory region. The lkey/rkey exposed through ibmr are derived from
+ * res.id by mr_id_to_mkey().
+ */
+struct loopback_mr {
+	struct loopback_resource res;
+	struct ib_mr ibmr;
+	/* one of LOOPBACK_MR_TYPE_* */
+	int type;
+	/* access flags supplied when the MR was created */
+	int access;
+
+	/* offset of the MR start inside its first page */
+	u32 fbo;
+	/* per-page kernel virtual addresses used by get_virt_mr_va() */
+	u64 *pg_tbl;
+	/* we store the page shift to make common use for frmr and user mr */
+	u32 page_shift;
+	union {
+		struct loopback_usr_mr umr;
+		struct loopback_frmr frmr;
+	} u;
+};
+
+/* Build the lkey/rkey value for an MR table id: the id occupies the upper
+ * 24 bits of the key, the low 8 bits are left as zero.
+ */
+static u32 mr_id_to_mkey(u32 id)
+{
+	const u32 mkey = id << 8;
+
+	return mkey;
+}
+
+/* Queue pair. Only the receive side is queued (rqes); send work requests
+ * are processed against the destination QP's rqes.
+ */
+struct loopback_qp {
+	struct loopback_resource res;
+	struct ib_qp ibqp;
+	/* current state, written by loopback_modify_qp() */
+	enum ib_qp_state state;
+	/* posted receive work requests (struct loopback_rqe) */
+	struct loopback_fifo rqes;
+
+	/* allocation flags chosen at create time (see loopback_create_qp()) */
+	gfp_t cqe_alloc_flags;
+	gfp_t rqe_alloc_flags;
+	/* true when the QP was created with udata, i.e. from userspace */
+	bool user_qp;
+	/* attributes accumulated by loopback_modify_qp() */
+	struct ib_qp_attr attr;
+	/* copy of the attributes passed to loopback_create_qp() */
+	struct ib_qp_init_attr init_attr;
+
+};
+
+/* Address handle together with a copy of its attributes. */
+struct loopback_ah {
+	struct loopback_resource res;
+	struct ib_ah ibah;
+	struct rdma_ah_attr attr;
+};
+
+/* The loopback rdma device: the core ib_device plus one resource table
+ * per object type that is looked up by id in the data path.
+ */
+struct rdma_loopdev {
+	struct ib_device dev;
+
+	struct loopback_resource_table mr_tbl;
+	struct loopback_resource_table cq_tbl;
+	struct loopback_resource_table qp_tbl;
+	struct loopback_resource_table ah_tbl;
+
+	/* template copied out by loopback_query_port() */
+	struct ib_port_attr port_attr;
+	struct loopback_mr zero_mr;
+};
+
+/* Map a core ib_device back to the rdma_loopdev that embeds it. */
+static inline struct rdma_loopdev *ib_to_loopdev(struct ib_device *dev)
+{
+	struct rdma_loopdev *ld = container_of(dev, struct rdma_loopdev, dev);
+
+	return ld;
+}
+
+/* Map a core ib_mr back to the loopback_mr that embeds it. */
+static inline struct loopback_mr *ib_to_loop_mr(struct ib_mr *ibmr)
+{
+	struct loopback_mr *mr = container_of(ibmr, struct loopback_mr, ibmr);
+
+	return mr;
+}
+
+/* Map a core ib_cq back to the loopback_cq that embeds it. */
+static inline struct loopback_cq *ib_to_loop_cq(struct ib_cq *ibcq)
+{
+	struct loopback_cq *cq = container_of(ibcq, struct loopback_cq, ibcq);
+
+	return cq;
+}
+
+/* Map a core ib_ah back to the loopback_ah that embeds it. */
+static inline struct loopback_ah *ib_to_loop_ah(struct ib_ah *ibah)
+{
+	struct loopback_ah *ah = container_of(ibah, struct loopback_ah, ibah);
+
+	return ah;
+}
+
+/* Map a core ib_qp back to the loopback_qp that embeds it. */
+static inline struct loopback_qp *ib_to_loop_qp(struct ib_qp *ibqp)
+{
+	struct loopback_qp *qp = container_of(ibqp, struct loopback_qp, ibqp);
+
+	return qp;
+}
+
+/* One queued completion: fifo linkage plus the work completion that
+ * loopback_poll_cq() copies out to the consumer.
+ */
+struct loopback_cqe {
+	struct list_head list;
+	struct ib_wc wc;
+};
+
+/* One posted receive work request, queued on loopback_qp.rqes.
+ * The scatter list is stored inline after the WR copy, so the entry is
+ * allocated with room for wr.num_sge trailing sges.
+ */
+struct loopback_rqe {
+	struct list_head list;
+
+	/* copy of recv wr */
+	struct ib_recv_wr wr;
+	/* flexible array member instead of the deprecated zero-length array */
+	struct ib_sge sges[];
+};
+
+/* Module-wide device instance.
+ * NOTE(review): assignment is not visible in this part of the file.
+ */
+static struct rdma_loopdev *loopdev;
+/* Netdevice backing the rdma port; its mtu and flags are read in
+ * loopback_query_port(). Presumably the 'lo' netdevice - confirm at the
+ * place where it is looked up.
+ */
+static struct net_device *lo;
+
+/* Initialise the per-object resource tables of @ld. The MR table is the
+ * only one set up with the 0xffffff00 mask and a shift of 8, matching
+ * the key layout produced by mr_id_to_mkey().
+ */
+static void init_loopdev_tables(struct rdma_loopdev *ld)
+{
+	struct loopback_resource_table *mrs = &ld->mr_tbl;
+	struct loopback_resource_table *cqs = &ld->cq_tbl;
+	struct loopback_resource_table *qps = &ld->qp_tbl;
+	struct loopback_resource_table *ahs = &ld->ah_tbl;
+
+	init_table(ahs, 0, LOOPBACK_MAX_AH, UINT_MAX, 0);
+	init_table(cqs, 0, LOOPBACK_MAX_CQ, UINT_MAX, 0);
+	init_table(mrs, 0, LOOPBACK_MAX_MR, 0xffffff00, 8);
+	init_table(qps, 1, LOOPBACK_MAX_QP, UINT_MAX, 0);
+}
+
+/* Report the device capabilities.
+ *
+ * @attr is cleared first, so every capability that is not listed below
+ * (vendor ids, hw version, page_size_cap, atomics, EE/RDD, MW, raw QPs,
+ * multicast, SRQ, signature offload, timestamp mask) is reported as 0.
+ * Always returns 0.
+ */
+static int loopback_query_device(struct ib_device *dev,
+				 struct ib_device_attr *attr,
+				 struct ib_udata *uhw)
+{
+	memset(attr, 0, sizeof(*attr));
+
+	/* identity and feature flags */
+	attr->sys_image_guid = dev->node_guid;
+	attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
+	attr->max_pkeys = 1;
+	attr->local_ca_ack_delay = 16;
+	/* TODO: hca_core_clock is left at 0 */
+
+	/* object counts */
+	attr->max_qp = LOOPBACK_MAX_QP;
+	attr->max_cq = LOOPBACK_MAX_CQ;
+	attr->max_mr = LOOPBACK_MAX_MR;
+	attr->max_pd = LOOPBACK_MAX_PD;
+	attr->max_ah = LOOPBACK_MAX_AH;
+
+	/* queue and scatter/gather sizing */
+	attr->max_qp_wr = 65536;
+	attr->max_cqe = 65536;
+	attr->max_send_sge = 16;
+	attr->max_recv_sge = 16;
+	attr->max_sge_rd = 16;
+
+	/* memory registration */
+	attr->max_mr_size = ULONG_MAX;
+	attr->max_fast_reg_page_list_len = 4;
+
+	/* outstanding rdma read/atomic resources */
+	attr->max_qp_rd_atom = 64;
+	attr->max_res_rd_atom = 64;
+	attr->max_qp_init_rd_atom = 64;
+	attr->max_ee_init_rd_atom = 64;
+
+	return 0;
+}
+
+/* Report the port attributes: start from the device's port_attr template
+ * and overlay the fixed values plus the live mtu and link state of the
+ * backing netdevice. Always returns 0.
+ */
+static int loopback_query_port(struct ib_device *dev, u8 port_num,
+			       struct ib_port_attr *attr)
+{
+	const struct rdma_loopdev *ld = ib_to_loopdev(dev);
+	const bool link_up = dev_get_flags(lo) & IFF_UP;
+
+	*attr = ld->port_attr;
+
+	/* fixed properties */
+	attr->ip_gids = 1;
+	attr->max_vl_num = 1;
+	attr->max_msg_sz = 1 << 24;
+	attr->max_mtu = IB_MTU_4096;
+	attr->active_width = 2;
+	attr->active_speed = IB_SPEED_HDR;
+	attr->phys_state = 5;	/* TODO */
+	attr->port_cap_flags = IB_PORT_CM_SUP;
+
+	/* properties derived from the netdevice */
+	attr->active_mtu = ib_mtu_int_to_enum(lo->mtu);
+	attr->state = link_up ? IB_PORT_ACTIVE : IB_PORT_DOWN;
+
+	return 0;
+}
+
+/* Return the netdevice behind the port with a reference held on it. */
+static struct net_device *
+loopback_get_netdev(struct ib_device *device, u8 port_num)
+{
+	struct net_device *ndev = lo;
+
+	dev_hold(ndev);
+	return ndev;
+}
+
+/* Every index reports the same pkey, 0xffff. Always returns 0. */
+static int loopback_query_pkey(struct ib_device *device,
+			       u8 port_num, u16 index, u16 *pkey)
+{
+	const u16 only_pkey = 0xffff;
+
+	*pkey = only_pkey;
+	return 0;
+}
+
+/* The port is always reported with an Ethernet link layer. */
+static enum rdma_link_layer
+loopback_get_link_layer(struct ib_device *dev, u8 port_num)
+{
+	const enum rdma_link_layer ll = IB_LINK_LAYER_ETHERNET;
+
+	return ll;
+}
+
+/* Fill the immutable port data: table lengths come from the device's
+ * port_attr template, the rest is fixed. Always returns 0.
+ */
+static int loopback_port_immutable(struct ib_device *dev, u8 port_num,
+				   struct ib_port_immutable *immutable)
+{
+	const struct ib_port_attr *pattr = &ib_to_loopdev(dev)->port_attr;
+
+	immutable->gid_tbl_len = pattr->gid_tbl_len;
+	immutable->pkey_tbl_len = pattr->pkey_tbl_len;
+	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+	immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+	return 0;
+}
+
+/* Nothing to set up per user context; always succeeds. */
+static int
+loopback_alloc_ucontext(struct ib_ucontext *ibuctx, struct ib_udata *udata)
+{
+	return 0;
+}
+
+/* Counterpart of loopback_alloc_ucontext(); nothing to tear down. */
+static void loopback_dealloc_ucontext(struct ib_ucontext *ibuctx)
+{
+}
+
+/* Nothing to set up per protection domain; always succeeds. */
+static int
+loopback_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+		  struct ib_udata *udata)
+{
+	/* Yes, we can further enhance core to not need null routines. */
+	return 0;
+}
+
+/* Counterpart of loopback_alloc_pd(); nothing to tear down. */
+static void loopback_dealloc_pd(struct ib_pd *ibpd)
+{
+}
+
+/* Move up to @num_entries queued completions from the CQ fifo into @wc,
+ * freeing each queued entry once it is copied out.
+ * Returns the number of completions written to @wc.
+ */
+static int loopback_poll_cq(struct ib_cq *ibcq, int num_entries,
+			    struct ib_wc *wc)
+{
+	struct loopback_cq *cq = ib_to_loop_cq(ibcq);
+	struct list_head *node;
+	int polled = 0;
+
+	while (num_entries-- != 0) {
+		struct loopback_cqe *done;
+
+		node = pop_from_fifo(&cq->cqes);
+		if (!node)
+			break;
+		done = container_of(node, struct loopback_cqe, list);
+		wc[polled++] = done->wc;
+		kfree(done);
+	}
+	return polled;
+}
+
+/* Invoke the CQ completion handler if the CQ is armed, has at least one
+ * queued completion and a handler is installed. The check and the call
+ * are made with the fifo lock held and interrupts disabled.
+ */
+static void attempt_notify_cq(struct loopback_cq *cq)
+{
+	struct ib_cq *ibcq = &cq->ibcq;
+	unsigned long irq_flags;
+	u64 pending;
+
+	spin_lock_irqsave(&cq->cqes.lock, irq_flags);
+	pending = get_fifo_entries(&cq->cqes);
+	if (cq->armed && pending && ibcq->comp_handler)
+		ibcq->comp_handler(ibcq, ibcq->cq_context);
+	spin_unlock_irqrestore(&cq->cqes.lock, irq_flags);
+}
+
+/* Record the requested notification mode and, if completions are already
+ * queued, call the completion handler right away. Both steps happen under
+ * the fifo lock. Always returns 0.
+ */
+static int loopback_req_notify_cq(struct ib_cq *ibcq,
+				  enum ib_cq_notify_flags arm)
+{
+	struct loopback_cq *cq = ib_to_loop_cq(ibcq);
+	unsigned long irq_flags;
+	u64 pending;
+
+	spin_lock_irqsave(&cq->cqes.lock, irq_flags);
+	cq->armed = arm;
+	pending = get_fifo_entries(&cq->cqes);
+	if (cq->armed && pending && ibcq->comp_handler)
+		ibcq->comp_handler(ibcq, ibcq->cq_context);
+	spin_unlock_irqrestore(&cq->cqes.lock, irq_flags);
+
+	return 0;
+}
+
+/* Allocate a CQ, initialise its completion fifo and register it in the
+ * device CQ table.
+ * Returns the embedded ib_cq, or an ERR_PTR on allocation or table
+ * failure. @attr, @context and @udata are not used.
+ */
+static struct ib_cq *
+loopback_create_cq(struct ib_device *dev,
+		   const struct ib_cq_init_attr *attr,
+		   struct ib_ucontext *context,
+		   struct ib_udata *udata)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(dev);
+	struct loopback_cq *new_cq;
+	int rc;
+
+	new_cq = kzalloc(sizeof(*new_cq), GFP_KERNEL);
+	if (!new_cq)
+		return ERR_PTR(-ENOMEM);
+
+	init_fifo(&new_cq->cqes);
+	rc = attach_table_id(&ld->cq_tbl, &new_cq->res);
+	if (!rc)
+		return &new_cq->ibcq;
+
+	kfree(new_cq);
+	return ERR_PTR(rc);
+}
+
+/* Unregister the CQ from the device table and free it.
+ *
+ * Completions that were generated but never polled are still linked on
+ * the fifo; each is a separate allocation, so they are drained and freed
+ * here instead of being leaked with the CQ. Always returns 0.
+ */
+static int loopback_destroy_cq(struct ib_cq *ibcq)
+{
+	struct loopback_cq *cq = ib_to_loop_cq(ibcq);
+	struct rdma_loopdev *ld = ib_to_loopdev(ibcq->device);
+	struct list_head *entry;
+
+	/* Drop the table entry first so that no new completion can be
+	 * queued while the fifo is being drained.
+	 * NOTE(review): relies on detach_table_id() waiting for in-flight
+	 * users, as described in loopback_dereg_mr().
+	 */
+	detach_table_id(&ld->cq_tbl, &cq->res);
+
+	while ((entry = pop_from_fifo(&cq->cqes)))
+		kfree(container_of(entry, struct loopback_cqe, list));
+
+	kfree(cq);
+	return 0;
+}
+
+/* Create a DMA type MR of length ULONG_MAX. Addresses used with such an
+ * MR are not translated (see get_dma_mr_va()).
+ * Returns the embedded ib_mr, or an ERR_PTR on allocation or table
+ * failure.
+ */
+static struct ib_mr *loopback_get_dma_mr(struct ib_pd *ibpd, int access)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device);
+	struct loopback_mr *dma_mr;
+	u32 mkey;
+	int rc;
+
+	dma_mr = kzalloc(sizeof(*dma_mr), GFP_KERNEL);
+	if (!dma_mr)
+		return ERR_PTR(-ENOMEM);
+
+	dma_mr->access = access;
+	dma_mr->type = LOOPBACK_MR_TYPE_DMA;
+	dma_mr->ibmr.length = ULONG_MAX;
+
+	rc = attach_table_id(&ld->mr_tbl, &dma_mr->res);
+	if (rc) {
+		kfree(dma_mr);
+		return ERR_PTR(rc);
+	}
+
+	/* lkey and rkey are the same value */
+	mkey = mr_id_to_mkey(dma_mr->res.id);
+	dma_mr->ibmr.lkey = mkey;
+	dma_mr->ibmr.rkey = mkey;
+	return &dma_mr->ibmr;
+}
+
+/* Size in bytes of the per-page address table needed for @umem.
+ *
+ * The result sizes loopback_mr.pg_tbl, whose elements are u64, so the
+ * element size must be sizeof(u64). Using the size of a pointer would
+ * under-allocate the table wherever pointers are narrower than 64 bits.
+ */
+static size_t mr_pages_store_size(struct ib_umem *umem)
+{
+	return (size_t)ib_umem_page_count(umem) * sizeof(u64);
+}
+
+/* Populate mr->pg_tbl with one kernel virtual address per page of @umem,
+ * walking the umem scatterlist in order. Each scatterlist entry may span
+ * several pages; its pages are taken as contiguous from
+ * page_address() of the entry's first page.
+ */
+static void fill_pg_table(struct loopback_mr *mr, struct ib_umem *umem)
+{
+	const int shift = mr->u.umr.umem->page_shift;
+	int page_size = BIT(shift);
+	struct scatterlist *sg;
+	int slot = 0;
+	int i;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+		int nr_pages = sg_dma_len(sg) >> shift;
+		void *base = page_address(sg_page(sg));
+		int n = 0;
+
+		while (n < nr_pages) {
+			mr->pg_tbl[slot] = (u64)base + (n * page_size);
+			n++;
+			slot++;
+		}
+	}
+}
+
+/* Release everything owned by @mr and the MR itself: the pinned user
+ * memory (user MRs only, if any), the page table and the structure.
+ */
+static void free_mr(struct loopback_mr *mr)
+{
+	const bool has_umem = mr->type == LOOPBACK_MR_TYPE_USER &&
+			      mr->u.umr.umem;
+
+	kfree(mr->pg_tbl);
+	if (has_umem)
+		ib_umem_release(mr->u.umr.umem);
+	kfree(mr);
+}
+
+/* Deregister an MR. Always returns 0. */
+static int loopback_dereg_mr(struct ib_mr *ibmr)
+{
+	struct loopback_mr *mr = ib_to_loop_mr(ibmr);
+	struct rdma_loopdev *ld = ib_to_loopdev(ibmr->device);
+
+	/* Order matters: the table entry is removed first so that no new
+	 * data path operation can pick up this mr and ongoing ones are
+	 * waited for (done through the table callback). Only then are the
+	 * umem, page table and mr freed.
+	 */
+	detach_table_id(&ld->mr_tbl, &mr->res);
+	free_mr(mr);
+
+	return 0;
+}
+
+/* Register a userspace memory region.
+ *
+ * Pins the user range [@start, @start + @length), builds the per-page
+ * address table used by the data path and registers the MR in the device
+ * MR table.
+ *
+ * Returns the embedded ib_mr on success or an ERR_PTR: -ENOMEM on
+ * allocation failure, the error reported by ib_umem_get(), or the error
+ * from attach_table_id().
+ */
+static struct ib_mr *
+loopback_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova,
+		     int access, struct ib_udata *udata)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device);
+	struct loopback_mr *mr;
+	struct ib_umem *umem;
+	size_t alloc_size;
+	int ret;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+	mr->type = LOOPBACK_MR_TYPE_USER;
+
+	/* ib_umem_get() reports failure through ERR_PTR(), not NULL. The
+	 * error pointer must never be stored in the mr, otherwise free_mr()
+	 * would hand it to ib_umem_release().
+	 */
+	umem = ib_umem_get(udata, start, length, access, 0);
+	if (IS_ERR_OR_NULL(umem)) {
+		ret = umem ? PTR_ERR(umem) : -ENOMEM;
+		goto err;
+	}
+	mr->u.umr.umem = umem;
+
+	alloc_size = mr_pages_store_size(umem);
+	mr->pg_tbl = kzalloc(alloc_size, GFP_KERNEL);
+	if (!mr->pg_tbl) {
+		/* TODO: Remove 2GB registration limit */
+		pr_err("%s alloc_size = %zu, page cnt = %d\n", __func__,
+		       alloc_size, ib_umem_page_count(umem));
+		ret = -ENOMEM;
+		goto err;
+	}
+	/* mr->u.umr.pages stays NULL from kzalloc(); the data path only
+	 * uses pg_tbl.
+	 */
+	mr->u.umr.pages_alloc_size = alloc_size;
+	mr->fbo = ib_umem_offset(umem);
+	mr->page_shift = umem->page_shift;
+	mr->access = access;
+	mr->ibmr.iova = iova;
+	mr->ibmr.length = length;
+	fill_pg_table(mr, umem);
+
+	ret = attach_table_id(&ld->mr_tbl, &mr->res);
+	if (ret)
+		goto err;
+	mr->ibmr.lkey = mr_id_to_mkey(mr->res.id);
+	mr->ibmr.rkey = mr->ibmr.lkey;
+	pr_debug("%s mr=0x%x fbo=0x%x, len=%lld pg_sz=%d pg_cnt=%d\n",
+		 __func__, mr->ibmr.lkey, mr->fbo,
+		 mr->ibmr.length, mr->ibmr.page_size,
+		 ib_umem_page_count(umem));
+	return &mr->ibmr;
+
+err:
+	/* free_mr() copes with a partially set up mr */
+	free_mr(mr);
+	return ERR_PTR(ret);
+}
+
+/* Address translation for PHY/DMA MRs: @va is used as the kernel virtual
+ * address as is and the whole @cpy_len is reported as usable in *@ret_len.
+ */
+static void *get_dma_mr_va(const struct loopback_mr *mr, u64 va,
+			   u32 cpy_len, u32 *ret_len)
+{
+	void *kva = (void *)(uintptr_t)va;
+
+	*ret_len = cpy_len;
+	return kva;
+}
+
+/* Address translation for page-table based MRs (user MR and FRMR).
+ *
+ * @mr:      memory region; @va must lie inside it (callers validate this
+ *           through validate_mr_access())
+ * @va:      address in the MR's iova space
+ * @cpy_len: number of bytes the caller wants to access
+ * @ret_len: set to the number of bytes usable at the returned address,
+ *           i.e. @cpy_len clamped to the end of the containing page
+ *
+ * Returns the kernel virtual address that corresponds to @va.
+ */
+static void *get_virt_mr_va(const struct loopback_mr *mr, u64 va,
+			    u32 cpy_len, u32 *ret_len)
+{
+	const u64 page_size = 1ULL << mr->page_shift;
+	u64 zero_based_offset;
+	u32 in_pg_offset;
+	u64 pg_idx;
+
+	/* Offset of @va from the start of the first page of the MR. The
+	 * first byte offset (fbo) is included because the MR may start in
+	 * the middle of its first page.
+	 */
+	zero_based_offset = (va - mr->ibmr.iova) + mr->fbo;
+
+	/* Split into page index and offset inside that page. Doing this on
+	 * the 64-bit offset avoids truncation for large MRs and needs no
+	 * special case for the first page.
+	 */
+	pg_idx = zero_based_offset >> mr->page_shift;
+	in_pg_offset = zero_based_offset & (page_size - 1);
+
+	*ret_len = min_t(u32, page_size - in_pg_offset, cpy_len);
+	return (void *)(uintptr_t)(mr->pg_tbl[pg_idx] + in_pg_offset);
+}
+
+/* Translate @va of @mr into a kernel virtual address, choosing the
+ * translation routine by MR type. *@ret_len receives the number of bytes
+ * usable at the returned address. Returns NULL for an unknown MR type.
+ */
+static void *get_mr_va(const struct loopback_mr *mr, u64 va,
+		       u32 cpy_len, u32 *ret_len)
+{
+	const int type = mr->type;
+
+	if (type == LOOPBACK_MR_TYPE_PHY || type == LOOPBACK_MR_TYPE_DMA)
+		return get_dma_mr_va(mr, va, cpy_len, ret_len);
+
+	if (type == LOOPBACK_MR_TYPE_USER || type == LOOPBACK_MR_TYPE_FRMR)
+		return get_virt_mr_va(mr, va, cpy_len, ret_len);
+
+	return NULL;
+}
+
+/* Queue a receive completion for @rqe on the receive CQ of @dqp and try
+ * to notify the CQ consumer.
+ *
+ * The completion is allocated with the QP's cqe_alloc_flags; if that
+ * allocation fails the completion is silently dropped.
+ */
+static void generate_rc_rq_cqe(struct loopback_qp *dqp,
+			       struct loopback_rqe *rqe,
+			       u32 recv_len,
+			       enum ib_wc_status rqe_status,
+			       u32 inv_key, u32 wc_flags)
+{
+	struct loopback_cq *recv_cq = ib_to_loop_cq(dqp->ibqp.recv_cq);
+	struct loopback_cqe *entry;
+	struct ib_wc *wc;
+
+	/* Generate receive completion on best effort basis */
+	entry = kzalloc(sizeof(*entry), dqp->cqe_alloc_flags);
+	if (!entry)
+		return;
+
+	wc = &entry->wc;
+	wc->qp = &dqp->ibqp;
+	wc->port_num = 1;
+	wc->opcode = IB_WC_RECV;
+	wc->wr_cqe = rqe->wr.wr_cqe;
+	wc->status = rqe_status;
+	wc->byte_len = recv_len;
+	wc->wc_flags = wc_flags;
+	wc->ex.invalidate_rkey = inv_key;
+
+	push_to_fifo(&recv_cq->cqes, &entry->list);
+	attempt_notify_cq(recv_cq);
+}
+
+/* Queue a UD receive completion for @rqe on the receive CQ of @dqp and
+ * try to notify the CQ consumer. The source QP number is taken from
+ * @sqp.
+ *
+ * The completion is allocated with the QP's cqe_alloc_flags; if that
+ * allocation fails the completion is silently dropped.
+ */
+static void generate_ud_rq_cqe(const struct loopback_qp *sqp,
+			       struct loopback_qp *dqp,
+			       struct loopback_rqe *rqe,
+			       u32 recv_len,
+			       u8 network_hdr_type, int wc_flags,
+			       enum ib_wc_status rqe_status)
+{
+	struct loopback_cq *recv_cq = ib_to_loop_cq(dqp->ibqp.recv_cq);
+	struct loopback_cqe *entry;
+	struct ib_wc *wc;
+
+	/* Generate receive completion on best effort basis */
+	entry = kzalloc(sizeof(*entry), dqp->cqe_alloc_flags);
+	if (!entry)
+		return;
+
+	wc = &entry->wc;
+	wc->qp = &dqp->ibqp;
+	wc->src_qp = sqp->ibqp.qp_num;
+	wc->port_num = 1;
+	wc->opcode = IB_WC_RECV;
+	wc->wr_cqe = rqe->wr.wr_cqe;
+	wc->status = rqe_status;
+	wc->byte_len = recv_len;
+	wc->network_hdr_type = network_hdr_type;
+	wc->wc_flags = wc_flags;
+
+	push_to_fifo(&recv_cq->cqes, &entry->list);
+	attempt_notify_cq(recv_cq);
+}
+
+/* Only GSI and RC queue pairs can be created on this device. */
+static bool is_qp_supported(enum ib_qp_type type)
+{
+	switch (type) {
+	case IB_QPT_GSI:
+	case IB_QPT_RC:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/* Create a queue pair and register it in the device QP table.
+ *
+ * A GSI QP is attached under the fixed id 1; any other supported type
+ * gets an id from the table. The id becomes the QP number.
+ *
+ * Returns the embedded ib_qp, or ERR_PTR(-EINVAL) for an unsupported QP
+ * type, ERR_PTR(-ENOMEM) on allocation failure, or the table error.
+ */
+static struct ib_qp *
+loopback_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *attr,
+		   struct ib_udata *udata)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device);
+	const bool from_user = udata ? true : false;
+	/* Since we generate cqes under rcu read lock,
+	 * cqe allocations are atomic.
+	 */
+	const gfp_t gfp = from_user ? GFP_KERNEL : GFP_ATOMIC;
+	struct loopback_qp *new_qp;
+	int rc;
+
+	if (!is_qp_supported(attr->qp_type))
+		return ERR_PTR(-EINVAL);
+
+	new_qp = kzalloc(sizeof(*new_qp), GFP_KERNEL);
+	if (!new_qp)
+		return ERR_PTR(-ENOMEM);
+
+	new_qp->init_attr = *attr;
+	new_qp->user_qp = from_user;
+	new_qp->cqe_alloc_flags = gfp;
+	new_qp->rqe_alloc_flags = gfp;
+	init_fifo(&new_qp->rqes);
+
+	rc = (attr->qp_type == IB_QPT_GSI) ?
+		attach_table_id_for_id(&ld->qp_tbl, &new_qp->res, 1) :
+		attach_table_id(&ld->qp_tbl, &new_qp->res);
+	if (rc) {
+		kfree(new_qp);
+		return ERR_PTR(rc);
+	}
+
+	new_qp->ibqp.qp_num = new_qp->res.id;
+	return &new_qp->ibqp;
+}
+
+/* Remove and free every posted receive WR of @qp. When @gen_cqe is set a
+ * flush-error completion (IB_WC_WR_FLUSH_ERR, zero length) is generated
+ * for each of them first.
+ */
+static void loopbak_flush_rq(struct loopback_qp *qp, bool gen_cqe)
+{
+	struct list_head *node;
+
+	while ((node = pop_from_fifo(&qp->rqes)) != NULL) {
+		struct loopback_rqe *flushed =
+			container_of(node, struct loopback_rqe, list);
+
+		if (gen_cqe)
+			generate_rc_rq_cqe(qp, flushed, 0,
+					   IB_WC_WR_FLUSH_ERR, 0, 0);
+		kfree(flushed);
+	}
+}
+
+/* Apply the attributes selected by @mask to the QP.
+ *
+ * Only access flags, destination QP number and state are recorded; other
+ * mask bits are ignored. A state change quiesces the data path before it
+ * takes effect. Always returns 0.
+ */
+static int loopback_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+			      int mask, struct ib_udata *udata)
+{
+	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+
+	if (mask & IB_QP_ACCESS_FLAGS)
+		qp->attr.qp_access_flags = attr->qp_access_flags;
+	if (mask & IB_QP_DEST_QPN)
+		qp->attr.dest_qp_num = attr->dest_qp_num;
+	if (mask & IB_QP_STATE) {
+		/* Publish the new state before draining */
+		WRITE_ONCE(qp->state, attr->qp_state);
+
+		/* Drop our own reference so that the completion below fires
+		 * once the last data path user is gone.
+		 * NOTE(review): presumably put_table_entry() completes
+		 * res.completion when the refcount reaches zero - confirm in
+		 * the helper; no reinit of the completion is visible here.
+		 */
+		put_table_entry(&qp->res);
+		/* Wait for all datapath operations to stop */
+		wait_for_completion(&qp->res.completion);
+
+		/* Entering error or reset flushes the posted receives with
+		 * flush-error completions.
+		 */
+		if (attr->qp_state == IB_QPS_ERR ||
+		    attr->qp_state == IB_QPS_RESET) {
+			loopbak_flush_rq(qp, true);
+		}
+		/* Reinit the refcount so that new data path ops can start
+		 * after a new state. There is extremely rare corner case
+		 * where RTR->RTS transition time, sender is sending the
+		 * data, which this driver doesn't support currently.
+		 */
+		refcount_set(&qp->res.refcount, 1);
+	}
+	return 0;
+}
+
+/* Report the QP attributes recorded by loopback_modify_qp() and the
+ * creation attributes. Always returns 0.
+ *
+ * The QP state lives in qp->state rather than in qp->attr, so it is
+ * filled in explicitly; otherwise the caller would always see state 0.
+ */
+static int loopback_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+			     int mask, struct ib_qp_init_attr *init_attr)
+{
+	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+
+	*attr = qp->attr;
+	attr->qp_state = READ_ONCE(qp->state);
+	attr->cur_qp_state = attr->qp_state;
+	*init_attr = qp->init_attr;
+	return 0;
+}
+
+/* Unregister the QP from the device table, free its posted receives
+ * without generating completions, and free the QP. Always returns 0.
+ */
+static int loopback_destroy_qp(struct ib_qp *ibqp)
+{
+	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+	struct rdma_loopdev *ld = ib_to_loopdev(ibqp->device);
+	const bool gen_cqe = false;
+
+	detach_table_id(&ld->qp_tbl, &qp->res);
+	loopbak_flush_rq(qp, gen_cqe);
+	kfree(qp);
+
+	return 0;
+}
+
+/* Map a send work request opcode to the opcode reported in its work
+ * completion.
+ *
+ * The values returned are enum ib_wc_opcode constants, so the function is
+ * declared with that type (it used to be declared as enum ib_wc_status).
+ * Opcodes without an explicit mapping are reported as IB_WC_SEND.
+ */
+static enum ib_wc_opcode sq_opcode_to_wc_opcode(enum ib_wr_opcode opcode)
+{
+	switch (opcode) {
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return IB_WC_RDMA_WRITE;
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_SEND_WITH_INV:
+		return IB_WC_SEND;
+	case IB_WR_RDMA_READ:
+		return IB_WC_RDMA_READ;
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		return IB_WC_COMP_SWAP;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		return IB_WC_FETCH_ADD;
+	case IB_WR_LSO:
+		return IB_WC_LSO;
+	case IB_WR_LOCAL_INV:
+		return IB_WC_LOCAL_INV;
+	case IB_WR_REG_MR:
+	case IB_WR_REG_SIG_MR:
+		return IB_WC_REG_MR;
+	case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+		return IB_WC_MASKED_COMP_SWAP;
+	case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+		return IB_WC_MASKED_FETCH_ADD;
+	default:
+		/* TODO: no better default value */
+		return IB_WC_SEND;
+	}
+}
+
+/* Sum of the lengths of the first @num_sges entries of @sg_list. */
+static u64 get_sges_len(const struct ib_sge *sg_list, int num_sges)
+{
+	u64 total = 0;
+
+	while (num_sges-- > 0)
+		total += (sg_list++)->length;
+
+	return total;
+}
+
+/* Payload length of a send work request: the sum of its sge lengths for
+ * the data carrying opcodes, 0 for every other opcode.
+ *
+ * RDMA write with immediate carries data exactly like a plain RDMA write,
+ * so it is counted as well; leaving it out would make such a request look
+ * empty to the copy routines.
+ */
+static u64 get_send_wqe_len(const struct ib_send_wr *wr)
+{
+	u64 send_len = 0;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_INV:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		send_len = get_sges_len(wr->sg_list, wr->num_sge);
+		break;
+	default:
+		break;
+	}
+	return send_len;
+}
+
+/* Total buffer space described by the scatter list of @rqe. */
+static u64 get_rqe_len(struct loopback_rqe *rqe)
+{
+	const struct ib_sge *first = &rqe->sges[0];
+	const int count = rqe->wr.num_sge;
+
+	return get_sges_len(first, count);
+}
+
+/* Drop a reference on @mr. NULL and ERR_PTR values are ignored so that
+ * error paths can call this unconditionally.
+ */
+static void put_mr(struct loopback_mr *mr)
+{
+	if (IS_ERR(mr) || !mr)
+		return;
+
+	put_table_entry(&mr->res);
+}
+
+/* Look up the MR registered under @key in the device MR table.
+ * Returns the MR, or ERR_PTR(-EINVAL) if the table has no such entry.
+ * Callers release the MR with put_mr().
+ */
+static struct loopback_mr *get_mr_for_key(struct rdma_loopdev *ld, u32 key)
+{
+	struct loopback_resource *found;
+
+	found = get_table_entry_by_id(&ld->mr_tbl, key);
+	if (found)
+		return container_of(found, struct loopback_mr, res);
+
+	return ERR_PTR(-EINVAL);
+}
+
+/* Check whether @qp may access [@addr, @addr + @len) of @mr for @opc.
+ *
+ * Rules enforced:
+ *  - the access must not be longer than the MR;
+ *  - a user QP may only use user MRs;
+ *  - for RDMA write/read and atomic opcodes the QP access flags and the
+ *    MR access flags must have at least one bit in common;
+ *  - for user MRs and FRMRs the range must lie inside
+ *    [iova, iova + length).
+ *
+ * Returns 0 when access is allowed, -EACCES otherwise.
+ */
+static int validate_mr_access(const struct loopback_mr *mr,
+			      const struct loopback_qp *qp,
+			      enum ib_wr_opcode opc, u64 addr, u32 len)
+{
+	u64 offset;
+
+	if (len > mr->ibmr.length ||
+	    (qp->user_qp && mr->type != LOOPBACK_MR_TYPE_USER))
+		return -EACCES;
+
+	if ((opc == IB_WR_RDMA_WRITE ||
+	     opc == IB_WR_RDMA_WRITE_WITH_IMM ||
+	     opc == IB_WR_RDMA_READ ||
+	     opc == IB_WR_ATOMIC_CMP_AND_SWP ||
+	     opc == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+	     ((qp->attr.qp_access_flags & mr->access) == 0))
+		return -EACCES;
+
+	/* MR length and iova checks are applicable to FRMR and user type */
+	if (mr->type == LOOPBACK_MR_TYPE_PHY ||
+	    mr->type == LOOPBACK_MR_TYPE_DMA)
+		return 0;
+
+	/* Address within range check, written with subtractions only so
+	 * that a large addr or iova cannot wrap the comparison around.
+	 * len <= mr->ibmr.length is already known from the first check.
+	 */
+	if (addr < mr->ibmr.iova)
+		return -EACCES;
+	offset = addr - mr->ibmr.iova;
+	if (offset > mr->ibmr.length || len > mr->ibmr.length - offset)
+		return -EACCES;
+	return 0;
+}
+
+/* Look up the MR named by @sge->lkey and validate that @qp may access the
+ * sge's address range for opcode @opc.
+ * Returns the MR (release with put_mr()) or an ERR_PTR; on validation
+ * failure the MR is released before the error is returned.
+ */
+static struct loopback_mr *
+get_mr_for_wr_sge(struct rdma_loopdev *ld, const struct loopback_qp *qp,
+		  enum ib_wr_opcode opc, const struct ib_sge *sge)
+{
+	struct loopback_mr *mr = get_mr_for_key(ld, sge->lkey);
+	int rc;
+
+	if (IS_ERR(mr))
+		return mr;
+
+	rc = validate_mr_access(mr, qp, opc, sge->addr, sge->length);
+	if (!rc)
+		return mr;
+
+	put_mr(mr);
+	return ERR_PTR(rc);
+}
+
+/* Look up the MR named by @mkey and validate that @qp may access
+ * [@addr, @addr + @len) of it for opcode @opc.
+ * Returns the MR (release with put_mr()) or an ERR_PTR; on validation
+ * failure the MR is released before the error is returned.
+ */
+static struct loopback_mr *
+get_mr_for_rkey(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		enum ib_wr_opcode opc, u64 addr, u32 len, u32 mkey)
+{
+	struct loopback_mr *mr = get_mr_for_key(ld, mkey);
+	int rc;
+
+	if (IS_ERR(mr))
+		return mr;
+
+	rc = validate_mr_access(mr, qp, opc, addr, len);
+	if (!rc)
+		return mr;
+
+	put_mr(mr);
+	return ERR_PTR(rc);
+}
+
+/* Copy @send_len bytes from the scatter list of send WR @wr into the
+ * buffers of receive queue entry @rqe.
+ *
+ * @ld:       device, used for MR lookups
+ * @sqp:      sending QP; source sges are validated against it
+ * @dqp:      receiving QP; destination sges are validated against it
+ * @wr:       send work request providing the source sges
+ * @rqe:      receive entry providing the destination sges
+ * @send_len: number of bytes to copy
+ * @dst_fbo:  bytes to skip at the start of the first destination sge
+ *            (used by UD to leave room for the GRH)
+ *
+ * Source and destination are walked in parallel; each step copies the
+ * largest chunk that stays inside the current source sge, the current
+ * destination sge and, for page based MRs, the current page of each.
+ *
+ * Returns 0 on success or a negative errno when an MR lookup/validation
+ * fails or the sge lists are too short for @send_len.
+ */
+static int
+copy_data_wqe_to_rqe(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+		     struct loopback_qp *dqp,
+		     const struct ib_send_wr *wr, struct loopback_rqe *rqe,
+		     u32 send_len, u32 dst_fbo)
+{
+	struct loopback_mr *src_mr = NULL;
+	struct loopback_mr *dst_mr = NULL;
+	u32 src_sge_len = 0;
+	u32 dst_sge_len = 0;
+	u64 src_sge_va = 0;
+	u64 dst_sge_va = 0;
+	u32 ret_len = 0;
+	u32 cpy_len = 0;
+	void *src_addr;
+	void *dst_addr;
+	int s_idx = 0;
+	int d_idx = 0;
+	int ret = 0;
+
+	while (send_len) {
+		if (!src_mr) {
+			/* never walk past the end of the source sge list */
+			if (s_idx >= wr->num_sge) {
+				ret = -EINVAL;
+				goto err;
+			}
+			src_mr = get_mr_for_wr_sge(ld, sqp, wr->opcode,
+						   &wr->sg_list[s_idx]);
+			if (IS_ERR(src_mr)) {
+				ret = PTR_ERR(src_mr);
+				goto err;
+			}
+			src_sge_len = wr->sg_list[s_idx].length;
+			src_sge_va = wr->sg_list[s_idx].addr;
+		}
+		if (!dst_mr) {
+			/* never walk past the end of the receive sge list */
+			if (d_idx >= rqe->wr.num_sge) {
+				ret = -EINVAL;
+				goto err;
+			}
+			/* The receive buffers belong to the destination QP,
+			 * so they are validated against dqp, in the same way
+			 * as write_ud_grh_hdr() does.
+			 */
+			dst_mr = get_mr_for_wr_sge(ld, dqp, wr->opcode,
+						   &rqe->sges[d_idx]);
+			if (IS_ERR(dst_mr)) {
+				ret = PTR_ERR(dst_mr);
+				goto err;
+			}
+			if (dst_fbo > rqe->sges[d_idx].length) {
+				ret = -EINVAL;
+				goto err;
+			}
+			/* The skipped bytes are not available for data, so
+			 * they must come off the usable sge length too.
+			 */
+			dst_sge_len = rqe->sges[d_idx].length - dst_fbo;
+			dst_sge_va = rqe->sges[d_idx].addr + dst_fbo;
+			/* the offset applies to the first sge only */
+			dst_fbo = 0;
+		}
+
+		/* copy data of minimum length between and src and dst sge */
+		cpy_len = min_t(u32, src_sge_len, dst_sge_len);
+
+		src_addr = get_mr_va(src_mr, src_sge_va, cpy_len, &ret_len);
+		cpy_len = min_t(u32, cpy_len, ret_len);
+
+		dst_addr = get_mr_va(dst_mr, dst_sge_va, cpy_len, &ret_len);
+		cpy_len = min_t(u32, cpy_len, ret_len);
+
+		/* get_mr_va() returns NULL for an MR type it cannot map */
+		if (!src_addr || !dst_addr) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		memcpy(dst_addr, src_addr, cpy_len);
+
+		src_sge_len -= cpy_len;
+		dst_sge_len -= cpy_len;
+		src_sge_va += cpy_len;
+		dst_sge_va += cpy_len;
+		send_len -= cpy_len;
+
+		/* move on to the next sge once the current one is used up */
+		if (!src_sge_len) {
+			s_idx++;
+			put_mr(src_mr);
+			src_mr = NULL;
+		}
+		if (!dst_sge_len) {
+			d_idx++;
+			put_mr(dst_mr);
+			dst_mr = NULL;
+		}
+	}
+err:
+	/* put_mr() ignores NULL and ERR_PTR values */
+	put_mr(dst_mr);
+	put_mr(src_mr);
+	return ret;
+}
+
+/* Execute the data movement of an RDMA read or write work request.
+ *
+ * The remote side is described by the rkey and remote address of @wr and
+ * is validated against @dqp; the local side is the WR's scatter list and
+ * is validated against @sqp. For IB_WR_RDMA_READ data flows from the rkey
+ * MR into the local sges, for every other opcode from the local sges into
+ * the rkey MR.
+ *
+ * Returns 0 on success or the negative errno of the failed MR lookup or
+ * validation.
+ */
+static int copy_data_wqe_rkey(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+			      struct loopback_qp *dqp,
+			      const struct ib_send_wr *wr)
+{
+	const struct ib_rdma_wr *rdmawr = rdma_wr(wr);
+	u64 rdma_len = get_send_wqe_len(wr);
+	struct loopback_mr *wr_mr = NULL;
+	struct loopback_mr *rkey_mr;
+	u32 wr_sge_len = 0;
+	u64 wr_sge_va = 0;
+	void *wr_sge_addr;
+	u64 rkey_va = 0;
+	u32 ret_len = 0;
+	u32 cpy_len = 0;
+	void *rkey_addr;
+	int sge_idx = 0;
+	int ret = 0;
+
+	/* The whole transfer is validated against the rkey MR up front.
+	 * NOTE(review): rdma_len is u64 but the len parameter of
+	 * get_mr_for_rkey() is u32.
+	 */
+	rkey_va = rdmawr->remote_addr;
+	rkey_mr = get_mr_for_rkey(ld, dqp, wr->opcode, rkey_va,
+				  rdma_len, rdmawr->rkey);
+	if (IS_ERR(rkey_mr))
+		return PTR_ERR(rkey_mr);
+
+	while (rdma_len) {
+		/* load the next local sge when the previous one is used up */
+		if (!wr_mr) {
+			wr_mr = get_mr_for_wr_sge(ld, sqp, wr->opcode,
+						  &wr->sg_list[sge_idx]);
+			if (IS_ERR(wr_mr)) {
+				ret = PTR_ERR(wr_mr);
+				goto err;
+			}
+			wr_sge_len = wr->sg_list[sge_idx].length;
+			wr_sge_va = wr->sg_list[sge_idx].addr;
+		}
+
+		/* Start from what is left of the local sge and shrink the
+		 * chunk to what each mapping can provide contiguously.
+		 */
+		cpy_len = wr_sge_len;
+
+		wr_sge_addr = get_mr_va(wr_mr, wr_sge_va, cpy_len, &ret_len);
+		cpy_len = min_t(u32, cpy_len, ret_len);
+
+		rkey_addr = get_mr_va(rkey_mr, rkey_va, cpy_len, &ret_len);
+		cpy_len = min_t(u32, cpy_len, ret_len);
+
+		if (wr->opcode == IB_WR_RDMA_READ) {
+			/* rdma read => read from remote rkey to local sges */
+			memcpy(wr_sge_addr, rkey_addr, cpy_len);
+		} else {
+			/* rdma write => local sges to remote rkey */
+			memcpy(rkey_addr, wr_sge_addr, cpy_len);
+		}
+
+		wr_sge_len -= cpy_len;
+		wr_sge_va += cpy_len;
+		rkey_va += cpy_len;
+		rdma_len -= cpy_len;
+
+		if (!wr_sge_len) {
+			sge_idx++;
+			put_mr(wr_mr);
+			wr_mr = NULL;
+		}
+	}
+err:
+	/* put_mr() ignores NULL and ERR_PTR values */
+	put_mr(wr_mr);
+	put_mr(rkey_mr);
+	return ret;
+}
+
+/* Write the network header that precedes the payload of a UD receive
+ * into the first sge of @rqe.
+ *
+ * The header is an IPv4 one with source and destination 127.0.0.1 and a
+ * ttl of 1; all other header bytes are zero.
+ *
+ * Returns 0 on success, -EINVAL when the first sge is smaller than the
+ * header or the header does not fit contiguously at the mapped address,
+ * or the errno of the failed MR lookup/validation.
+ */
+static int
+write_ud_grh_hdr(struct rdma_loopdev *ld, struct loopback_qp *dqp,
+		 const struct ib_send_wr *wr, struct loopback_rqe *rqe)
+{
+	const struct ib_sge *first_sge = &rqe->sges[0];
+	union rdma_network_hdr hdr = {};
+	struct loopback_mr *dst_mr;
+	u32 contig_len = 0;
+	void *dst_addr;
+	int ret = 0;
+
+	/* Even though spec allows to split first 40 bytes header in 40 sges,
+	 * there isn't good usecase, so this expect minimum 40 bytes sge.
+	 */
+	if (first_sge->length < sizeof(hdr))
+		return -EINVAL;
+
+	dst_mr = get_mr_for_wr_sge(ld, dqp, wr->opcode, first_sge);
+	if (IS_ERR(dst_mr))
+		return PTR_ERR(dst_mr);
+
+	dst_addr = get_mr_va(dst_mr, first_sge->addr, sizeof(hdr),
+			     &contig_len);
+	if (contig_len == sizeof(hdr)) {
+		/* TODO: consider ipv6 */
+		hdr.roce4grh.saddr = htonl(0x7f000001);
+		hdr.roce4grh.daddr = htonl(0x7f000001);
+		hdr.roce4grh.ttl = 1;
+		memcpy(dst_addr, &hdr, sizeof(hdr));
+	} else {
+		ret = -EINVAL;
+	}
+
+	put_mr(dst_mr);
+	return ret;
+}
+
+/* Deliver one UD send from @qp to @dqp.
+ *
+ * Takes the oldest posted receive of @dqp, writes the network header and
+ * the payload into it, generates the receive completion and frees the
+ * receive entry. The outcome for the sender is stored in
+ * @wq_cqe->wc.status.
+ */
+static void
+post_one_ud_send_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		     struct loopback_qp *dqp,
+		     const struct ib_send_wr *wr,
+		     struct loopback_cqe *wq_cqe)
+{
+	u32 send_len = get_send_wqe_len(wr);
+	struct loopback_rqe *rqe = NULL;
+	enum ib_wc_status recv_status;
+	struct list_head *rqe_entry;
+	u32 recv_len;
+	int ret;
+
+	/* No receive posted: fail the send, nothing to complete on the
+	 * receive side.
+	 */
+	rqe_entry = pop_from_fifo(&dqp->rqes);
+	if (!rqe_entry) {
+		wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+		return;
+	}
+	rqe = container_of(rqe_entry, struct loopback_rqe, list);
+	recv_len = get_rqe_len(rqe);
+
+	/* The receive buffer must hold the network header plus payload */
+	if (send_len + sizeof(union rdma_network_hdr) > recv_len) {
+		recv_len = 0;
+		recv_status = IB_WC_GENERAL_ERR;
+		wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+	} else {
+		/* copy minimum data + grh of what is sent and rqe size */
+		recv_len = min_t(u32, send_len +
+				 sizeof(union rdma_network_hdr), recv_len);
+
+		ret = write_ud_grh_hdr(ld, dqp, wr, rqe);
+		if (ret) {
+			recv_status = IB_WC_GENERAL_ERR;
+			wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+			goto done;
+		}
+		/* payload starts right behind the header in the first sge */
+		ret = copy_data_wqe_to_rqe(ld, qp, dqp, wr, rqe, send_len,
+					   sizeof(union rdma_network_hdr));
+		if (ret) {
+			recv_status = IB_WC_GENERAL_ERR;
+			wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+		} else {
+			recv_status = IB_WC_SUCCESS;
+			wq_cqe->wc.status = IB_WC_SUCCESS;
+		}
+	}
+done:
+	/* The receive is consumed in every case, success or error.
+	 * NOTE(review): on header/copy failure recv_len still holds the
+	 * header + payload size.
+	 */
+	generate_ud_rq_cqe(qp, dqp, rqe, recv_len, RDMA_NETWORK_IPV4,
+			   IB_WC_WITH_NETWORK_HDR_TYPE |
+			   IB_WC_GRH, recv_status);
+	kfree(rqe);
+}
+
+/* Look up the QP registered under @qpn in the device QP table.
+ * Returns the QP, or ERR_PTR(-EINVAL) if the table has no such entry.
+ * Callers release the QP with put_qp().
+ */
+static struct loopback_qp *
+get_qp_by_qpn(struct rdma_loopdev *ld, u32 qpn)
+{
+	struct loopback_resource *found;
+
+	found = get_table_entry_by_id(&ld->qp_tbl, qpn);
+	if (found)
+		return container_of(found, struct loopback_qp, res);
+
+	return ERR_PTR(-EINVAL);
+}
+
+/* Take a reference on @qp unless its refcount already dropped to zero.
+ * Returns @qp on success, ERR_PTR(-EINVAL) otherwise.
+ */
+static struct loopback_qp *get_qp(struct loopback_qp *qp)
+{
+	if (!refcount_inc_not_zero(&qp->res.refcount))
+		return ERR_PTR(-EINVAL);
+
+	return qp;
+}
+
+/* Drop a reference on @qp. NULL and ERR_PTR values are ignored so that
+ * error paths can call this unconditionally.
+ */
+static void put_qp(struct loopback_qp *qp)
+{
+	if (IS_ERR(qp) || !qp)
+		return;
+
+	put_table_entry(&qp->res);
+}
+
+/* Process one UD work request posted on @qp.
+ *
+ * The destination QP is looked up by the remote QPN of the WR. Only
+ * IB_WR_SEND is handled; any other opcode completes with
+ * IB_WC_GENERAL_ERR. The sender's completion in @wq_cqe always gets the
+ * posting QP number as src_qp.
+ */
+static void
+post_one_ud_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		const struct ib_send_wr *wr,
+		struct loopback_cqe *wq_cqe)
+{
+	const struct ib_ud_wr *ud_wqe = ud_wr(wr);
+	struct loopback_qp *dqp;
+
+	/* NOTE(review): when the destination QP does not exist,
+	 * wq_cqe->wc.status is left as the caller initialised it.
+	 */
+	dqp = get_qp_by_qpn(ld, ud_wqe->remote_qpn);
+	if (IS_ERR(dqp))
+		goto done;
+
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+		post_one_ud_send_wqe(ld, qp, dqp, wr, wq_cqe);
+		break;
+	default:
+		wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+		break;
+	}
+done:
+	wq_cqe->wc.src_qp = qp->ibqp.qp_num;
+	/* put_qp() ignores an ERR_PTR from the failed lookup */
+	put_qp(dqp);
+}
+
+/*
+ * Invalidate the memory region identified by @inv_key.
+ *
+ * On success the LOOPBACK_RESOURCE_STATE_VALID mark is cleared for the MR's
+ * id in the MR table.
+ *
+ * Returns 0 on success, the error from get_mr_for_key() when the key does
+ * not resolve to an MR, or -EINVAL when the MR still has other users.
+ */
+static int invalidate_rkey(struct rdma_loopdev *ld, u32 inv_key)
+{
+	struct loopback_mr *mr;
+	int ret = 0;
+
+	mr = get_mr_for_key(ld, inv_key);
+	if (IS_ERR(mr))
+		return PTR_ERR(mr);
+
+	/* Fail invalidation if there are active users for now, as this is
+	 * extremely rare scenario and not well known use case.
+	 */
+	if (refcount_read(&mr->res.refcount) > 2)
+		ret = -EINVAL;
+	else
+		xa_clear_mark(&ld->mr_tbl.ids, mr->res.id,
+			      LOOPBACK_RESOURCE_STATE_VALID);
+
+	/* Drop the lookup reference on both paths; returning early on the
+	 * busy path would leak it and keep the refcount elevated forever.
+	 */
+	put_mr(mr);
+	return ret;
+}
+
+/*
+ * Handle an IB_WR_LOCAL_INV work request: invalidate the rkey named in the
+ * work request and record the outcome in the send completion
+ * (IB_WC_SUCCESS, or IB_WC_GENERAL_ERR when invalidation fails).
+ */
+static void
+process_one_rc_linv(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		    const struct ib_send_wr *wr,
+		    struct loopback_cqe *wq_cqe)
+{
+	int failed = invalidate_rkey(ld, wr->ex.invalidate_rkey);
+
+	wq_cqe->wc.status = failed ? IB_WC_GENERAL_ERR : IB_WC_SUCCESS;
+}
+
+/*
+ * Execute an RC SEND / SEND_WITH_INV from @qp to @dqp.
+ *
+ * Pops one receive entry from the destination QP, copies the send payload
+ * into it, optionally invalidates the rkey carried by the work request, and
+ * generates the receive completion on the destination side. The send-side
+ * status is written to @wq_cqe:
+ *   - IB_WC_RNR_RETRY_EXC_ERR when the destination has no receive posted
+ *     (no receive completion is generated in that case),
+ *   - IB_WC_REM_INV_REQ_ERR when the payload does not fit, the copy fails,
+ *     or the invalidation fails,
+ *   - IB_WC_SUCCESS otherwise.
+ *
+ * The popped receive entry is always freed before returning.
+ */
+static void
+process_one_rc_send_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp,
+			struct loopback_qp *dqp,
+			const struct ib_send_wr *wr,
+			struct loopback_cqe *wq_cqe)
+{
+	enum ib_wc_status recv_status;
+	struct loopback_rqe *rqe;
+	struct list_head *entry;
+	u32 rqe_wc_flags = 0;
+	u32 inv_key = 0;
+	u32 send_len;
+	u32 recv_len;
+	int ret;
+
+	entry = pop_from_fifo(&dqp->rqes);
+	if (!entry) {
+		/* No rcu_read_unlock() here: this function takes no RCU read
+		 * lock and its other exit path does not unlock either, so
+		 * unlocking on this path alone would be unbalanced.
+		 */
+		wq_cqe->wc.status = IB_WC_RNR_RETRY_EXC_ERR;
+		return;
+	}
+	rqe = container_of(entry, struct loopback_rqe, list);
+	send_len = get_send_wqe_len(wr);
+	recv_len = get_rqe_len(rqe);
+	if (send_len > recv_len) {
+		/* Record the failure in ret so the invalidate step below is
+		 * skipped; ret was previously read uninitialized on this path.
+		 */
+		ret = -EINVAL;
+		recv_len = 0;
+		recv_status = IB_WC_GENERAL_ERR;
+		wq_cqe->wc.status = IB_WC_REM_INV_REQ_ERR;
+	} else {
+		/* copy minimum data of what is sent and rqe size */
+		recv_len = min_t(u32, send_len, recv_len);
+		ret = copy_data_wqe_to_rqe(ld, qp, dqp, wr,
+					   rqe, send_len, 0);
+		if (ret) {
+			recv_status = IB_WC_LOC_LEN_ERR;
+			wq_cqe->wc.status = IB_WC_REM_INV_REQ_ERR;
+		} else {
+			recv_status = IB_WC_SUCCESS;
+			wq_cqe->wc.status = IB_WC_SUCCESS;
+		}
+	}
+	/* Only invalidate once the payload has been delivered successfully */
+	if (!ret && wr->opcode == IB_WR_SEND_WITH_INV) {
+		ret = invalidate_rkey(ld, wr->ex.invalidate_rkey);
+		if (ret) {
+			recv_status = IB_WC_LOC_LEN_ERR;
+			wq_cqe->wc.status = IB_WC_REM_INV_REQ_ERR;
+		} else {
+			inv_key = wr->ex.invalidate_rkey;
+			rqe_wc_flags = IB_WC_WITH_INVALIDATE;
+			recv_status = IB_WC_SUCCESS;
+			wq_cqe->wc.status = IB_WC_SUCCESS;
+		}
+	}
+
+	generate_rc_rq_cqe(dqp, rqe, recv_len, recv_status,
+			   inv_key, rqe_wc_flags);
+	kfree(rqe);
+}
+
+/*
+ * Execute an RC RDMA READ/WRITE work request via copy_data_wqe_rkey() and
+ * store the result in the send completion: IB_WC_SUCCESS when the copy
+ * succeeds, IB_WC_REM_ACCESS_ERR otherwise.
+ */
+static void
+process_one_rc_rw_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		      struct loopback_qp *dqp,
+		      const struct ib_send_wr *wr,
+		      struct loopback_cqe *wq_cqe)
+{
+	int failed = copy_data_wqe_rkey(ld, qp, dqp, wr);
+
+	wq_cqe->wc.status = failed ? IB_WC_REM_ACCESS_ERR : IB_WC_SUCCESS;
+}
+
+/*
+ * Dispatch one RC work request to the handler for its opcode.
+ *
+ * SEND and SEND_WITH_INV go to the send path, RDMA READ/WRITE to the
+ * read/write path, LOCAL_INV to local invalidation. Any other opcode
+ * completes with IB_WC_GENERAL_ERR.
+ */
+static void
+process_one_rc_wqe(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+		   struct loopback_qp *dqp,
+		   const struct ib_send_wr *wr,
+		   struct loopback_cqe *wq_cqe)
+{
+	const enum ib_wr_opcode op = wr->opcode;
+
+	if (op == IB_WR_SEND || op == IB_WR_SEND_WITH_INV)
+		process_one_rc_send_wqe(ld, sqp, dqp, wr, wq_cqe);
+	else if (op == IB_WR_RDMA_READ || op == IB_WR_RDMA_WRITE)
+		process_one_rc_rw_wqe(ld, sqp, dqp, wr, wq_cqe);
+	else if (op == IB_WR_LOCAL_INV)
+		process_one_rc_linv(ld, sqp, wr, wq_cqe);
+	else
+		wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+}
+
+/*
+ * Execute one RC work request posted on @sqp.
+ *
+ * The peer is found through the destination QP number stored in the
+ * sender's attributes. If it cannot be found the request completes with
+ * IB_WC_RETRY_EXC_ERR; otherwise the opcode dispatcher runs. The
+ * completion's src_qp is always set to the posting QP's number.
+ */
+static void
+post_one_rc_wqe(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+		const struct ib_send_wr *wr, struct loopback_cqe *wq_cqe)
+{
+	struct loopback_qp *dqp = get_qp_by_qpn(ld, sqp->attr.dest_qp_num);
+
+	if (IS_ERR(dqp))
+		wq_cqe->wc.status = IB_WC_RETRY_EXC_ERR;
+	else
+		process_one_rc_wqe(ld, sqp, dqp, wr, wq_cqe);
+
+	wq_cqe->wc.src_qp = sqp->ibqp.qp_num;
+	/* put_qp() tolerates the ERR_PTR from a failed lookup */
+	put_qp(dqp);
+}
+
+/*
+ * Execute a single send work request on @qp and produce its completion.
+ *
+ * A reference on the QP is held for the duration of the call. A zeroed
+ * completion entry is allocated up front; if the QP is not in RTS the
+ * request is flushed (IB_WC_WR_FLUSH_ERR), otherwise it is handed to the UD
+ * path for GSI QPs or the RC path for RC QPs. Other QP types leave the
+ * zeroed status untouched.
+ *
+ * The completion is queued on the send CQ when the work request is
+ * signaled, the QP signals all work requests, or the status is not
+ * IB_WC_SUCCESS; otherwise it is discarded.
+ *
+ * Returns 0 on success, -EINVAL when no reference on the QP could be taken,
+ * or -ENOMEM when the completion entry cannot be allocated.
+ */
+static int post_one_send(struct loopback_qp *qp, const struct ib_send_wr *wr)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(qp->ibqp.device);
+	struct loopback_cq *send_cq = ib_to_loop_cq(qp->ibqp.send_cq);
+	struct loopback_qp *held;
+	struct loopback_cqe *cqe;
+	bool report;
+
+	held = get_qp(qp);
+	if (IS_ERR(held))
+		return -EINVAL;
+
+	cqe = kzalloc(sizeof(*cqe), qp->cqe_alloc_flags);
+	if (!cqe) {
+		put_qp(held);
+		return -ENOMEM;
+	}
+
+	if (qp->state != IB_QPS_RTS)
+		cqe->wc.status = IB_WC_WR_FLUSH_ERR;
+	else if (qp->ibqp.qp_type == IB_QPT_GSI)
+		post_one_ud_wqe(ld, qp, wr, cqe);
+	else if (qp->ibqp.qp_type == IB_QPT_RC)
+		post_one_rc_wqe(ld, qp, wr, cqe);
+
+	cqe->wc.opcode = sq_opcode_to_wc_opcode(wr->opcode);
+	cqe->wc.wr_cqe = wr->wr_cqe;
+	cqe->wc.qp = &held->ibqp;
+	cqe->wc.port_num = 1;
+
+	/* Failed requests always generate a completion, signaled or not */
+	report = (wr->send_flags & IB_SEND_SIGNALED) ||
+		 qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR ||
+		 cqe->wc.status != IB_WC_SUCCESS;
+	if (report) {
+		push_to_fifo(&send_cq->cqes, &cqe->list);
+		attempt_notify_cq(send_cq);
+	} else {
+		kfree(cqe);
+	}
+
+	put_qp(held);
+	return 0;
+}
+
+/*
+ * post_send verb: execute each work request of the @wr chain in order.
+ *
+ * Stops at the first request that fails, stores it in @bad_wr and returns
+ * its error code. Returns 0 when the whole chain was processed.
+ */
+static int loopback_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+			      const struct ib_send_wr **bad_wr)
+{
+	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+	const struct ib_send_wr *cur;
+	int err;
+
+	for (cur = wr; cur; cur = cur->next) {
+		err = post_one_send(qp, cur);
+		if (unlikely(err)) {
+			*bad_wr = cur;
+			return err;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Queue a single receive work request on @qp.
+ *
+ * The work request and its scatter/gather entries are copied into a newly
+ * allocated receive entry. When the QP is in INIT, RTR or RTS the entry is
+ * appended to the QP's receive FIFO; in any other state a flush completion
+ * (IB_WC_WR_FLUSH_ERR) is generated instead and the entry is freed.
+ *
+ * Returns 0 in both of those cases, -EINVAL when no reference on the QP
+ * could be taken, or -ENOMEM when the receive entry cannot be allocated.
+ */
+static int post_one_recv(struct loopback_qp *qp, const struct ib_recv_wr *wr)
+{
+	struct loopback_qp *held;
+	struct loopback_rqe *rqe;
+
+	held = get_qp(qp);
+	if (IS_ERR(held))
+		return -EINVAL;
+
+	rqe = kzalloc(struct_size(rqe, sges, wr->num_sge), qp->rqe_alloc_flags);
+	if (!rqe) {
+		put_qp(held);
+		return -ENOMEM;
+	}
+
+	rqe->wr = *wr;
+	memcpy(&rqe->sges[0], wr->sg_list, wr->num_sge * sizeof(rqe->sges[0]));
+
+	switch (qp->state) {
+	case IB_QPS_INIT:
+	case IB_QPS_RTR:
+	case IB_QPS_RTS:
+		push_to_fifo(&qp->rqes, &rqe->list);
+		break;
+	default:
+		/* QP cannot accept receives: flush the request right away */
+		generate_rc_rq_cqe(qp, rqe, 0, IB_WC_WR_FLUSH_ERR, 0, 0);
+		kfree(rqe);
+		break;
+	}
+
+	put_qp(held);
+	return 0;
+}
+
+/*
+ * post_recv verb: queue each work request of the @wr chain in order.
+ *
+ * Stops at the first request that fails, stores it in @bad_wr and returns
+ * its error code. Returns 0 when the whole chain was queued.
+ */
+static int loopback_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+			      const struct ib_recv_wr **bad_wr)
+{
+	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+	const struct ib_recv_wr *cur;
+	int err;
+
+	for (cur = wr; cur; cur = cur->next) {
+		err = post_one_recv(qp, cur);
+		if (unlikely(err)) {
+			*bad_wr = cur;
+			return err;
+		}
+	}
+	return 0;
+}
+
+/*
+ * create_ah verb: allocate an address handle, store a copy of @attr in it
+ * and attach it to the device's AH table.
+ *
+ * Returns the embedded ib_ah on success, ERR_PTR(-ENOMEM) when allocation
+ * fails, or an ERR_PTR carrying the error from attach_table_id().
+ */
+static struct ib_ah *
+loopback_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
+		   u32 flags, struct ib_udata *udata)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device);
+	struct loopback_ah *ah = kzalloc(sizeof(*ah), GFP_KERNEL);
+	int err;
+
+	if (!ah)
+		return ERR_PTR(-ENOMEM);
+
+	ah->attr = *attr;
+	err = attach_table_id(&ld->ah_tbl, &ah->res);
+	if (!err)
+		return &ah->ibah;
+
+	kfree(ah);
+	return ERR_PTR(err);
+}
+
+/*
+ * query_ah verb: report the attributes stored at creation time, with the
+ * type field taken from the ib_ah itself. Always returns 0.
+ */
+static int loopback_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
+{
+	memset(attr, 0, sizeof(*attr));
+	*attr = ib_to_loop_ah(ibah)->attr;
+	attr->type = ibah->type;
+
+	return 0;
+}
+
+/*
+ * destroy_ah verb: detach the address handle from the device's AH table
+ * and free it. Always returns 0.
+ */
+static int loopback_destroy_ah(struct ib_ah *ibah, u32 flags)
+{
+	struct loopback_ah *ah = ib_to_loop_ah(ibah);
+
+	detach_table_id(&ib_to_loopdev(ibah->device)->ah_tbl, &ah->res);
+	kfree(ah);
+
+	return 0;
+}
+
+/*
+ * Verbs entry points implemented by the loopback device, installed on the
+ * ib_device by init_rdma_loopdev() through ib_set_device_ops(). The
+ * INIT_RDMA_OBJ_SIZE() entries name the driver structures that embed the
+ * ib_pd and ib_ucontext objects.
+ */
+static const struct ib_device_ops rdma_loopdev_ops = {
+	.alloc_pd = loopback_alloc_pd,
+	.alloc_ucontext = loopback_alloc_ucontext,
+	.create_ah = loopback_create_ah,
+	.create_cq = loopback_create_cq,
+	.create_qp = loopback_create_qp,
+	.dealloc_pd = loopback_dealloc_pd,
+	.dealloc_ucontext = loopback_dealloc_ucontext,
+	.dereg_mr = loopback_dereg_mr,
+	.destroy_ah = loopback_destroy_ah,
+	.destroy_cq = loopback_destroy_cq,
+	.destroy_qp = loopback_destroy_qp,
+	.get_dma_mr = loopback_get_dma_mr,
+	.get_link_layer = loopback_get_link_layer,
+	.get_netdev = loopback_get_netdev,
+	.get_port_immutable = loopback_port_immutable,
+	.modify_qp = loopback_modify_qp,
+	.poll_cq = loopback_poll_cq,
+	.post_recv = loopback_post_recv,
+	.post_send = loopback_post_send,
+	.query_ah = loopback_query_ah,
+	.query_device = loopback_query_device,
+	.query_pkey = loopback_query_pkey,
+	.query_port = loopback_query_port,
+	.query_qp = loopback_query_qp,
+	.reg_user_mr = loopback_reg_user_mr,
+	.req_notify_cq = loopback_req_notify_cq,
+	INIT_RDMA_OBJ_SIZE(ib_pd, loopback_pd, ibpd),
+	INIT_RDMA_OBJ_SIZE(ib_ucontext, loopback_uctx, ibuctx),
+};
+
+/*
+ * Fill in the ib_device embedded in @ld before registration: node
+ * description and GUID, a single physical port parented to the 'lo'
+ * netdevice, virtual DMA ops, the pkey/GID table sizes, the uverbs command
+ * mask and the verbs ops table. Finishes by initializing the driver's
+ * tables through init_loopdev_tables().
+ */
+static void init_rdma_loopdev(struct rdma_loopdev *ld)
+{
+	struct ib_device *dev = &ld->dev;
+
+	strlcpy(dev->node_desc, "lo", sizeof(dev->node_desc));
+
+	dev->owner = THIS_MODULE;
+	dev->node_type = RDMA_NODE_IB_CA;
+	dev->phys_port_cnt = 1;
+	dev->num_comp_vectors = num_possible_cpus();
+	dev->dev.parent = &lo->dev;
+	dev->local_dma_lkey = 0;
+	dev->dev.dma_ops = &dma_virt_ops;
+	dev->node_guid = 0x7f0001;
+	dma_coerce_mask_and_coherent(&dev->dev,
+				     dma_get_required_mask(&dev->dev));
+
+	ld->port_attr.pkey_tbl_len = 1;
+	/* default, 127.0.0.1 and ::1 */
+	ld->port_attr.gid_tbl_len = 3;
+
+	dev->uverbs_abi_ver = 2;
+	/* user verbs commands advertised for this device */
+	dev->uverbs_cmd_mask = BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT)
+	    | BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)
+	    | BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE)
+	    | BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT)
+	    | BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD)
+	    | BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD)
+	    | BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP)
+	    | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP)
+	    | BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP)
+	    | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP)
+	    | BIT_ULL(IB_USER_VERBS_CMD_POST_SEND)
+	    | BIT_ULL(IB_USER_VERBS_CMD_POST_RECV)
+	    | BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ)
+	    | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ)
+	    | BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ)
+	    | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ)
+	    | BIT_ULL(IB_USER_VERBS_CMD_REG_MR)
+	    | BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR)
+	    | BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH)
+	    | BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH)
+	    | BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH)
+	    | BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH)
+	    ;
+
+	ib_set_device_ops(dev, &rdma_loopdev_ops);
+
+	dev->driver_id = RDMA_DRIVER_LOOPBACK;
+
+	init_loopdev_tables(ld);
+}
+
+/* Remove the device's built-in zero-lkey MR from the MR table. */
+static void cleanup_zero_lkey_mr(struct rdma_loopdev *ld)
+{
+	detach_table_id(&ld->mr_tbl, &ld->zero_mr.res);
+}
+
+/*
+ * Set up the device's built-in MR and attach it to the MR table under id 0.
+ * The MR is of type LOOPBACK_MR_TYPE_PHY, has length ULONG_MAX and grants
+ * local write access.
+ *
+ * Returns the result of attach_table_id_for_id().
+ */
+static int init_zero_lkey_mr(struct rdma_loopdev *ld)
+{
+	int ret;
+
+	ld->zero_mr.access = IB_ACCESS_LOCAL_WRITE;
+	ld->zero_mr.ibmr.length = ULONG_MAX;
+	ld->zero_mr.type = LOOPBACK_MR_TYPE_PHY;
+
+	ret = attach_table_id_for_id(&ld->mr_tbl, &ld->zero_mr.res, 0);
+	return ret;
+}
+
+/*
+ * Module entry point.
+ *
+ * Takes a reference on the "lo" netdevice of init_net, allocates and
+ * initializes the loopback ib_device, installs the zero-lkey MR and
+ * registers the device under the name "lo". Each failure unwinds the steps
+ * completed so far.
+ *
+ * Returns 0 on success, -ENODEV when "lo" is not found, -ENOMEM when the
+ * device cannot be allocated, or the error from MR setup or registration.
+ */
+static int loopback_init(void)
+{
+	int err;
+
+	lo = dev_get_by_name(&init_net, "lo");
+	if (!lo)
+		return -ENODEV;
+
+	loopdev = ib_alloc_device(rdma_loopdev, dev);
+	if (!loopdev) {
+		err = -ENOMEM;
+		goto put_netdev;
+	}
+	init_rdma_loopdev(loopdev);
+
+	err = init_zero_lkey_mr(loopdev);
+	if (err)
+		goto free_ibdev;
+
+	err = ib_register_device(&loopdev->dev, "lo");
+	if (!err)
+		return 0;
+
+	/* registration failed: undo in reverse order */
+	cleanup_zero_lkey_mr(loopdev);
+free_ibdev:
+	ib_dealloc_device(&loopdev->dev);
+put_netdev:
+	dev_put(lo);
+	return err;
+}
+
+/*
+ * Module exit point: undo loopback_init() in reverse order - unregister the
+ * ib_device, remove the zero-lkey MR, free the device and drop the
+ * reference on the 'lo' netdevice.
+ */
+static void loopback_cleanup(void)
+{
+	ib_unregister_device(&loopdev->dev);
+	cleanup_zero_lkey_mr(loopdev);
+	ib_dealloc_device(&loopdev->dev);
+	dev_put(lo);
+}
+
+/* Register loopback_init()/loopback_cleanup() as module entry/exit points. */
+module_init(loopback_init);
+module_exit(loopback_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h
index 06c34d9..f9756a2 100644
--- a/include/uapi/rdma/rdma_user_ioctl_cmds.h
+++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h
@@ -102,6 +102,7 @@ enum rdma_driver_id {
 	RDMA_DRIVER_RXE,
 	RDMA_DRIVER_HFI1,
 	RDMA_DRIVER_QIB,
+	RDMA_DRIVER_LOOPBACK,
 };
 
 #endif
-- 
1.8.3.1




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux