Re: [PATCH rdma-next 11/13] RDMA/efa: Add EFA verbs implementation

On Tue, Dec 04, 2018 at 02:04:27PM +0200, Gal Pressman wrote:
> Add a file that implements the EFA verbs.
> 
> Signed-off-by: Gal Pressman <galpress@xxxxxxxxxx>
>  drivers/infiniband/hw/efa/efa_verbs.c | 1827 +++++++++++++++++++++++++++++++++
>  1 file changed, 1827 insertions(+)
>  create mode 100644 drivers/infiniband/hw/efa/efa_verbs.c
> 
> diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
> new file mode 100644
> index 000000000000..ec887648060e
> +++ b/drivers/infiniband/hw/efa/efa_verbs.c
> @@ -0,0 +1,1827 @@
> +// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
> +/*
> + * Copyright 2018 Amazon.com, Inc. or its affiliates.
> + */
> +
> +#include <linux/vmalloc.h>
> +
> +#include <rdma/efa-abi.h>
> +#include <rdma/ib_addr.h>
> +#include <rdma/ib_umem.h>
> +#include <rdma/ib_user_verbs.h>
> +#include <rdma/ib_verbs.h>
> +
> +#include "efa.h"
> +
> +#define EFA_MMAP_DB_BAR_MEMORY_FLAG     BIT(61)
> +#define EFA_MMAP_REG_BAR_MEMORY_FLAG    BIT(62)
> +#define EFA_MMAP_MEM_BAR_MEMORY_FLAG    BIT(63)
> +#define EFA_MMAP_BARS_MEMORY_MASK       \
> +	(EFA_MMAP_REG_BAR_MEMORY_FLAG | EFA_MMAP_MEM_BAR_MEMORY_FLAG | \
> +	 EFA_MMAP_DB_BAR_MEMORY_FLAG)
> +
> +struct efa_ucontext {
> +	struct ib_ucontext      ibucontext;
> +	/* Protects ucontext state */
> +	struct mutex            lock;
> +	struct list_head        link;
> +	struct list_head        pending_mmaps;
> +	u64                     mmap_key;
> +};
> +
> +#define EFA_AENQ_ENABLED_GROUPS \
> +	(BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
> +	 BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
> +
> +struct efa_pd {
> +	struct ib_pd    ibpd;
> +	u32             pdn;
> +};
> +
> +struct efa_mr {
> +	struct ib_mr     ibmr;
> +	struct ib_umem  *umem;
> +	u64 vaddr;
> +};
> +
> +struct efa_cq {
> +	struct ib_cq               ibcq;
> +	struct efa_ucontext       *ucontext;
> +	u16                        cq_idx;
> +	dma_addr_t                 dma_addr;
> +	void                      *cpu_addr;
> +	size_t                     size;
> +};
> +
> +struct efa_qp {
> +	struct ib_qp            ibqp;
> +	enum ib_qp_state        state;
> +	u32                     qp_handle;
> +	dma_addr_t              rq_dma_addr;
> +	void                   *rq_cpu_addr;
> +	size_t                  rq_size;
> +};
> +
> +struct efa_ah {
> +	struct ib_ah    ibah;
> +	/* dest_addr */
> +	u8              id[EFA_GID_SIZE];
> +};
> +
> +struct efa_ah_id {
> +	struct list_head list;
> +	/* dest_addr */
> +	u8 id[EFA_GID_SIZE];
> +	u16 address_handle;
> +	unsigned int  ref_count;
> +};
> +
> +struct efa_mmap_entry {
> +	struct list_head list;
> +	void  *obj;
> +	u64 address;
> +	u64 length;
> +	u64 key;
> +};
> +
> +static void mmap_entry_insert(struct efa_ucontext *ucontext,
> +			      struct efa_mmap_entry *entry,
> +			      u64 mem_flag);
> +
> +static void mmap_obj_entries_remove(struct efa_ucontext *ucontext,
> +				    void *obj);
> +
> +#define EFA_PAGE_SHIFT       12
> +#define EFA_PAGE_SIZE        BIT(EFA_PAGE_SHIFT)
> +#define EFA_PAGE_PTR_SIZE    8
> +
> +#define EFA_CHUNK_ALLOC_SIZE BIT(EFA_PAGE_SHIFT)
> +#define EFA_CHUNK_PTR_SIZE   sizeof(struct efa_com_ctrl_buff_info)
> +
> +#define EFA_PAGE_PTRS_PER_CHUNK  \
> +	((EFA_CHUNK_ALLOC_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_PAGE_PTR_SIZE)
> +
> +#define EFA_CHUNK_USED_SIZE  \
> +	((EFA_PAGE_PTRS_PER_CHUNK * EFA_PAGE_PTR_SIZE) + EFA_CHUNK_PTR_SIZE)
> +
> +#define EFA_SUPPORTED_ACCESS_FLAGS IB_ACCESS_LOCAL_WRITE
> +
> +struct pbl_chunk {
> +	u64 *buf;
> +	u32 length;
> +	dma_addr_t dma_addr;
> +};
> +
> +struct pbl_chunk_list {
> +	unsigned int size;
> +	struct pbl_chunk *chunks;
> +};
> +
> +struct pbl_context {
> +	u64 *pbl_buf;
> +	u32  pbl_buf_size_in_bytes;
> +	bool physically_continuous;
> +	union {
> +		struct {
> +			dma_addr_t dma_addr;
> +		} continuous;
> +		struct {
> +			u32 pbl_buf_size_in_pages;
> +			struct scatterlist *sgl;
> +			int sg_dma_cnt;
> +			struct pbl_chunk_list chunk_list;
> +		} indirect;
> +	} phys;
> +
> +	struct efa_dev *dev;
> +	struct device *dmadev;
> +};
> +
> +static inline struct efa_dev *to_edev(struct ib_device *ibdev)
> +{
> +	return container_of(ibdev, struct efa_dev, ibdev);
> +}
> +
> +static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext)
> +{
> +	return container_of(ibucontext, struct efa_ucontext, ibucontext);
> +}
> +
> +static inline struct efa_pd *to_epd(struct ib_pd *ibpd)
> +{
> +	return container_of(ibpd, struct efa_pd, ibpd);
> +}
> +
> +static inline struct efa_mr *to_emr(struct ib_mr *ibmr)
> +{
> +	return container_of(ibmr, struct efa_mr, ibmr);
> +}
> +
> +static inline struct efa_qp *to_eqp(struct ib_qp *ibqp)
> +{
> +	return container_of(ibqp, struct efa_qp, ibqp);
> +}
> +
> +static inline struct efa_cq *to_ecq(struct ib_cq *ibcq)
> +{
> +	return container_of(ibcq, struct efa_cq, ibcq);
> +}
> +
> +static inline struct efa_ah *to_eah(struct ib_ah *ibah)
> +{
> +	return container_of(ibah, struct efa_ah, ibah);
> +}
> +
> +#define field_avail(x, fld, sz) (offsetof(typeof(x), fld) + \
> +				 sizeof(((typeof(x) *)0)->fld) <= (sz))
> +
> +#define EFA_IS_RESERVED_CLEARED(reserved) \
> +	!memchr_inv(reserved, 0, sizeof(reserved))
> +
> +int efa_query_device(struct ib_device *ibdev,
> +		     struct ib_device_attr *props,
> +		     struct ib_udata *udata)
> +{
> +	struct efa_ibv_ex_query_device_resp resp = {};
> +	struct efa_com_get_device_attr_result result;
> +	struct efa_dev *dev = to_edev(ibdev);
> +	int err;
> +
> +	pr_debug("--->\n");
> +	memset(props, 0, sizeof(*props));
> +
> +	if (udata && udata->inlen &&
> +	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
> +		pr_err_ratelimited("Incompatible ABI params, udata not cleared\n");
> +		return -EINVAL;
> +	}
> +
> +	err = efa_get_device_attributes(dev, &result);
> +	if (err) {
> +		pr_err("failed to get device_attr err[%d]!\n", err);
> +		return err;
> +	}
> +
> +	props->max_mr_size              = result.max_mr_pages * PAGE_SIZE;
> +	props->page_size_cap            = result.page_size_cap;
> +	props->vendor_id                = result.vendor_id;
> +	props->vendor_part_id           = result.vendor_part_id;
> +	props->hw_ver                   = dev->pdev->subsystem_device;
> +	props->max_qp                   = result.max_sq;
> +	props->device_cap_flags         = IB_DEVICE_PORT_ACTIVE_EVENT |
> +					  IB_DEVICE_VIRTUAL_FUNCTION |
> +					  IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
> +	props->max_cq                   = result.max_cq;
> +	props->max_pd                   = result.max_pd;
> +	props->max_mr                   = result.max_mr;
> +	props->max_ah                   = result.max_ah;
> +	props->max_cqe                  = result.max_cq_depth;
> +	props->max_qp_wr                = min_t(u16, result.max_sq_depth,
> +						result.max_rq_depth);
> +	props->max_send_sge             = result.max_sq_sge;
> +	props->max_recv_sge             = result.max_rq_sge;
> +
> +	if (udata && udata->outlen) {
> +		resp.sub_cqs_per_cq = result.sub_cqs_per_cq;
> +		resp.max_sq_sge = result.max_sq_sge;
> +		resp.max_rq_sge = result.max_rq_sge;
> +		resp.max_sq_wr  = result.max_sq_depth;
> +		resp.max_rq_wr  = result.max_rq_depth;
> +		resp.max_inline_data = result.inline_buf_size;
> +
> +		err = ib_copy_to_udata(udata, &resp,
> +				       min(sizeof(resp), udata->outlen));
> +		if (err) {
> +			pr_err_ratelimited("failed to copy udata for query_device.\n");
> +			return err;
> +		}
> +	}
> +
> +	return err;
> +}
> +
> +int efa_query_port(struct ib_device *ibdev, u8 port,
> +		   struct ib_port_attr *props)
> +{
> +	struct efa_dev *dev = to_edev(ibdev);
> +
> +	pr_debug("--->\n");
> +
> +	mutex_lock(&dev->efa_dev_lock);
> +	memset(props, 0, sizeof(*props));
> +
> +	props->lid = 0;
> +	props->lmc = 1;
> +	props->sm_lid = 0;
> +	props->sm_sl = 0;
> +
> +	props->state = IB_PORT_ACTIVE;
> +	props->phys_state = 5;
> +	props->port_cap_flags = 0;
> +	props->gid_tbl_len = 1;
> +	props->pkey_tbl_len = 1;
> +	props->bad_pkey_cntr = 0;
> +	props->qkey_viol_cntr = 0;
> +	props->active_speed = IB_SPEED_EDR;
> +	props->active_width = IB_WIDTH_4X;
> +	props->max_mtu = ib_mtu_int_to_enum(dev->mtu);
> +	props->active_mtu = ib_mtu_int_to_enum(dev->mtu);
> +	props->max_msg_sz = dev->mtu;
> +	props->max_vl_num = 1;
> +	mutex_unlock(&dev->efa_dev_lock);
> +	return 0;
> +}
> +
> +int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
> +		 int qp_attr_mask,
> +		 struct ib_qp_init_attr *qp_init_attr)
> +{
> +	struct efa_qp *qp = to_eqp(ibqp);
> +
> +	pr_debug("--->\n");
> +
> +	memset(qp_attr, 0, sizeof(*qp_attr));
> +	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
> +
> +	qp_attr->qp_state = qp->state;
> +	qp_attr->cur_qp_state = qp->state;
> +	qp_attr->port_num = 1;
> +
> +	qp_init_attr->qp_type = ibqp->qp_type;
> +	qp_init_attr->recv_cq = ibqp->recv_cq;
> +	qp_init_attr->send_cq = ibqp->send_cq;
> +
> +	return 0;
> +}
> +
> +int efa_query_gid(struct ib_device *ibdev, u8 port, int index,
> +		  union ib_gid *gid)
> +{
> +	struct efa_dev *dev = to_edev(ibdev);
> +
> +	pr_debug("port %d gid index %d\n", port, index);
> +
> +	if (index > 1)
> +		return -EINVAL;
> +
> +	mutex_lock(&dev->efa_dev_lock);
> +	memcpy(gid->raw, dev->addr, sizeof(dev->addr));
> +	mutex_unlock(&dev->efa_dev_lock);
> +
> +	return 0;
> +}
> +
> +int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
> +		   u16 *pkey)
> +{
> +	pr_debug("--->\n");
> +	if (index > 1)
> +		return -EINVAL;
> +
> +	*pkey = 0xffff;
> +	return 0;
> +}
> +
> +struct ib_pd *efa_alloc_pd(struct ib_device *ibdev,
> +			   struct ib_ucontext *ibucontext,
> +			   struct ib_udata *udata)
> +{
> +	struct efa_ibv_alloc_pd_resp resp = {};
> +	struct efa_dev *dev = to_edev(ibdev);
> +	struct efa_pd *pd;
> +	int err;
> +
> +	pr_debug("--->\n");
> +
> +	if (!ibucontext) {
> +		pr_err("ibucontext is not valid\n");
> +		return ERR_PTR(-EOPNOTSUPP);
> +	}
> +
> +	if (udata && udata->inlen &&
> +	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
> +		pr_err_ratelimited("Incompatible ABI params, udata not cleared\n");
> +		return ERR_PTR(-EINVAL);
> +	}
> +
> +	pd = kzalloc(sizeof(*pd), GFP_KERNEL);
> +	if (!pd) {
> +		dev->stats.sw_stats.alloc_pd_alloc_err++;
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	pd->pdn = efa_bitmap_alloc(&dev->pd_bitmap);
> +	if (pd->pdn == EFA_BITMAP_INVAL) {
> +		pr_err("Failed to alloc PD (max_pd %u)\n", dev->caps.max_pd);
> +		dev->stats.sw_stats.alloc_pd_bitmap_full_err++;
> +		kfree(pd);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	resp.pdn = pd->pdn;
> +
> +	if (udata && udata->outlen) {
> +		err = ib_copy_to_udata(udata, &resp,
> +				       min(sizeof(resp), udata->outlen));
> +		if (err) {
> +			pr_err_ratelimited("failed to copy udata for alloc_pd\n");
> +			efa_bitmap_free(&dev->pd_bitmap, pd->pdn);
> +			kfree(pd);
> +			return ERR_PTR(err);
> +		}
> +	}
> +
> +	pr_debug("Allocated pd[%d]\n", pd->pdn);
> +
> +	return &pd->ibpd;
> +}
> +
> +int efa_dealloc_pd(struct ib_pd *ibpd)
> +{
> +	struct efa_dev *dev = to_edev(ibpd->device);
> +	struct efa_pd *pd = to_epd(ibpd);
> +
> +	pr_debug("Dealloc pd[%d]\n", pd->pdn);
> +	efa_bitmap_free(&dev->pd_bitmap, pd->pdn);
> +	kfree(pd);
> +
> +	return 0;
> +}
> +
> +int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle)
> +{
> +	struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle };
> +
> +	return efa_com_destroy_qp(dev->edev, &params);
> +}
> +
> +int efa_destroy_qp(struct ib_qp *ibqp)
> +{
> +	struct efa_dev *dev = to_edev(ibqp->pd->device);
> +	struct efa_qp *qp = to_eqp(ibqp);
> +	struct efa_ucontext *ucontext;
> +
> +	pr_debug("Destroy qp[%u]\n", ibqp->qp_num);
> +	ucontext = ibqp->pd->uobject ?
> +			to_eucontext(ibqp->pd->uobject->context) :
> +			NULL;
> +
> +	if (!ucontext)
> +		return -EOPNOTSUPP;
> +
> +	efa_destroy_qp_handle(dev, qp->qp_handle);
> +	mmap_obj_entries_remove(ucontext, qp);
> +
> +	if (qp->rq_cpu_addr) {
> +		pr_debug("qp->cpu_addr[%p] freed: size[%lu], dma[%pad]\n",
> +			 qp->rq_cpu_addr, qp->rq_size,
> +			 &qp->rq_dma_addr);
> +		dma_free_coherent(&dev->pdev->dev, qp->rq_size,
> +				  qp->rq_cpu_addr, qp->rq_dma_addr);
> +	}
> +
> +	kfree(qp);
> +	return 0;
> +}
> +
> +static int qp_mmap_entries_setup(struct efa_qp *qp,
> +				 struct efa_dev *dev,
> +				 struct efa_ucontext *ucontext,
> +				 struct efa_com_create_qp_params *params,
> +				 struct efa_ibv_create_qp_resp *resp)
> +{
> +	struct efa_mmap_entry *rq_db_entry;
> +	struct efa_mmap_entry *sq_db_entry;
> +	struct efa_mmap_entry *rq_entry;
> +	struct efa_mmap_entry *sq_entry;
> +
> +	sq_db_entry = kzalloc(sizeof(*sq_db_entry), GFP_KERNEL);
> +	sq_entry = kzalloc(sizeof(*sq_entry), GFP_KERNEL);
> +	if (!sq_db_entry || !sq_entry) {
> +		dev->stats.sw_stats.mmap_entry_alloc_err++;
> +		goto err_alloc;
> +	}
> +
> +	if (qp->rq_size) {
> +		rq_entry = kzalloc(sizeof(*rq_entry), GFP_KERNEL);
> +		rq_db_entry = kzalloc(sizeof(*rq_db_entry), GFP_KERNEL);
> +		if (!rq_entry || !rq_db_entry) {
> +			dev->stats.sw_stats.mmap_entry_alloc_err++;
> +			goto err_alloc_rq;
> +		}
> +
> +		rq_db_entry->obj = qp;
> +		rq_entry->obj    = qp;
> +
> +		rq_entry->address = virt_to_phys(qp->rq_cpu_addr);

virt_to_phys cannot be called on addresses returned by dma_alloc_coherent:

> +		qp->rq_cpu_addr = dma_zalloc_coherent(&dev->pdev->dev,
> +						      qp->rq_size,
> +						      &qp->rq_dma_addr,
> +						      GFP_KERNEL);

And this whole mmap_entries data structure looks like a big confusing
mess to me. If I understand what it is trying to do, it should just be
trivial usage of an xarray. (And doing the remove during mmap seems
really wrong to me; mmap cookies should exist for as long as the owning
object exists.)
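
Roughly what I'd expect is something like this (completely untested
sketch; mmap_xa and mmap_page would be new fields in efa_ucontext, and
EFA_MMAP_INVALID is a made-up sentinel):

static u64 mmap_entry_insert(struct efa_ucontext *ucontext,
			     struct efa_mmap_entry *entry)
{
	u32 page = ucontext->mmap_page++;	/* serialized by ucontext->lock */

	if (xa_err(xa_store(&ucontext->mmap_xa, page, entry, GFP_KERNEL)))
		return EFA_MMAP_INVALID;

	entry->key = (u64)page << PAGE_SHIFT;	/* offset handed to userspace */
	return entry->key;
}

static struct efa_mmap_entry *mmap_entry_get(struct efa_ucontext *ucontext,
					     u64 key)
{
	/* lookup only; the entry lives until its owning object is destroyed */
	return xa_load(&ucontext->mmap_xa, key >> PAGE_SHIFT);
}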

Also, you can't mmap dma_coherent memory to user space, so this entire
thing needs reworking to use non-coherent memory and proper barriers,
like all the other drivers do.
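
The usual pattern is something like this (untested sketch of what the
RQ allocation could look like; it also makes the virt_to_phys() above
legitimate):

	qp->rq_cpu_addr = alloc_pages_exact(qp->rq_size,
					    GFP_KERNEL | __GFP_ZERO);
	if (!qp->rq_cpu_addr)
		return -ENOMEM;

	qp->rq_dma_addr = dma_map_single(&dev->pdev->dev, qp->rq_cpu_addr,
					 qp->rq_size, DMA_TO_DEVICE);
	if (dma_mapping_error(&dev->pdev->dev, qp->rq_dma_addr)) {
		free_pages_exact(qp->rq_cpu_addr, qp->rq_size);
		return -ENOMEM;
	}

	/* and then dma_wmb()/dma_rmb() around the producer/consumer updates
	 * instead of relying on a coherent allocation to order things
	 */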

Also, everything in __efa_mmap is old-style; it needs to use the new
rdma_user_mmap_io/page() interfaces.
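
For the BAR mappings that would be roughly this (untested sketch with a
made-up helper name, against the rdma_user_mmap_io() signature that
recently went in: ucontext, vma, pfn, size, prot; the queue memory side
would go through rdma_user_mmap_page() similarly):

static int efa_mmap_bar_entry(struct efa_ucontext *ucontext,
			      struct vm_area_struct *vma,
			      struct efa_mmap_entry *entry)
{
	/* the core checks the range and does the io_remap_pfn_range() */
	return rdma_user_mmap_io(&ucontext->ibucontext, vma,
				 entry->address >> PAGE_SHIFT, entry->length,
				 pgprot_noncached(vma->vm_page_prot));
}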

.. and I just wanted to know if CQ was done sensibly :(


Oh, and:

> +int efa_post_send(struct ib_qp *ibqp,
> +		  const struct ib_send_wr *wr,
> +		  const struct ib_send_wr **bad_wr)
> +{
> +	pr_warn("Function not supported\n");
> +	return -EOPNOTSUPP;
> +}

Drivers that don't support something should just set the function
pointer to NULL, not add stubs like this.
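
i.e. wherever the ib_device callbacks get filled in, just leave these
out entirely (sketch; the exact assignment block is whatever the main
file ends up doing):

	/* supported verbs get wired up as usual */
	dev->ibdev.query_device = efa_query_device;

	/* unsupported verbs: no assignment, the pointer stays NULL */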

Jason


