> Subject: Re: [Patch v6 12/12] RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter
>
> On Tue, Sep 20, 2022 at 06:22:32PM -0700, longli@xxxxxxxxxxxxxxxxx wrote:
> > +int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
> > +                      struct ib_udata *udata)
> > +{
> > +        struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq);
> > +        struct ib_device *ibdev = ibcq->device;
> > +        struct mana_ib_create_cq ucmd = {};
> > +        struct mana_ib_dev *mdev;
> > +        int err;
> > +
> > +        mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
>
> Stylistically these container_of's are usually in the definitions section, at
> least pick a form and stick to it consistently.

I will review all the other occurrences for consistency. I'm trying to keep the
"reverse Christmas tree" order for the definitions, which is why the "mdev ="
assignment is on a separate line.

> > +        if (udata->inlen < sizeof(ucmd))
> > +                return -EINVAL;
> > +
> > +        err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
> > +        if (err) {
> > +                ibdev_dbg(ibdev,
> > +                          "Failed to copy from udata for create cq, %d\n", err);
> > +                return -EFAULT;
> > +        }
> > +
> > +        if (attr->cqe > MAX_SEND_BUFFERS_PER_QUEUE) {
> > +                ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe);
> > +                return -EINVAL;
> > +        }
> > +
> > +        cq->cqe = attr->cqe;
> > +        cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE,
> > +                               IB_ACCESS_LOCAL_WRITE);
> > +        if (IS_ERR(cq->umem)) {
> > +                err = PTR_ERR(cq->umem);
> > +                ibdev_dbg(ibdev, "Failed to get umem for create cq, err %d\n",
> > +                          err);
> > +                return err;
> > +        }
> > +
> > +        err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region,
> > +                                           PAGE_SIZE);
> > +        if (err) {
> > +                ibdev_err(ibdev,
> > +                          "Failed to create dma region for create cq, %d\n",
> > +                          err);
>
> Prints on userspace paths are not allowed, this should be dbg. There are
> many other cases like this, please fix them all. This driver may have too
> many dbg prints too, not every failure should have a print :(

Will fix those.

> > +        ibdev_dbg(ibdev,
> > +                  "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n",
> > +                  err, cq->gdma_region);
> > +
> > +        /* The CQ ID is not known at this time
> > +         * The ID is generated at create_qp
> > +         */
>
> Wrong comment style, rdma uses the leading empty blank line for some
> reason
>
> /*
>  * The CQ ID is not known at this time. The ID is generated at create_qp.
>  */

Will fix all the comments.

> > +static void mana_ib_remove(struct auxiliary_device *adev)
> > +{
> > +        struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev);
> > +
> > +        ib_unregister_device(&dev->ib_dev);
> > +        ib_dealloc_device(&dev->ib_dev);
> > +}
> > +
> > +static const struct auxiliary_device_id mana_id_table[] = {
> > +        {
> > +                .name = "mana.rdma",
> > +        },
> > +        {},
> > +};
> > +
> > +MODULE_DEVICE_TABLE(auxiliary, mana_id_table);
> > +
> > +static struct auxiliary_driver mana_driver = {
> > +        .name = "rdma",
> > +        .probe = mana_ib_probe,
> > +        .remove = mana_ib_remove,
> > +        .id_table = mana_id_table,
> > +};
> > +
> > +static int __init mana_ib_init(void)
> > +{
> > +        auxiliary_driver_register(&mana_driver);
> > +
> > +        return 0;
> > +}
> > +
> > +static void __exit mana_ib_cleanup(void)
> > +{
> > +        auxiliary_driver_unregister(&mana_driver);
> > +}
> > +
>
> All this is just module_auxiliary_driver()

Will fix this.
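For reference, the tail of the file would then collapse to something like this
(a sketch, assuming nothing else needs to happen at module init/exit time):

static const struct auxiliary_device_id mana_id_table[] = {
        {
                .name = "mana.rdma",
        },
        {},
};

MODULE_DEVICE_TABLE(auxiliary, mana_id_table);

static struct auxiliary_driver mana_driver = {
        .name = "rdma",
        .probe = mana_ib_probe,
        .remove = mana_ib_remove,
        .id_table = mana_id_table,
};

/* Generates the init/exit functions and propagates the register() error */
module_auxiliary_driver(mana_driver);

This also fixes the current init path, which ignores the return value of
auxiliary_driver_register().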
> > +        mutex_lock(&pd->vport_mutex);
> > +
> > +        pd->vport_use_count++;
> > +        if (pd->vport_use_count > 1) {
> > +                ibdev_dbg(&dev->ib_dev,
> > +                          "Skip as this PD is already configured vport\n");
> > +                mutex_unlock(&pd->vport_mutex);
>
> This leaves vport_use_count elevated.

This is intentional. mana_ib_uncfg_vport() (which decrements vport_use_count)
is called when the QP is destroyed.

> > +                return 0;

> > +int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
> > +{
> > +        struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd);
> > +        struct ib_device *ibdev = ibpd->device;
> > +        struct mana_ib_dev *dev;
> > +
> > +        dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> > +        return mana_ib_gd_destroy_pd(dev, pd->pd_handle);
>
> This is the only place that calls mana_ib_gd_destroy_pd(), don't have a
> spaghetti of functions calling single other functions like this, just inline it.
>
> Also it shouldn't have been non-static; please check that everything
> non-static actually has an out-of-file user.

Will fix those.

> > +void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
> > +{
> > +        struct mana_ib_ucontext *mana_ucontext =
> > +                container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
> > +        struct ib_device *ibdev = ibcontext->device;
> > +        struct mana_ib_dev *mdev;
> > +        struct gdma_context *gc;
> > +        int ret;
> > +
> > +        mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> > +        gc = mdev->gdma_dev->gdma_context;
> > +
> > +        ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext->doorbell);
> > +        if (ret)
> > +                ibdev_err(ibdev, "Failed to destroy doorbell page %d\n", ret);
>
> This is already printing on error
>
> And again, why is this driver split up so strangely?
> mana_gd_destroy_doorbell_page() is an RDMA function, it is only called by
> the RDMA code, why is it located under drivers/net/ethernet?
>
> I do not want an RDMA driver in drivers/net/ethernet.

Will move this function (and mana_gd_allocate_doorbell_page) to the RDMA driver.

> > +}
> > +
> > +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
> > +                                 mana_handle_t *gdma_region, u64 page_sz)
> > +{
> > +        size_t num_pages_total = ib_umem_num_dma_blocks(umem, page_sz);
> > +        struct gdma_dma_region_add_pages_req *add_req = NULL;
> > +        struct gdma_create_dma_region_resp create_resp = {};
> > +        struct gdma_create_dma_region_req *create_req;
> > +        size_t num_pages_cur, num_pages_to_handle;
> > +        unsigned int create_req_msg_size;
> > +        struct hw_channel_context *hwc;
> > +        struct ib_block_iter biter;
> > +        size_t max_pgs_create_cmd;
> > +        struct gdma_context *gc;
> > +        struct gdma_dev *mdev;
> > +        unsigned int i;
> > +        int err;
> > +
> > +        mdev = dev->gdma_dev;
> > +        gc = mdev->gdma_context;
> > +        hwc = gc->hwc.driver_data;
> > +        max_pgs_create_cmd =
> > +                (hwc->max_req_msg_size - sizeof(*create_req)) / sizeof(u64);
> > +
> > +        num_pages_to_handle =
> > +                min_t(size_t, num_pages_total, max_pgs_create_cmd);
> > +        create_req_msg_size =
> > +                struct_size(create_req, page_addr_list, num_pages_to_handle);
> > +
> > +        create_req = kzalloc(create_req_msg_size, GFP_KERNEL);
> > +        if (!create_req)
>
> Is this a multi-order allocation, I can't tell how big max_req_msg_size is?
>
> This design seems to repeat the mistakes we made in mlx5, the low levels of
> the driver already have memory allocated - why not get a pointer to that
> memory here and directly fill the message buffer instead of all this
> allocation and memory copying?

max_req_msg_size is 4k bytes (one page). I think this allocation is still
necessary, as we don't have a buffer for this request. But we should be able to
reuse this buffer for the subsequent commands (details below).
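Concretely, I am thinking of the fill-and-flush structure you describe, reusing
the one request buffer. Just a sketch: mana_ib_gd_add_pages() and
max_pgs_per_cmd are placeholder names, and biter/page_sz/err come from the
surrounding function; the helper would issue the create-dma-region command for
the first batch and the add-pages command for the rest.

        unsigned int tail = 0;
        u64 *page_list = create_req->page_addr_list;

        rdma_umem_for_each_dma_block(umem, &biter, page_sz) {
                page_list[tail++] = rdma_block_iter_dma_address(&biter);

                /* Request buffer is full: send what is queued and reuse it */
                if (tail == max_pgs_per_cmd) {
                        err = mana_ib_gd_add_pages(dev, create_req, tail);
                        if (err)
                                goto out;
                        tail = 0;
                }
        }

        /* Flush the final partial batch */
        if (tail)
                err = mana_ib_gd_add_pages(dev, create_req, tail);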
> > +        ibdev_dbg(&dev->ib_dev,
> > +                  "size_dma_region %lu num_pages_total %lu, "
> > +                  "page_sz 0x%llx offset_in_page %u\n",
> > +                  umem->length, num_pages_total, page_sz,
> > +                  create_req->offset_in_page);
> > +
> > +        ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u",
> > +                  num_pages_to_handle, create_req->gdma_page_type);
> > +
> > +        __rdma_umem_block_iter_start(&biter, umem, page_sz);
> > +
> > +        for (i = 0; i < num_pages_to_handle; ++i) {
> > +                dma_addr_t cur_addr;
> > +
> > +                __rdma_block_iter_next(&biter);
> > +                cur_addr = rdma_block_iter_dma_address(&biter);
> > +
> > +                create_req->page_addr_list[i] = cur_addr;
> > +
> > +                ibdev_dbg(&dev->ib_dev, "page num %u cur_addr 0x%llx\n", i,
> > +                          cur_addr);
>
> Please get rid of the worthless debugging code, actually test your driver
> with DEBUG enabled and ensure it is usable. Printing thousands of lines of
> garbage is not OK. Especially when what should have been a 1 line loop body
> is expanded into a big block just to have a worthless debug statement.

Will remove the debugging code.

> > +        if (num_pages_cur < num_pages_total) {
> > +                unsigned int add_req_msg_size;
> > +                size_t max_pgs_add_cmd =
> > +                        (hwc->max_req_msg_size - sizeof(*add_req)) /
> > +                        sizeof(u64);
> > +
> > +                num_pages_to_handle =
> > +                        min_t(size_t, num_pages_total - num_pages_cur,
> > +                              max_pgs_add_cmd);
> > +
> > +                /* Calculate the max num of pages that will be handled */
> > +                add_req_msg_size = struct_size(add_req, page_addr_list,
> > +                                               num_pages_to_handle);
> > +
> > +                add_req = kmalloc(add_req_msg_size, GFP_KERNEL);
>
> And allocating every loop iteration seems like overkill, why not just reuse
> the large buffer that create_req allocated?
>
> Usually the way these loops are structured is to fill the array and then
> check for fullness, trigger an action to drain the array, and reset the
> indexes back to the start.

I will reuse the command buffer allocated earlier, as you suggested.

> > +int mana_ib_gd_create_pd(struct mana_ib_dev *dev, u64 *pd_handle, u32 *pd_id,
> > +                         enum gdma_pd_flags flags)
> > +{
> > +        struct gdma_dev *mdev = dev->gdma_dev;
> > +        struct gdma_create_pd_resp resp = {};
> > +        struct gdma_create_pd_req req = {};
> > +        struct gdma_context *gc;
> > +        int err;
> > +
> > +        gc = mdev->gdma_context;
> > +
> > +        mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req),
> > +                             sizeof(resp));
> > +
> > +        req.flags = flags;
> > +        err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> > +
> > +        if (err || resp.hdr.status) {
> > +                ibdev_err(&dev->ib_dev,
> > +                          "Failed to get pd_id err %d status %u\n", err,
> > +                          resp.hdr.status);
> > +                if (!err)
> > +                        err = -EPROTO;
>
> This pattern is repeated everywhere, you should fix mana_gd_send_request()
> to return EPROTO.

This pattern is also used all over the ethernet driver. Since this patch series
is for the RDMA driver, can we do the mana_gd_send_request() refactoring in a
separate patch? That would minimize the changes to the ethernet driver in this
series.
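In the meantime the RDMA side could use a small local wrapper so each caller
only checks a single error. This is just a sketch; mana_ib_gd_send_request() is
a hypothetical name, and it assumes every GDMA response embeds struct
gdma_resp_hdr as its first member, as the existing resp.hdr.status checks
suggest:

static int mana_ib_gd_send_request(struct gdma_context *gc, u32 req_len,
                                   const void *req, u32 resp_len, void *resp)
{
        struct gdma_resp_hdr *hdr = resp;
        int err;

        err = mana_gd_send_request(gc, req_len, req, resp_len, resp);
        if (err)
                return err;

        /* The command was delivered but the HW reported a failure */
        return hdr->status ? -EPROTO : 0;
}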
> > +                return err;
> > +        }
> > +
> > +        *pd_handle = resp.pd_handle;
> > +        *pd_id = resp.pd_id;
> > +        ibdev_dbg(&dev->ib_dev, "pd_handle 0x%llx pd_id %d\n", *pd_handle,
> > +                  *pd_id);
> > +
> > +        return 0;
> > +}
> > +
> > +int mana_ib_gd_destroy_pd(struct mana_ib_dev *dev, u64 pd_handle)
> > +{
> > +        struct gdma_dev *mdev = dev->gdma_dev;
> > +        struct gdma_destory_pd_resp resp = {};
> > +        struct gdma_destroy_pd_req req = {};
> > +        struct gdma_context *gc;
> > +        int err;
> > +
> > +        gc = mdev->gdma_context;
>
> Why the local for a variable that is used once?

Will refactor the code.

> > +int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
> > +{
> > +        struct mana_ib_ucontext *mana_ucontext =
> > +                container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
> > +        struct ib_device *ibdev = ibcontext->device;
> > +        struct mana_ib_dev *mdev;
> > +        struct gdma_context *gc;
> > +        phys_addr_t pfn;
> > +        pgprot_t prot;
> > +        int ret;
> > +
> > +        mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> > +        gc = mdev->gdma_dev->gdma_context;
> > +
> > +        if (vma->vm_pgoff != 0) {
> > +                ibdev_err(ibdev, "Unexpected vm_pgoff %lu\n", vma->vm_pgoff);
> > +                return -EINVAL;
>
> More user triggerable printing

> > +int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
> > +{
> > +        struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr);
> > +        struct ib_device *ibdev = ibmr->device;
> > +        struct mana_ib_dev *dev;
> > +        int err;
> > +
> > +        dev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> > +
> > +        err = mana_ib_gd_destroy_mr(dev, mr->mr_handle);
> > +        if (err)
> > +                return err;
>
> mana_ib_gd_destroy_mr() is only ever called here, why is it in main.c?

Will move it and mana_ib_gd_create_mr to mr.c.

> > +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
> > +                                 struct ib_qp_init_attr *attr,
> > +                                 struct ib_udata *udata)
> > +{
> > +        struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
> > +        struct mana_ib_dev *mdev =
> > +                container_of(pd->device, struct mana_ib_dev, ib_dev);
> > +        struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
> > +        struct mana_ib_create_qp_rss_resp resp = {};
> > +        struct mana_ib_create_qp_rss ucmd = {};
> > +        struct gdma_dev *gd = mdev->gdma_dev;
> > +        mana_handle_t *mana_ind_table;
> > +        struct mana_port_context *mpc;
> > +        struct mana_context *mc;
> > +        struct net_device *ndev;
> > +        struct mana_ib_cq *cq;
> > +        struct mana_ib_wq *wq;
> > +        struct ib_cq *ibcq;
> > +        struct ib_wq *ibwq;
> > +        int i = 0, ret;
> > +        u32 port;
> > +
> > +        mc = gd->driver_data;
> > +
> > +        if (udata->inlen < sizeof(ucmd))
> > +                return -EINVAL;
> > +
> > +        ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
> > +        if (ret) {
> > +                ibdev_dbg(&mdev->ib_dev,
> > +                          "Failed copy from udata for create rss-qp, err %d\n",
> > +                          ret);
> > +                return -EFAULT;
> > +        }
> > +
> > +        if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) {
> > +                ibdev_dbg(&mdev->ib_dev,
> > +                          "Requested max_recv_wr %d exceeding limit.\n",
> > +                          attr->cap.max_recv_wr);
> > +                return -EINVAL;
> > +        }
> > +
> > +        if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) {
> > +                ibdev_dbg(&mdev->ib_dev,
> > +                          "Requested max_recv_sge %d exceeding limit.\n",
> > +                          attr->cap.max_recv_sge);
> > +                return -EINVAL;
> > +        }
> > +
> > +        if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) {
> > +                ibdev_dbg(&mdev->ib_dev,
> > +                          "RX Hash function is not supported, %d\n",
> > +                          ucmd.rx_hash_function);
> > +                return -EINVAL;
> > +        }
> > +
> > +        /* IB ports start with 1, MANA start with 0 */
> > +        port = ucmd.port;
> > +        if (port < 1 || port > mc->num_ports) {
> > +                ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n",
> > +                          port);
> > +                return -EINVAL;
> > +        }
> > +        ndev = mc->ports[port - 1];
> > +        mpc = netdev_priv(ndev);
> > +
> > +        ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n",
> > +                  ucmd.rx_hash_function, port);
> > +
> > +        mana_ind_table = kzalloc(sizeof(mana_handle_t) *
> > +                                 (1 << ind_tbl->log_ind_tbl_size),
> > +                                 GFP_KERNEL);
>
> Should be careful about maths overflow on this calculation.

Will add a check.

> > +        ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n",
> > +                  ucmd.sq_buf_addr, ucmd.port);
> > +
> > +        umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size,
> > +                           IB_ACCESS_LOCAL_WRITE);
> > +        if (IS_ERR(umem)) {
> > +                err = PTR_ERR(umem);
> > +                ibdev_dbg(&mdev->ib_dev,
> > +                          "Failed to get umem for create qp-raw, err %d\n",
> > +                          err);
> > +                goto err_free_vport;
> > +        }
> > +        qp->sq_umem = umem;
> > +
> > +        err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem,
> > +                                           &qp->sq_gdma_region, PAGE_SIZE);
> > +        if (err) {
>
> All these cases that process a page list have to call
> ib_umem_find_best_XXX()!
>
> It does important validation against hostile user input, including checking
> the critical iova.
>
> You have missed that the user can request that an unaligned umem be
> created and this creates a starting offset from the first page that must be
> honored in HW, or there will be weird memory corruption. Since it looks like
> this HW doesn't support a starting page offset this should call
> ib_umem_find_best_pgsz() with SZ_4K and a 0 iova. Also never use
> PAGE_SIZE to describe a HW limitation.

Thank you, will fix this.

> There is a lot of repeated code here, it would do well to have a wrapper
> function consolidating this pattern.

Will factor this pattern into a shared helper.
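Roughly along these lines (only a sketch; mana_ib_umem_to_dma_region() is a
hypothetical name, and it assumes 4K is the only page size the HW supports, per
your comment above):

/* Consolidates the umem pin + DMA region creation pattern used by the
 * CQ/QP/WQ/MR paths, validating the user buffer before programming the HW.
 */
static struct ib_umem *mana_ib_umem_to_dma_region(struct mana_ib_dev *mdev,
                                                  u64 addr, u64 size,
                                                  mana_handle_t *gdma_region)
{
        struct ib_umem *umem;
        unsigned long page_sz;
        int err;

        umem = ib_umem_get(&mdev->ib_dev, addr, size, IB_ACCESS_LOCAL_WRITE);
        if (IS_ERR(umem))
                return umem;

        /* Reject unaligned user buffers the HW cannot express */
        page_sz = ib_umem_find_best_pgsz(umem, SZ_4K, 0);
        if (!page_sz) {
                err = -EINVAL;
                goto err_release;
        }

        err = mana_ib_gd_create_dma_region(mdev, umem, gdma_region, page_sz);
        if (err)
                goto err_release;

        return umem;

err_release:
        ib_umem_release(umem);
        return ERR_PTR(err);
}

Thanks,
Long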