On 2022/10/18 3:20, longli@xxxxxxxxxxxxxxxxx wrote: > From: Long Li <longli@xxxxxxxxxxxxx> > > Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA). > > Co-developed-by: Ajay Sharma <sharmaajay@xxxxxxxxxxxxx> > Signed-off-by: Ajay Sharma <sharmaajay@xxxxxxxxxxxxx> > Reviewed-by: Dexuan Cui <decui@xxxxxxxxxxxxx> > Signed-off-by: Long Li <longli@xxxxxxxxxxxxx> > --- > Change log: > v2: > Changed coding sytles/formats > Checked undersize for udata length > Changed all logging to use ibdev_xxx() > Avoided page array copy when doing MR > Sorted driver ops > Fixed warnings reported by kernel test robot <lkp@xxxxxxxxx> > > v3: > More coding sytle/format changes > > v4: > Process error on hardware vport configuration > > v5: > Change licenses to GPL-2.0-only > Fix error handling in mana_ib_gd_create_dma_region() > > v6: > rebased to rdma-next > removed redundant initialization to return value in mana_ib_probe() > added missing tabs at the end of mana_ib_gd_create_dma_region() > > v7: > move mana_gd_destroy_doorbell_page() and mana_gd_allocate_doorbell_page() from GDMA to this driver > use ib_umem_find_best_pgsz() for finding page size for registering dma regions with hardware > fix a bug that may double free mana_ind_table in mana_ib_create_qp_rss() > add Ajay Sharma <sharmaajay@xxxxxxxxxxxxx> to maintainer list > add details to description in drivers/infiniband/hw/mana/Kconfig > change multiple lines comments to use RDMA style from NETDEV style > change mana_ib_dev_ops to static > use module_auxiliary_driver() in place of module_init and module_exit > move all user-triggerable error messages to debug messages > check for ind_tbl_size overflow in mana_ib_create_qp_rss() > > MAINTAINERS | 9 + > drivers/infiniband/Kconfig | 1 + > drivers/infiniband/hw/Makefile | 1 + > drivers/infiniband/hw/mana/Kconfig | 10 + > drivers/infiniband/hw/mana/Makefile | 4 + > drivers/infiniband/hw/mana/cq.c | 79 ++++ > drivers/infiniband/hw/mana/device.c | 117 ++++++ > drivers/infiniband/hw/mana/main.c | 508 ++++++++++++++++++++++++ > drivers/infiniband/hw/mana/mana_ib.h | 156 ++++++++ > drivers/infiniband/hw/mana/mr.c | 200 ++++++++++ > drivers/infiniband/hw/mana/qp.c | 505 +++++++++++++++++++++++ > drivers/infiniband/hw/mana/wq.c | 115 ++++++ > include/net/mana/mana.h | 3 + > include/uapi/rdma/ib_user_ioctl_verbs.h | 1 + > include/uapi/rdma/mana-abi.h | 66 +++ > 15 files changed, 1775 insertions(+) > create mode 100644 drivers/infiniband/hw/mana/Kconfig > create mode 100644 drivers/infiniband/hw/mana/Makefile > create mode 100644 drivers/infiniband/hw/mana/cq.c > create mode 100644 drivers/infiniband/hw/mana/device.c > create mode 100644 drivers/infiniband/hw/mana/main.c > create mode 100644 drivers/infiniband/hw/mana/mana_ib.h > create mode 100644 drivers/infiniband/hw/mana/mr.c > create mode 100644 drivers/infiniband/hw/mana/qp.c > create mode 100644 drivers/infiniband/hw/mana/wq.c > create mode 100644 include/uapi/rdma/mana-abi.h > [...] > + > +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd, > + u32 doorbell_id) > +{ > + struct gdma_dev *mdev = dev->gdma_dev; > + struct mana_port_context *mpc; > + struct mana_context *mc; > + struct net_device *ndev; > + int err; > + > + mc = mdev->driver_data; > + ndev = mc->ports[port]; > + mpc = netdev_priv(ndev); > + > + mutex_lock(&pd->vport_mutex); > + > + pd->vport_use_count++; > + if (pd->vport_use_count > 1) { > + ibdev_dbg(&dev->ib_dev, > + "Skip as this PD is already configured vport\n"); > + mutex_unlock(&pd->vport_mutex); > + return 0; > + } > + mutex_unlock(&pd->vport_mutex); > + > + err = mana_cfg_vport(mpc, pd->pdn, doorbell_id); > + if (err) { > + mutex_lock(&pd->vport_mutex); > + pd->vport_use_count--; > + mutex_unlock(&pd->vport_mutex); It seems there might be a race between the "pd->vport_use_count > 1" checking above and the error handling here, it may cause other user using a unconfigured vport if other user is checking the "pd->vport_use_count > 1)" while mana_cfg_vport() fails before doing "pd->vport_use_count--". > + > + ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n", err); > + return err; > + } > + > + pd->tx_shortform_allowed = mpc->tx_shortform_allowed; > + pd->tx_vp_offset = mpc->tx_vp_offset; > + > + ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n", > + mpc->port_handle, pd->pdn, doorbell_id); > + > + return 0; > +} > + [...] > + > +static int mana_gd_allocate_doorbell_page(struct gdma_context *gc, > + int *doorbell_page) > +{ > + struct gdma_allocate_resource_range_req req = {}; > + struct gdma_allocate_resource_range_resp resp = {}; > + int err; > + > + mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOCATE_RESOURCE_RANGE, > + sizeof(req), sizeof(resp)); > + > + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; > + req.num_resources = 1; > + req.alignment = 1; > + > + /* Have GDMA start searching from 0 */ > + req.allocated_resources = 0; > + > + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); > + if (err || resp.hdr.status) { > + dev_err(gc->dev, > + "Failed to allocate doorbell page: ret %d, 0x%x\n", > + err, resp.hdr.status); > + return err ? err : -EPROTO; > + } > + > + *doorbell_page = resp.allocated_resources; > + > + return 0; > +} > + > +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, > + struct ib_udata *udata) > +{ > + struct mana_ib_ucontext *ucontext = > + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); > + struct ib_device *ibdev = ibcontext->device; > + struct mana_ib_dev *mdev; > + struct gdma_context *gc; > + struct gdma_dev *dev; > + int doorbell_page; > + int ret; > + > + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); > + dev = mdev->gdma_dev; > + gc = dev->gdma_context; > + > + /* Allocate a doorbell page index */ > + ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page); > + if (ret) { > + ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret); > + return -ENOMEM; It does not make much sense to do "err ? err : -EPROTO" in mana_gd_allocate_doorbell_page() if -ENOMEM is returned unconditionally here. > + } > + > + ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page); > + > + ucontext->doorbell = doorbell_page; > + > + return 0; > +} > + [...] > diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h > new file mode 100644 > index 000000000000..2225a6d6f8e1 > --- /dev/null > +++ b/drivers/infiniband/hw/mana/mana_ib.h > @@ -0,0 +1,156 @@ > +/* SPDX-License-Identifier: GPL-2.0-only */ > +/* > + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. > + */ > + > +#ifndef _MANA_IB_H_ > +#define _MANA_IB_H_ > + > +#include <rdma/ib_verbs.h> > +#include <rdma/ib_mad.h> > +#include <rdma/ib_umem.h> > +#include <rdma/mana-abi.h> > +#include <rdma/uverbs_ioctl.h> > + > +#include <net/mana/mana.h> > + > +#define PAGE_SZ_BM \ > + (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \ > + SZ_512K | SZ_1M | SZ_2M) > + > +/* MANA doesn't have any limit for MR size */ > +#define MANA_IB_MAX_MR_SIZE ((u64)(~(0ULL))) Use U64_MAX? > + > +struct mana_ib_dev { > + struct ib_device ib_dev; > + struct gdma_dev *gdma_dev; > +}; > + [...] > diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c > new file mode 100644 > index 000000000000..09124dd1792d > --- /dev/null > +++ b/drivers/infiniband/hw/mana/mr.c > @@ -0,0 +1,200 @@ > +// SPDX-License-Identifier: GPL-2.0-only > +/* > + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. > + */ > + > +#include "mana_ib.h" > + > +#define VALID_MR_FLAGS \ > + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ) > + > +static enum gdma_mr_access_flags > +mana_ib_verbs_to_gdma_access_flags(int access_flags) > +{ > + enum gdma_mr_access_flags flags = GDMA_ACCESS_FLAG_LOCAL_READ; > + > + if (access_flags & IB_ACCESS_LOCAL_WRITE) > + flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE; > + > + if (access_flags & IB_ACCESS_REMOTE_WRITE) > + flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE; > + > + if (access_flags & IB_ACCESS_REMOTE_READ) > + flags |= GDMA_ACCESS_FLAG_REMOTE_READ; > + > + return flags; > +} > + > +static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, > + struct gdma_create_mr_params *mr_params) > +{ > + struct gdma_create_mr_response resp = {}; > + struct gdma_create_mr_request req = {}; > + struct gdma_dev *mdev = dev->gdma_dev; > + struct gdma_context *gc; > + int err; > + > + gc = mdev->gdma_context; > + > + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req), > + sizeof(resp)); > + req.pd_handle = mr_params->pd_handle; > + req.mr_type = mr_params->mr_type; > + > + switch (mr_params->mr_type) { > + case GDMA_MR_TYPE_GVA: > + req.gva.dma_region_handle = mr_params->gva.dma_region_handle; > + req.gva.virtual_address = mr_params->gva.virtual_address; > + req.gva.access_flags = mr_params->gva.access_flags; > + break; > + > + default: > + ibdev_dbg(&dev->ib_dev, > + "invalid param (GDMA_MR_TYPE) passed, type %d\n", > + req.mr_type); > + err = -EINVAL; > + goto error; > + } > + > + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); > + > + if (err || resp.hdr.status) { > + ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err, > + resp.hdr.status); > + if (!err) > + err = -EPROTO; > + > + goto error; > + } > + > + mr->ibmr.lkey = resp.lkey; > + mr->ibmr.rkey = resp.rkey; > + mr->mr_handle = resp.mr_handle; > + > + return 0; > +error: > + return err; There is no error handling here, maybe just return error directly instead of a goto. > +} > + > +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, gdma_obj_handle_t mr_handle) > +{ > + struct gdma_destroy_mr_response resp = {}; > + struct gdma_destroy_mr_request req = {}; > + struct gdma_dev *mdev = dev->gdma_dev; > + struct gdma_context *gc; > + int err; > + > + gc = mdev->gdma_context; > + > + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req), > + sizeof(resp)); > + > + req.mr_handle = mr_handle; > + > + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); > + if (err || resp.hdr.status) { > + dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err, > + resp.hdr.status); > + if (!err) > + err = -EPROTO; > + return err; > + } > + > + return 0; > +} > + [...] > + > +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, > + struct ib_qp_init_attr *attr, > + struct ib_udata *udata) > +{ > + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); > + struct mana_ib_dev *mdev = > + container_of(pd->device, struct mana_ib_dev, ib_dev); > + struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl; > + struct mana_ib_create_qp_rss_resp resp = {}; > + struct mana_ib_create_qp_rss ucmd = {}; > + struct gdma_dev *gd = mdev->gdma_dev; > + mana_handle_t *mana_ind_table; > + struct mana_port_context *mpc; > + struct mana_context *mc; > + struct net_device *ndev; > + struct mana_ib_cq *cq; > + struct mana_ib_wq *wq; > + unsigned int ind_tbl_size; > + struct ib_cq *ibcq; > + struct ib_wq *ibwq; > + u32 port; > + int ret; > + int i; > + > + mc = gd->driver_data; > + > + if (!udata || udata->inlen < sizeof(ucmd)) > + return -EINVAL; > + > + ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); > + if (ret) { > + ibdev_dbg(&mdev->ib_dev, > + "Failed copy from udata for create rss-qp, err %d\n", > + ret); > + return -EFAULT; Why not just return 'ret' directly?