On Thu, Jun 13, 2024 at 11:22:04AM +0300, Omer Shpigelman wrote: > Add an RDMA driver of Gaudi ASICs family for AI scaling. > The driver itself is agnostic to the ASIC in action, it operates according > to the capabilities that were passed on device initialization. > The device is initialized by the hbl_cn driver via auxiliary bus. > The driver also supports QP resource tracking and port/device HW counters. > > Signed-off-by: Omer Shpigelman <oshpigelman@xxxxxxxxx> > Co-developed-by: Abhilash K V <kvabhilash@xxxxxxxxx> > Signed-off-by: Abhilash K V <kvabhilash@xxxxxxxxx> > Co-developed-by: Andrey Agranovich <aagranovich@xxxxxxxxx> > Signed-off-by: Andrey Agranovich <aagranovich@xxxxxxxxx> > Co-developed-by: Bharat Jauhari <bjauhari@xxxxxxxxx> > Signed-off-by: Bharat Jauhari <bjauhari@xxxxxxxxx> > Co-developed-by: David Meriin <dmeriin@xxxxxxxxx> > Signed-off-by: David Meriin <dmeriin@xxxxxxxxx> > Co-developed-by: Sagiv Ozeri <sozeri@xxxxxxxxx> > Signed-off-by: Sagiv Ozeri <sozeri@xxxxxxxxx> > Co-developed-by: Zvika Yehudai <zyehudai@xxxxxxxxx> > Signed-off-by: Zvika Yehudai <zyehudai@xxxxxxxxx> I afraid that you misinterpreted the "Co-developed-by" tag. All these people are probably touch the code and not actually sit together at the same room and write the code together. So, please remove the extensive "Co-developed-by" tags. It is not full review yet, but simple pass-by-comments. > --- > MAINTAINERS | 10 + > drivers/infiniband/Kconfig | 1 + > drivers/infiniband/hw/Makefile | 1 + > drivers/infiniband/hw/hbl/Kconfig | 17 + > drivers/infiniband/hw/hbl/Makefile | 8 + > drivers/infiniband/hw/hbl/hbl.h | 326 +++ > drivers/infiniband/hw/hbl/hbl_main.c | 478 ++++ > drivers/infiniband/hw/hbl/hbl_verbs.c | 2686 ++++++++++++++++++++++ > include/uapi/rdma/hbl-abi.h | 204 ++ > include/uapi/rdma/hbl_user_ioctl_cmds.h | 66 + > include/uapi/rdma/hbl_user_ioctl_verbs.h | 106 + > include/uapi/rdma/ib_user_ioctl_verbs.h | 1 + > 12 files changed, 3904 insertions(+) > create mode 100644 drivers/infiniband/hw/hbl/Kconfig > create mode 100644 drivers/infiniband/hw/hbl/Makefile > create mode 100644 drivers/infiniband/hw/hbl/hbl.h > create mode 100644 drivers/infiniband/hw/hbl/hbl_main.c > create mode 100644 drivers/infiniband/hw/hbl/hbl_verbs.c > create mode 100644 include/uapi/rdma/hbl-abi.h > create mode 100644 include/uapi/rdma/hbl_user_ioctl_cmds.h > create mode 100644 include/uapi/rdma/hbl_user_ioctl_verbs.h <...> > +#define hbl_ibdev_emerg(ibdev, format, ...) ibdev_emerg(ibdev, format, ##__VA_ARGS__) > +#define hbl_ibdev_alert(ibdev, format, ...) ibdev_alert(ibdev, format, ##__VA_ARGS__) > +#define hbl_ibdev_crit(ibdev, format, ...) ibdev_crit(ibdev, format, ##__VA_ARGS__) > +#define hbl_ibdev_err(ibdev, format, ...) ibdev_err(ibdev, format, ##__VA_ARGS__) > +#define hbl_ibdev_warn(ibdev, format, ...) ibdev_warn(ibdev, format, ##__VA_ARGS__) > +#define hbl_ibdev_notice(ibdev, format, ...) ibdev_notice(ibdev, format, ##__VA_ARGS__) > +#define hbl_ibdev_info(ibdev, format, ...) ibdev_info(ibdev, format, ##__VA_ARGS__) > +#define hbl_ibdev_dbg(ibdev, format, ...) ibdev_dbg(ibdev, format, ##__VA_ARGS__) > + > +#define hbl_ibdev_emerg_ratelimited(ibdev, fmt, ...) \ > + ibdev_emerg_ratelimited(ibdev, fmt, ##__VA_ARGS__) > +#define hbl_ibdev_alert_ratelimited(ibdev, fmt, ...) \ > + ibdev_alert_ratelimited(ibdev, fmt, ##__VA_ARGS__) > +#define hbl_ibdev_crit_ratelimited(ibdev, fmt, ...) \ > + ibdev_crit_ratelimited(ibdev, fmt, ##__VA_ARGS__) > +#define hbl_ibdev_err_ratelimited(ibdev, fmt, ...) \ > + ibdev_err_ratelimited(ibdev, fmt, ##__VA_ARGS__) > +#define hbl_ibdev_warn_ratelimited(ibdev, fmt, ...) \ > + ibdev_warn_ratelimited(ibdev, fmt, ##__VA_ARGS__) > +#define hbl_ibdev_notice_ratelimited(ibdev, fmt, ...) \ > + ibdev_notice_ratelimited(ibdev, fmt, ##__VA_ARGS__) > +#define hbl_ibdev_info_ratelimited(ibdev, fmt, ...) \ > + ibdev_info_ratelimited(ibdev, fmt, ##__VA_ARGS__) > +#define hbl_ibdev_dbg_ratelimited(ibdev, fmt, ...) \ > + ibdev_dbg_ratelimited(ibdev, fmt, ##__VA_ARGS__) > + Please don't redefine the existing macros. Just use the existing ones. <...> > + if (hbl_ib_match_netdev(ibdev, netdev)) > + ib_port = hbl_to_ib_port_num(hdev, netdev->dev_port); > + else > + return NOTIFY_DONE; It is not kernel coding style. Please write: if (!hbl_ib_match_netdev(ibdev, netdev)) return NOTIFY_DONE; ib_port = hbl_to_ib_port_num(hdev, netdev->dev_port); > + <...> > +static int hbl_ib_probe(struct auxiliary_device *adev, const struct auxiliary_device_id *id) > +{ > + struct hbl_aux_dev *aux_dev = container_of(adev, struct hbl_aux_dev, adev); > + struct hbl_ib_aux_ops *aux_ops = aux_dev->aux_ops; > + struct hbl_ib_device *hdev; > + ktime_t timeout; > + int rc; > + > + rc = hdev_init(aux_dev); > + if (rc) { > + dev_err(&aux_dev->adev.dev, "Failed to init hdev\n"); > + return -EIO; > + } > + > + hdev = aux_dev->priv; > + > + /* don't allow module unloading while it is attached */ > + if (!try_module_get(THIS_MODULE)) { This part makes wonder, what are you trying to do here? What doesn't work for you in standard driver core and module load mechanism? > + dev_err(hdev->dev, "Failed to increment %s module refcount\n", > + module_name(THIS_MODULE)); > + rc = -EIO; > + goto module_get_err; > + } > + > + timeout = ktime_add_ms(ktime_get(), hdev->pending_reset_long_timeout * MSEC_PER_SEC); > + while (1) { > + aux_ops->hw_access_lock(aux_dev); > + > + /* if the device is operational, proceed to actual init while holding the lock in > + * order to prevent concurrent hard reset > + */ > + if (aux_ops->device_operational(aux_dev)) > + break; > + > + aux_ops->hw_access_unlock(aux_dev); > + > + if (ktime_compare(ktime_get(), timeout) > 0) { > + dev_err(hdev->dev, "Timeout while waiting for hard reset to finish\n"); > + rc = -EBUSY; > + goto timeout_err; > + } > + > + dev_notice_once(hdev->dev, "Waiting for hard reset to finish before probing IB\n"); > + > + msleep_interruptible(MSEC_PER_SEC); > + } The code above is unexpected. > + > + rc = hbl_ib_dev_init(hdev); > + if (rc) { > + dev_err(hdev->dev, "Failed to init ib device\n"); > + goto dev_init_err; > + } > + > + aux_ops->hw_access_unlock(aux_dev); > + > + return 0; > + > +dev_init_err: > + aux_ops->hw_access_unlock(aux_dev); > +timeout_err: > + module_put(THIS_MODULE); > +module_get_err: > + hdev_fini(aux_dev); > + > + return rc; > +} <...> > +static int __init hbl_ib_init(void) > +{ > + pr_info("loading driver\n"); Please remove all these debug prints and leave only the necessary ones. > + > + return auxiliary_driver_register(&hbl_ib_driver); > +} > + > +static void __exit hbl_ib_exit(void) > +{ > + auxiliary_driver_unregister(&hbl_ib_driver); > + > + pr_info("driver removed\n"); > +} > + > +module_init(hbl_ib_init); > +module_exit(hbl_ib_exit) Thanks