On Wed, Aug 28, 2024 at 02:09:41PM +0800, Cheng Xu wrote: > Driver may probe again while hardware is destroying the internal > resources allocated for previous probing How is it possible? > which will fail the device probe. To make it more robust, we always issue a reset at the > beginning of the device probe process. > > Signed-off-by: Cheng Xu <chengyou@xxxxxxxxxxxxxxxxx> > --- > drivers/infiniband/hw/erdma/erdma.h | 1 + > drivers/infiniband/hw/erdma/erdma_main.c | 44 +++++++++++++++++++----- > 2 files changed, 36 insertions(+), 9 deletions(-) > > diff --git a/drivers/infiniband/hw/erdma/erdma.h b/drivers/infiniband/hw/erdma/erdma.h > index c8bd698e21b0..b5c258f77ca0 100644 > --- a/drivers/infiniband/hw/erdma/erdma.h > +++ b/drivers/infiniband/hw/erdma/erdma.h > @@ -94,6 +94,7 @@ enum { > > #define ERDMA_CMDQ_TIMEOUT_MS 15000 > #define ERDMA_REG_ACCESS_WAIT_MS 20 > +#define ERDMA_WAIT_DEV_REST_CNT 50 > #define ERDMA_WAIT_DEV_DONE_CNT 500 > > struct erdma_cmdq { > diff --git a/drivers/infiniband/hw/erdma/erdma_main.c b/drivers/infiniband/hw/erdma/erdma_main.c > index 7080f8a71ec4..9199058a0b29 100644 > --- a/drivers/infiniband/hw/erdma/erdma_main.c > +++ b/drivers/infiniband/hw/erdma/erdma_main.c > @@ -209,11 +209,30 @@ static void erdma_device_uninit(struct erdma_dev *dev) > dma_pool_destroy(dev->resp_pool); > } > > -static void erdma_hw_reset(struct erdma_dev *dev) > +static int erdma_hw_reset(struct erdma_dev *dev, bool wait) > { > u32 ctrl = FIELD_PREP(ERDMA_REG_DEV_CTRL_RESET_MASK, 1); > + int i; > > erdma_reg_write32(dev, ERDMA_REGS_DEV_CTRL_REG, ctrl); > + > + if (!wait) > + return 0; > + > + for (i = 0; i < ERDMA_WAIT_DEV_REST_CNT; i++) { > + if (erdma_reg_read32_filed(dev, ERDMA_REGS_DEV_ST_REG, > + ERDMA_REG_DEV_ST_RESET_DONE_MASK)) > + break; > + > + msleep(ERDMA_REG_ACCESS_WAIT_MS); > + } > + > + if (i == ERDMA_WAIT_DEV_REST_CNT) { > + dev_err(&dev->pdev->dev, "wait reset done timeout.\n"); > + return -ETIME; > + } > + > + return 0; > } > > static int erdma_wait_hw_init_done(struct erdma_dev *dev) > @@ -239,6 +258,17 @@ static int erdma_wait_hw_init_done(struct erdma_dev *dev) > return 0; > } > > +static int erdma_preinit_check(struct erdma_dev *dev) > +{ > + u32 version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); > + > + /* we knows that it is a non-functional function. */ > + if (version == 0) > + return -ENODEV; > + > + return erdma_hw_reset(dev, true); > +} > + > static const struct pci_device_id erdma_pci_tbl[] = { > { PCI_DEVICE(PCI_VENDOR_ID_ALIBABA, 0x107f) }, > {} > @@ -248,7 +278,6 @@ static int erdma_probe_dev(struct pci_dev *pdev) > { > struct erdma_dev *dev; > int bars, err; > - u32 version; > > err = pci_enable_device(pdev); > if (err) { > @@ -287,12 +316,9 @@ static int erdma_probe_dev(struct pci_dev *pdev) > goto err_release_bars; > } > > - version = erdma_reg_read32(dev, ERDMA_REGS_VERSION_REG); > - if (version == 0) { > - /* we knows that it is a non-functional function. */ > - err = -ENODEV; > + err = erdma_preinit_check(dev); > + if (err) > goto err_iounmap_func_bar; > - } > > err = erdma_device_init(dev, pdev); > if (err) > @@ -327,7 +353,7 @@ static int erdma_probe_dev(struct pci_dev *pdev) > return 0; > > err_reset_hw: > - erdma_hw_reset(dev); > + erdma_hw_reset(dev, false); > > err_uninit_cmdq: > erdma_cmdq_destroy(dev); > @@ -364,7 +390,7 @@ static void erdma_remove_dev(struct pci_dev *pdev) > struct erdma_dev *dev = pci_get_drvdata(pdev); > > erdma_ceqs_uninit(dev); > - erdma_hw_reset(dev); > + erdma_hw_reset(dev, false); > erdma_cmdq_destroy(dev); > erdma_aeq_destroy(dev); > erdma_comm_irq_uninit(dev); > -- > 2.31.1 > >