On Thu, Nov 26, 2020 at 04:29:06PM +0100, Loic Poulain wrote: > If the modem crashes for any reason, we may not be able to detect > it at MHI level (MHI registers not reachable anymore). > > This patch implements a health-check mechanism to check regularly > that device is alive (MHI layer can communicate with). If device > is not alive (because a crash or unexpected reset), the recovery > procedure is triggered. > > Tested successfully with Telit FN980m module. > > Signed-off-by: Loic Poulain <loic.poulain@xxxxxxxxxx> Bhaumik, Hemant, does this looks reasonable to you? Or we can do a better job in the MHI stack. To me this is not a specific usecase for Telit. If you guys plan to implement it later, I can just apply this patch in the meantime as it looks good to me. Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@xxxxxxxxxx> Thanks, Mani > --- > drivers/bus/mhi/pci_generic.c | 34 ++++++++++++++++++++++++++++++++++ > 1 file changed, 34 insertions(+) > > diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c > index 3ac5cd2..26c2dfa 100644 > --- a/drivers/bus/mhi/pci_generic.c > +++ b/drivers/bus/mhi/pci_generic.c > @@ -14,12 +14,15 @@ > #include <linux/mhi.h> > #include <linux/module.h> > #include <linux/pci.h> > +#include <linux/timer.h> > #include <linux/workqueue.h> > > #define MHI_PCI_DEFAULT_BAR_NUM 0 > > #define DEV_RESET_REG (0xB0) > > +#define HEALTH_CHECK_PERIOD (HZ * 5) > + > /** > * struct mhi_pci_dev_info - MHI PCI device specific information > * @config: MHI controller configuration > @@ -190,6 +193,7 @@ struct mhi_pci_device { > struct mhi_controller mhi_cntrl; > struct pci_saved_state *pci_state; > struct work_struct recovery_work; > + struct timer_list health_check_timer; > unsigned long status; > }; > > @@ -332,6 +336,8 @@ static void mhi_pci_recovery_work(struct work_struct *work) > > dev_warn(&pdev->dev, "device recovery started\n"); > > + del_timer(&mhi_pdev->health_check_timer); > + > /* Clean up MHI state */ > if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { > mhi_power_down(mhi_cntrl, false); > @@ -355,6 +361,7 @@ static void mhi_pci_recovery_work(struct work_struct *work) > goto err_unprepare; > > set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > return; > > err_unprepare: > @@ -364,6 +371,21 @@ static void mhi_pci_recovery_work(struct work_struct *work) > dev_err(&pdev->dev, "Recovery failed\n"); > } > > +static void health_check(struct timer_list *t) > +{ > + struct mhi_pci_device *mhi_pdev = from_timer(mhi_pdev, t, health_check_timer); > + struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; > + > + if (!mhi_pci_is_alive(mhi_cntrl)) { > + dev_err(mhi_cntrl->cntrl_dev, "Device died\n"); > + queue_work(system_long_wq, &mhi_pdev->recovery_work); > + return; > + } > + > + /* reschedule in two seconds */ > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > +} > + > static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > { > const struct mhi_pci_dev_info *info = (struct mhi_pci_dev_info *) id->driver_data; > @@ -379,6 +401,7 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > return -ENOMEM; > > INIT_WORK(&mhi_pdev->recovery_work, mhi_pci_recovery_work); > + timer_setup(&mhi_pdev->health_check_timer, health_check, 0); > > mhi_cntrl_config = info->config; > mhi_cntrl = &mhi_pdev->mhi_cntrl; > @@ -431,6 +454,9 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) > > set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); > > + /* start health check */ > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > + > return 0; > > err_unprepare: > @@ -446,6 +472,7 @@ static void mhi_pci_remove(struct pci_dev *pdev) > struct mhi_pci_device *mhi_pdev = pci_get_drvdata(pdev); > struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; > > + del_timer(&mhi_pdev->health_check_timer); > cancel_work_sync(&mhi_pdev->recovery_work); > > if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { > @@ -466,6 +493,8 @@ void mhi_pci_reset_prepare(struct pci_dev *pdev) > > dev_info(&pdev->dev, "reset\n"); > > + del_timer(&mhi_pdev->health_check_timer); > + > /* Clean up MHI state */ > if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) { > mhi_power_down(mhi_cntrl, false); > @@ -509,6 +538,7 @@ void mhi_pci_reset_done(struct pci_dev *pdev) > } > > set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status); > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > } > > static pci_ers_result_t mhi_pci_error_detected(struct pci_dev *pdev, > @@ -569,6 +599,7 @@ int __maybe_unused mhi_pci_suspend(struct device *dev) > struct mhi_pci_device *mhi_pdev = dev_get_drvdata(dev); > struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl; > > + del_timer(&mhi_pdev->health_check_timer); > cancel_work_sync(&mhi_pdev->recovery_work); > > /* Transition to M3 state */ > @@ -604,6 +635,9 @@ static int __maybe_unused mhi_pci_resume(struct device *dev) > goto err_recovery; > } > > + /* Resume health check */ > + mod_timer(&mhi_pdev->health_check_timer, jiffies + HEALTH_CHECK_PERIOD); > + > return 0; > > err_recovery: > -- > 2.7.4 >