Re: [PATCH v5 06/10] mhi: pci_generic: Add suspend/resume/recovery procedure

Manivannan Sadhasivam <manivannan.sadhasivam@xxxxxxxxxx> · Tue, 22 Dec 2020 22:35:34 +0530

On Mon, Dec 14, 2020 at 03:25:33PM +0100, Loic Poulain wrote:
> Add support for system wide suspend/resume. During suspend, MHI
> device controller must be put in M3 state and PCI bus in D3 state.
> 
> Add a recovery procedure allowing to reinitialize the device in case
> of error during resume steps, which can happen if device loses power
> (and so its context) while system suspend.
> 
> Signed-off-by: Loic Poulain <loic.poulain@xxxxxxxxxx>
> Reviewed-by Hemant Kumar <hemantk@xxxxxxxxxxxxxx>
> ---
>  drivers/bus/mhi/pci_generic.c | 102 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 102 insertions(+)
> 
> diff --git a/drivers/bus/mhi/pci_generic.c b/drivers/bus/mhi/pci_generic.c
> index 2521cd4..0936701 100644
> --- a/drivers/bus/mhi/pci_generic.c
> +++ b/drivers/bus/mhi/pci_generic.c
> @@ -13,6 +13,7 @@
>  #include <linux/mhi.h>
>  #include <linux/module.h>
>  #include <linux/pci.h>
> +#include <linux/workqueue.h>
>  
>  #define MHI_PCI_DEFAULT_BAR_NUM 0
>  
> @@ -186,6 +187,7 @@ enum mhi_pci_device_status {
>  struct mhi_pci_device {
>  	struct mhi_controller mhi_cntrl;
>  	struct pci_saved_state *pci_state;
> +	struct work_struct recovery_work;
>  	unsigned long status;
>  };
>  
> @@ -313,6 +315,48 @@ static void mhi_pci_runtime_put(struct mhi_controller *mhi_cntrl)
>  	/* no PM for now */
>  }
>  
> +static void mhi_pci_recovery_work(struct work_struct *work)
> +{
> +	struct mhi_pci_device *mhi_pdev = container_of(work, struct mhi_pci_device,
> +						       recovery_work);
> +	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;
> +	struct pci_dev *pdev = to_pci_dev(mhi_cntrl->cntrl_dev);
> +	int err;
> +
> +	dev_warn(&pdev->dev, "device recovery started\n");
> +
> +	/* Clean up MHI state */
> +	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {
> +		mhi_power_down(mhi_cntrl, false);
> +		mhi_unprepare_after_power_down(mhi_cntrl);
> +	}
> +
> +	/* Check if we can recover without full reset */
> +	pci_set_power_state(pdev, PCI_D0);
> +	pci_load_saved_state(pdev, mhi_pdev->pci_state);
> +	pci_restore_state(pdev);

These pci state settings seems redundant with resume().

In this function you should first check if MHI is alive, if yes then do
power up. Else you should just exit.

> +
> +	if (!mhi_pci_is_alive(mhi_cntrl))
> +		goto err_try_reset;
> +
> +	err = mhi_prepare_for_power_up(mhi_cntrl);
> +	if (err)
> +		goto err_try_reset;
> +
> +	err = mhi_sync_power_up(mhi_cntrl);
> +	if (err)
> +		goto err_unprepare;

Add a debug log for recovery success.

> +
> +	set_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status);
> +	return;
> +
> +err_unprepare:
> +	mhi_unprepare_after_power_down(mhi_cntrl);
> +err_try_reset:
> +	if (pci_reset_function(pdev))

So if the device recovers, who will make sure reinitializing the MHI
controller? That's why I think we should convey the recovery result to
PM core. I don't think using workqueue here is a good idea.

> +		dev_err(&pdev->dev, "Recovery failed\n");
> +}
> +
>  static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  {
>  	const struct mhi_pci_dev_info *info = (struct mhi_pci_dev_info *) id->driver_data;
> @@ -327,6 +371,8 @@ static int mhi_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	if (!mhi_pdev)
>  		return -ENOMEM;
>  
> +	INIT_WORK(&mhi_pdev->recovery_work, mhi_pci_recovery_work);
> +
>  	mhi_cntrl_config = info->config;
>  	mhi_cntrl = &mhi_pdev->mhi_cntrl;
>  
> @@ -391,6 +437,8 @@ static void mhi_pci_remove(struct pci_dev *pdev)
>  	struct mhi_pci_device *mhi_pdev = pci_get_drvdata(pdev);
>  	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;
>  
> +	cancel_work_sync(&mhi_pdev->recovery_work);
> +
>  	if (test_and_clear_bit(MHI_PCI_DEV_STARTED, &mhi_pdev->status)) {
>  		mhi_power_down(mhi_cntrl, true);
>  		mhi_unprepare_after_power_down(mhi_cntrl);
> @@ -456,12 +504,66 @@ static const struct pci_error_handlers mhi_pci_err_handler = {
>  	.reset_done = mhi_pci_reset_done,
>  };
>  
> +static int  __maybe_unused mhi_pci_suspend(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct mhi_pci_device *mhi_pdev = dev_get_drvdata(dev);
> +	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;
> +
> +	cancel_work_sync(&mhi_pdev->recovery_work);
> +
> +	/* Transition to M3 state */
> +	mhi_pm_suspend(mhi_cntrl);
> +
> +	pci_save_state(pdev);
> +	pci_disable_device(pdev);
> +	pci_wake_from_d3(pdev, true);
> +	pci_set_power_state(pdev, PCI_D3hot);
> +
> +	return 0;
> +}
> +
> +static int __maybe_unused mhi_pci_resume(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +	struct mhi_pci_device *mhi_pdev = dev_get_drvdata(dev);
> +	struct mhi_controller *mhi_cntrl = &mhi_pdev->mhi_cntrl;
> +	int err;
> +
> +	pci_set_power_state(pdev, PCI_D0);
> +	pci_restore_state(pdev);
> +	pci_set_master(pdev);
> +
> +	err = pci_enable_device(pdev);
> +	if (err)
> +		goto err_recovery;
> +
> +	/* Exit M3, transition to M0 state */
> +	err = mhi_pm_resume(mhi_cntrl);
> +	if (err) {
> +		dev_err(&pdev->dev, "failed to resume device: %d\n", err);
> +		goto err_recovery;
> +	}
> +
> +	return 0;
> +
> +err_recovery:
> +	/* The device may have loose power or crashed, try recovering it */

Did you actually hit this scenario? In the case of power loss or crash, can we
access the MHI register space?

Thanks,
Mani

> +	queue_work(system_long_wq, &mhi_pdev->recovery_work);
> +	return 0;
> +}
> +
> +static const struct dev_pm_ops mhi_pci_pm_ops = {
> +	SET_SYSTEM_SLEEP_PM_OPS(mhi_pci_suspend, mhi_pci_resume)
> +};
> +
>  static struct pci_driver mhi_pci_driver = {
>  	.name		= "mhi-pci-generic",
>  	.id_table	= mhi_pci_id_table,
>  	.probe		= mhi_pci_probe,
>  	.remove		= mhi_pci_remove,
>  	.err_handler	= &mhi_pci_err_handler,
> +	.driver.pm	= &mhi_pci_pm_ops
>  };
>  module_pci_driver(mhi_pci_driver);
>  
> -- 
> 2.7.4
>