Re: [PATCH 5/7] drm/amdgpu: Fix consecutive DPC recoveries failure.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 8/26/20 11:20 AM, Alex Deucher wrote:
On Wed, Aug 26, 2020 at 10:46 AM Andrey Grodzovsky
<andrey.grodzovsky@xxxxxxx> wrote:
DPC recovery after prev. DPC recovery or after prev. MODE1 reset fails
unles you save the cashe the saved PCI confspace to load it after
each new reset.
Also use same cached state for other use case of restoring PCI confspace
such as GPU mode1 or VGA switheroo.

We don't want to keep the saved state around in the pci core
otherwise, the pci core will assume we are managing the saved state
for suspend and resume.


Are you talking about all the references to pci_dev->state_saved in pci-driver.c ?

Andrey


I think we want logic like this:

At driver load time:
pci_save_state(pdev);
adev->pci_state = pci_store_saved_state(pdev);
pci_restore_state(adev->pdev);

then in the case of dpc, do:
pci_load_saved_state(pdev, adev->pci_state);

For all the other cases, just leave the code as is.

Alex


Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  6 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 60 +++++++++++++++++++++++++++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  4 +-
  drivers/gpu/drm/amd/amdgpu/nv.c            |  4 +-
  drivers/gpu/drm/amd/amdgpu/soc15.c         |  4 +-
  5 files changed, 66 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 3489622..42ee208 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -992,7 +992,9 @@ struct amdgpu_device {
         atomic_t                        throttling_logging_enabled;
         struct ratelimit_state          throttling_logging_rs;
         uint32_t                        ras_features;
+
Unrelated whitespace changes.

         bool                            in_dpc;
+       struct pci_saved_state          *pci_state;
  };

  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
@@ -1272,6 +1274,10 @@ pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev);
  pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev);
  void amdgpu_pci_resume(struct pci_dev *pdev);

+bool amdgpu_device_cache_pci_state(struct pci_dev *pdev);
+bool amdgpu_device_load_pci_state(struct pci_dev *pdev);
+
+

  #include "amdgpu_object.h"

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d9e3994..2c088df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1283,7 +1283,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;

                 pci_set_power_state(dev->pdev, PCI_D0);
-               pci_restore_state(dev->pdev);
+               amdgpu_device_load_pci_state(dev->pdev);
                 r = pci_enable_device(dev->pdev);
                 if (r)
                         DRM_WARN("pci_enable_device failed (%d)\n", r);
@@ -1296,7 +1296,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
                 drm_kms_helper_poll_disable(dev);
                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
                 amdgpu_device_suspend(dev, true);
-               pci_save_state(dev->pdev);
+               amdgpu_device_cache_pci_state(dev->pdev);
                 /* Shut down the device */
                 pci_disable_device(dev->pdev);
                 pci_set_power_state(dev->pdev, PCI_D3cold);
@@ -3401,8 +3401,9 @@ int amdgpu_device_init(struct amdgpu_device *adev,
         if (r)
                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");

-       if (pci_save_state(pdev))
-               DRM_ERROR("Failed to save PCI state!!\n");
+       /* Have stored pci confspace at hand for restore in sudden PCI error */
+       if (!amdgpu_device_cache_pci_state(adev->pdev))
+               DRM_WARN("Failed to cache PCI state!");

         return 0;

@@ -3430,6 +3431,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
         flush_delayed_work(&adev->delayed_init_work);
         adev->shutdown = true;

+       kfree(adev->pci_state);
+
         /* make sure IB test finished before entering exclusive mode
          * to avoid preemption on IB test
          * */
@@ -4855,7 +4858,7 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
         /* wait for asic to come out of reset */
         msleep(500);

-       pci_restore_state(pdev);
+       amdgpu_device_load_pci_state(pdev);

         /* confirm  ASIC came out of reset */
         for (i = 0; i < adev->usec_timeout; i++) {
@@ -4934,8 +4937,10 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)

  out:

-       if (!r)
+       if (!r) {
+               amdgpu_device_cache_pci_state(adev->pdev);
                 DRM_INFO("PCIe error recovery succeeded\n");
+       }
         else {
                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
                 amdgpu_device_unlock_adev(adev);
@@ -4974,3 +4979,46 @@ void amdgpu_pci_resume(struct pci_dev *pdev)

         amdgpu_device_unlock_adev(adev);
  }
+
+bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
+{
+       struct drm_device *dev = pci_get_drvdata(pdev);
+       struct amdgpu_device *adev = drm_to_adev(dev);
+       int r;
+
+       r = pci_save_state(pdev);
+       if (!r) {
+               kfree(adev->pci_state);
+
+               adev->pci_state = pci_store_saved_state(pdev);
+
+               if (!adev->pci_state) {
+                       DRM_ERROR("Failed to store PCI saved state");
+                       return false;
+               }
+       } else {
+               DRM_WARN("Failed to save PCI state, err:%d\n", r);
+               return false;
+       }
+
+       return true;
+}
+
+bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
+{
+       struct drm_device *dev = pci_get_drvdata(pdev);
+       struct amdgpu_device *adev = drm_to_adev(dev);
+       int r;
+
+       if (!adev->pci_state)
+               return false;
+
+       r = pci_load_saved_state(pdev, adev->pci_state);
+
+       if (!r) {
+               pci_restore_state(pdev);
+       } else {
+               DRM_WARN("Failed to load PCI state, err:%d\n", r);
+               return false;
+       }
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 4bbcc70..7a6482a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -1320,7 +1320,7 @@ static int amdgpu_pmops_runtime_suspend(struct device *dev)
                 if (amdgpu_is_atpx_hybrid()) {
                         pci_ignore_hotplug(pdev);
                 } else {
-                       pci_save_state(pdev);
+                       amdgpu_device_cache_pci_state(pdev);
                         pci_disable_device(pdev);
                         pci_ignore_hotplug(pdev);
                         pci_set_power_state(pdev, PCI_D3cold);
@@ -1353,7 +1353,7 @@ static int amdgpu_pmops_runtime_resume(struct device *dev)
                         pci_set_master(pdev);
                 } else {
                         pci_set_power_state(pdev, PCI_D0);
-                       pci_restore_state(pdev);
+                       amdgpu_device_load_pci_state(pdev);
                         ret = pci_enable_device(pdev);
                         if (ret)
                                 return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 4d14023..0ec6603 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -311,7 +311,7 @@ static int nv_asic_mode1_reset(struct amdgpu_device *adev)
         /* disable BM */
         pci_clear_master(adev->pdev);

-       pci_save_state(adev->pdev);
+       amdgpu_device_cache_pci_state(adev->pdev);

         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
                 dev_info(adev->dev, "GPU smu mode1 reset\n");
@@ -323,7 +323,7 @@ static int nv_asic_mode1_reset(struct amdgpu_device *adev)

         if (ret)
                 dev_err(adev->dev, "GPU mode1 reset failed\n");
-       pci_restore_state(adev->pdev);
+       amdgpu_device_load_pci_state(adev->pdev);

         /* wait for asic to come out of reset */
         for (i = 0; i < adev->usec_timeout; i++) {
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c
index 2f93c47..ddd55e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -484,13 +484,13 @@ static int soc15_asic_mode1_reset(struct amdgpu_device *adev)
         /* disable BM */
         pci_clear_master(adev->pdev);

-       pci_save_state(adev->pdev);
+       amdgpu_device_cache_pci_state(adev->pdev);

         ret = psp_gpu_reset(adev);
         if (ret)
                 dev_err(adev->dev, "GPU mode1 reset failed\n");

-       pci_restore_state(adev->pdev);
+       amdgpu_device_load_pci_state(adev->pdev);

         /* wait for asic to come out of reset */
         for (i = 0; i < adev->usec_timeout; i++) {
--
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Candrey.grodzovsky%40amd.com%7C5faaaf11d6024193cb2508d849d38fae%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637340520282366646&amp;sdata=ziPLmwT95XAnKftzplY2QBHq%2BL1961d0SdaMPAlVMcE%3D&amp;reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux