This patch will ignore non-fatal errors and try to stop amdgpu's sw stack on fatal errors. Signed-off-by: Nirmoy Das <nirmoy.das@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 56 ++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index c1219af2e7d6..2b9ede3000ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -35,6 +35,7 @@ #include <linux/pm_runtime.h> #include <linux/vga_switcheroo.h> #include <drm/drm_probe_helper.h> +#include <drm/drm_atomic_helper.h> #include <linux/mmu_notifier.h> #include "amdgpu.h" @@ -1516,6 +1517,58 @@ static struct drm_driver kms_driver = { .patchlevel = KMS_DRIVER_PATCHLEVEL, }; +static pci_ers_result_t amdgpu_pci_err_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct drm_device *dev = pci_get_drvdata(pdev); + struct amdgpu_device *adev = dev->dev_private; + int i; + int ret = PCI_ERS_RESULT_DISCONNECT; + + switch (state) { + case pci_channel_io_normal: + ret = PCI_ERS_RESULT_CAN_RECOVER; + break; + default: + /* Disable power management */ + adev->runpm = 0; + /* Suspend all IO operations */ + amdgpu_fbdev_set_suspend(adev, 1); + cancel_delayed_work_sync(&adev->delayed_init_work); + for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { + struct amdgpu_ring *ring = adev->rings[i]; + + if (!ring || !ring->sched.thread) + continue; + + amdgpu_job_stop_all_jobs_on_sched(&ring->sched); + } + + if (adev->mode_info.mode_config_initialized) { + if (!amdgpu_device_has_dc_support(adev)) + drm_helper_force_disable_all(adev->ddev); + else + drm_atomic_helper_shutdown(adev->ddev); + } + + amdgpu_fence_driver_fini(adev); + amdgpu_fbdev_fini(adev); + /* Try to close drm device to stop applications + * from opening dri files for further IO operations. 
+ * TODO: This will throw a warning as ttm is not + * cleaned properly */ + drm_dev_fini(dev); + break; + } + + return ret; +} + +static const struct pci_error_handlers amdgpu_err_handler = { + .error_detected = amdgpu_pci_err_detected, +}; + + static struct pci_driver amdgpu_kms_pci_driver = { .name = DRIVER_NAME, .id_table = pciidlist, @@ -1523,10 +1576,9 @@ static struct pci_driver amdgpu_kms_pci_driver = { .remove = amdgpu_pci_remove, .shutdown = amdgpu_pci_shutdown, .driver.pm = &amdgpu_pm_ops, + .err_handler = &amdgpu_err_handler, }; - - static int __init amdgpu_init(void) { int r; -- 2.27.0 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx