On Fri, Aug 28, 2020 at 12:06 PM Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> wrote:
>
> DPC recovery involves ASIC reset just as normal GPU recovery so blosk

Typo: "block"

> SW GPU scedulers and wait on all concurent GPU resets.

Typos: "schedulers" and "concurrent"

>
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 57 +++++++++++++++++++++++++++---
>  1 file changed, 53 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e67cbf2..9a367a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4745,6 +4745,20 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
>  	return 0;
>  }
>
> +static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
> +{
> +	int i;
> +
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		struct amdgpu_ring *ring = adev->rings[i];
> +
> +		if (!ring || !ring->sched.thread)
> +			continue;
> +
> +		cancel_delayed_work_sync(&ring->sched.work_tdr);
> +	}
> +}
> +
>  /**
>   * amdgpu_pci_error_detected - Called when a PCI error is detected.
>   * @pdev: PCI device struct
> @@ -4758,16 +4772,38 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
>  {
>  	struct drm_device *dev = pci_get_drvdata(pdev);
>  	struct amdgpu_device *adev = drm_to_adev(dev);
> +	int i;
>
>  	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
>
>  	switch (state) {
>  	case pci_channel_io_normal:
>  		return PCI_ERS_RESULT_CAN_RECOVER;
> -	case pci_channel_io_frozen: {
> -		/* Fatal error, prepare for slot reset */
> +	case pci_channel_io_frozen: { /* Fatal error, prepare for slot reset */
> +
> +		/*
> +		 * Cancel and wait for all TDRs in progress if failing to
> +		 * set adev->in_gpu_reset in amdgpu_device_lock_adev
> +		 *
> +		 * Locking adev->reset_sem will perevent any external access

Typo: "prevent"

> +		 * to GPU during PCI error recovery
> +		 */
> +		while (!amdgpu_device_lock_adev(adev, NULL))
> +			amdgpu_cancel_all_tdr(adev);
> +
> +		/*
> +		 * Block any work scheduling as we do for regualr GPU reset

Typo: "regular"

> +		 * for the duration of the recoveryq

Typo: "recovery"

Overall looks good to me, but you might want to run the scheduling changes by Christian as well.
With the typos fixed:
Acked-by: Alex Deucher <alexander.deucher@xxxxxxx>

> +		 */
> +		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +			struct amdgpu_ring *ring = adev->rings[i];
> +
> +			if (!ring || !ring->sched.thread)
> +				continue;
> +
> +			drm_sched_stop(&ring->sched, NULL);
> +		}
>
> -		amdgpu_device_lock_adev(adev);
>  		return PCI_ERS_RESULT_NEED_RESET;
>  	}
>  	case pci_channel_io_perm_failure:
> @@ -4900,8 +4936,21 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
>  {
>  	struct drm_device *dev = pci_get_drvdata(pdev);
>  	struct amdgpu_device *adev = drm_to_adev(dev);
> +	int i;
>
> -	amdgpu_device_unlock_adev(adev);
>
>  	DRM_INFO("PCI error: resume callback!!\n");
> +
> +	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
> +		struct amdgpu_ring *ring = adev->rings[i];
> +
> +		if (!ring || !ring->sched.thread)
> +			continue;
> +
> +
> +		drm_sched_resubmit_jobs(&ring->sched);
> +		drm_sched_start(&ring->sched, true);
> +	}
> +
> +	amdgpu_device_unlock_adev(adev);
>  }
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx