Am 19.02.25 um 22:35 schrieb André Almeida: > Instead of only triggering a wedged event for complete GPU resets, > trigger for all types, like soft resets and ring resets. Regardless of > the reset, it's useful for userspace to know that it happened because > the kernel will reject further submissions from that app. > > Signed-off-by: André Almeida <andrealmeid@xxxxxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 --- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 16 +++++++++------- > 2 files changed, 9 insertions(+), 10 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 24ba52d76045..36738c1a5b59 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -6123,9 +6123,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > > atomic_set(&adev->reset_domain->reset_res, r); > > - if (!r) > - drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); > - Feel free to add my Reviewed-by to patches #1 and #2, but this one is a bad idea. We have resets that are not triggered by a submission timeout but by something else, RAS for example, and with this change the wedged event would no longer be raised for those. Regards, Christian. 
> return r; > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 698e5799e542..1082b957e7b1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -91,8 +91,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > struct amdgpu_job *job = to_amdgpu_job(s_job); > struct amdgpu_task_info *ti; > struct amdgpu_device *adev = ring->adev; > - int idx; > - int r; > + int idx, ret = 0; > > if (!drm_dev_enter(adev_to_drm(adev), &idx)) { > dev_info(adev->dev, "%s - device unplugged skipping recovery on scheduler:%s", > @@ -141,8 +140,8 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > * we'll fall back to full GPU reset. > */ > drm_sched_wqueue_stop(&ring->sched); > - r = amdgpu_ring_reset(ring, job->vmid); > - if (!r) { > + ret = amdgpu_ring_reset(ring, job->vmid); > + if (!ret) { > if (amdgpu_ring_sched_ready(ring)) > drm_sched_stop(&ring->sched, s_job); > atomic_inc(&ring->adev->gpu_reset_counter); > @@ -170,9 +169,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > */ > set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags); > > - r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context); > - if (r) > - dev_err(adev->dev, "GPU Recovery Failed: %d\n", r); > + ret = amdgpu_device_gpu_recover(ring->adev, job, &reset_context); > + if (ret) > + dev_err(adev->dev, "GPU Recovery Failed: %d\n", ret); > } else { > drm_sched_suspend_timeout(&ring->sched); > if (amdgpu_sriov_vf(adev)) > @@ -180,6 +179,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > } > > exit: > + if (!ret) > + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); > + > drm_dev_exit(idx); > return DRM_GPU_SCHED_STAT_NOMINAL; > }