On 7/28/2022 4:00 PM, Victor Zhao wrote:
In multi container use case, reset time is important, so skip ring
tests and cp halt wait during ip suspending for reset as they are
going to fail and cost more time on reset
v2: add a hang flag to indicate the reset comes from a job timeout,
skip ring test and cp halt wait in this case
Signed-off-by: Victor Zhao <Victor.Zhao@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 3 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 +
drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 11 +++++++++--
5 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 222d3d7ea076..c735a17c6afb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -27,6 +27,7 @@
#include "amdgpu_gfx.h"
#include "amdgpu_rlc.h"
#include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
/* delay 0.1 second to enable gfx off feature */
#define GFX_OFF_DELAY_ENABLE msecs_to_jiffies(100)
@@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
RESET_QUEUES, 0, 0);
- if (adev->gfx.kiq.ring.sched.ready)
+ if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && adev->reset_domain->hang))
r = amdgpu_ring_test_helper(kiq_ring);
On a system with multiple GPUs interconnected, this will affect other
GPUs as well on which job was not really running. I guess your usecase
here is device specific.
Thanks,
Lijo
spin_unlock(&adev->gfx.kiq.ring_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 6c3e7290153f..bb40880a557f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
}
memset(&ti, 0, sizeof(struct amdgpu_task_info));
+ adev->reset_domain->hang = true;
if (amdgpu_gpu_recovery &&
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
@@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
}
exit:
+ adev->reset_domain->hang = false;
drm_dev_exit(idx);
return DRM_GPU_SCHED_STAT_NOMINAL;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 9da5ead50c90..b828fe773f50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
atomic_set(&reset_domain->in_gpu_reset, 0);
atomic_set(&reset_domain->reset_res, 0);
init_rwsem(&reset_domain->sem);
+ reset_domain->hang = false;
return reset_domain;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index cc4b2eeb24cf..29e324add552 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
struct rw_semaphore sem;
atomic_t in_gpu_reset;
atomic_t reset_res;
+ bool hang;
};
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index fafbad3cf08d..a384e04d916c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -29,6 +29,7 @@
#include "amdgpu.h"
#include "amdgpu_gfx.h"
#include "amdgpu_psp.h"
+#include "amdgpu_reset.h"
#include "nv.h"
#include "nvd.h"
@@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
}
+ if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)
+ return 0;
+
for (i = 0; i < adev->usec_timeout; i++) {
if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
break;
@@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
for (i = 0; i < adev->gfx.num_gfx_rings; i++)
kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
PREEMPT_QUEUES, 0, 0);
-
- return amdgpu_ring_test_helper(kiq_ring);
+ if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))
+ return amdgpu_ring_test_helper(kiq_ring);
+ else
+ return 0;
}
#endif
@@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle)
return 0;
}
+
gfx_v10_0_cp_enable(adev, false);
gfx_v10_0_enable_gui_idle_interrupt(adev, false);