Re: [PATCH v2 6/6] drm/amdgpu: reduce reset time

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 





On 7/28/2022 4:00 PM, Victor Zhao wrote:
In multi container use case, reset time is important, so skip ring
tests and cp halt wait during ip suspending for reset as they are
going to fail and cost more time on reset

v2: add a hang flag to indicate the reset comes from a job timeout,
skip ring test and cp halt wait in this case

Signed-off-by: Victor Zhao <Victor.Zhao@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   |  3 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c   |  2 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  1 +
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c    | 11 +++++++++--
  5 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 222d3d7ea076..c735a17c6afb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -27,6 +27,7 @@
  #include "amdgpu_gfx.h"
  #include "amdgpu_rlc.h"
  #include "amdgpu_ras.h"
+#include "amdgpu_reset.h"
/* delay 0.1 second to enable gfx off feature */
  #define GFX_OFF_DELAY_ENABLE         msecs_to_jiffies(100)
@@ -477,7 +478,7 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev)
  		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.compute_ring[i],
  					   RESET_QUEUES, 0, 0);
- if (adev->gfx.kiq.ring.sched.ready)
+	if (adev->gfx.kiq.ring.sched.ready && !(amdgpu_in_reset(adev) && adev->reset_domain->hang))
  		r = amdgpu_ring_test_helper(kiq_ring);

On a system with multiple GPUs interconnected, this will affect other GPUs as well on which job was not really running. I guess your usecase here is device specific.

Thanks,
Lijo

  	spin_unlock(&adev->gfx.kiq.ring_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 6c3e7290153f..bb40880a557f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -49,6 +49,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
  	}
memset(&ti, 0, sizeof(struct amdgpu_task_info));
+	adev->reset_domain->hang = true;
if (amdgpu_gpu_recovery &&
  	    amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
@@ -83,6 +84,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
  	}
exit:
+	adev->reset_domain->hang = false;
  	drm_dev_exit(idx);
  	return DRM_GPU_SCHED_STAT_NOMINAL;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index 9da5ead50c90..b828fe773f50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -155,6 +155,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
  	atomic_set(&reset_domain->in_gpu_reset, 0);
  	atomic_set(&reset_domain->reset_res, 0);
  	init_rwsem(&reset_domain->sem);
+	reset_domain->hang = false;
return reset_domain;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index cc4b2eeb24cf..29e324add552 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -84,6 +84,7 @@ struct amdgpu_reset_domain {
  	struct rw_semaphore sem;
  	atomic_t in_gpu_reset;
  	atomic_t reset_res;
+	bool hang;
  };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index fafbad3cf08d..a384e04d916c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -29,6 +29,7 @@
  #include "amdgpu.h"
  #include "amdgpu_gfx.h"
  #include "amdgpu_psp.h"
+#include "amdgpu_reset.h"
  #include "nv.h"
  #include "nvd.h"
@@ -5971,6 +5972,9 @@ static int gfx_v10_0_cp_gfx_enable(struct amdgpu_device *adev, bool enable)
  		WREG32_SOC15(GC, 0, mmCP_ME_CNTL, tmp);
  	}
+ if ((amdgpu_in_reset(adev) && adev->reset_domain->hang) && !enable)
+		return 0;
+
  	for (i = 0; i < adev->usec_timeout; i++) {
  		if (RREG32_SOC15(GC, 0, mmCP_STAT) == 0)
  			break;
@@ -7569,8 +7573,10 @@ static int gfx_v10_0_kiq_disable_kgq(struct amdgpu_device *adev)
  	for (i = 0; i < adev->gfx.num_gfx_rings; i++)
  		kiq->pmf->kiq_unmap_queues(kiq_ring, &adev->gfx.gfx_ring[i],
  					   PREEMPT_QUEUES, 0, 0);
-
-	return amdgpu_ring_test_helper(kiq_ring);
+	if (!(amdgpu_in_reset(adev) && adev->reset_domain->hang))
+		return amdgpu_ring_test_helper(kiq_ring);
+	else
+		return 0;
  }
  #endif
@@ -7610,6 +7616,7 @@ static int gfx_v10_0_hw_fini(void *handle) return 0;
  	}
+
  	gfx_v10_0_cp_enable(adev, false);
  	gfx_v10_0_enable_gui_idle_interrupt(adev, false);



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux