Re: [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Got it

Acked-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>

Andrey

On 2022-07-26 06:01, Zhao, Victor wrote:
[AMD Official Use Only - General]

Hi Andrey,

For slow tests I mean the slow hang tests by quark tool.
An example here:
hang_vm_gfx_dispatch_slow.lua - This script runs on a graphics engine using compute engine and has a hacked CS program which is massive and duplicates standard CS program move code hundreds of thousands of times. The effect is a very slowly executing CS program.

It's not a bad job but just need a very long time to finish. I suppose we don’t have a way to stop shader here. And the running apps will be affected when reset is done.


Thanks,
Victor



-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>
Sent: Tuesday, July 26, 2022 5:20 AM
To: Zhao, Victor <Victor.Zhao@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Deng, Emily <Emily.Deng@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>
Subject: Re: [PATCH 4/5] drm/amdgpu: revert context to stop engine before mode2 reset

On 2022-07-22 03:34, Victor Zhao wrote:

For some hang caused by slow tests, engine cannot be stopped which may
cause resume failure after reset. In this case, force halt engine by
reverting context addresses

Can you maybe explain a bit more what exactly you mean by slow test and why engine cannot be stopped in this case ?

Andrey


Signed-off-by: Victor Zhao <Victor.Zhao@xxxxxxx>
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  1 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h  |  1 +
   drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c    | 36 +++++++++++++++++++++
   drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c |  2 ++
   4 files changed, 40 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5498fda8617f..833dc5e224d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5037,6 +5037,7 @@ static void amdgpu_device_recheck_guilty_jobs(
/* set guilty */
   			drm_sched_increase_karma(s_job);
+			amdgpu_reset_prepare_hwcontext(adev, reset_context);
   retry:
   			/* do hw reset */
   			if (amdgpu_sriov_vf(adev)) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
index f8036f2b100e..c7b44aeb671b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfxhub.h
@@ -37,6 +37,7 @@ struct amdgpu_gfxhub_funcs {
   	void (*utcl2_harvest)(struct amdgpu_device *adev);
   	void (*mode2_save_regs)(struct amdgpu_device *adev);
   	void (*mode2_restore_regs)(struct amdgpu_device *adev);
+	void (*halt)(struct amdgpu_device *adev);
   };
struct amdgpu_gfxhub {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
index 51cf8acd2d79..8cf53e039c11 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v2_1.c
@@ -646,6 +646,41 @@ static void gfxhub_v2_1_restore_regs(struct amdgpu_device *adev)
   	WREG32_SOC15(GC, 0, mmGCMC_VM_MX_L1_TLB_CNTL, adev->gmc.MC_VM_MX_L1_TLB_CNTL);
   }
+static void gfxhub_v2_1_halt(struct amdgpu_device *adev) {
+	struct amdgpu_vmhub *hub = &adev->vmhub[AMDGPU_GFXHUB_0];
+	int i;
+	uint32_t tmp;
+	int time = 1000;
+
+	gfxhub_v2_1_set_fault_enable_default(adev, false);
+
+	for (i = 0; i <= 14; i++) {
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_LO32,
+				    i * hub->ctx_addr_distance, ~0);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_START_ADDR_HI32,
+				    i * hub->ctx_addr_distance, ~0);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_LO32,
+				    i * hub->ctx_addr_distance,
+				    0);
+		WREG32_SOC15_OFFSET(GC, 0, mmGCVM_CONTEXT1_PAGE_TABLE_END_ADDR_HI32,
+				    i * hub->ctx_addr_distance,
+				    0);
+	}
+	tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
+	while ((tmp & (GRBM_STATUS2__EA_BUSY_MASK |
+		      GRBM_STATUS2__EA_LINK_BUSY_MASK)) != 0 &&
+	       time) {
+		udelay(100);
+		time--;
+		tmp = RREG32_SOC15(GC, 0, mmGRBM_STATUS2);
+	}
+
+	if (!time) {
+		DRM_WARN("failed to wait for GRBM(EA) idle\n");
+	}
+}
+
   const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
   	.get_fb_location = gfxhub_v2_1_get_fb_location,
   	.get_mc_fb_offset = gfxhub_v2_1_get_mc_fb_offset, @@ -658,4 +693,5
@@ const struct amdgpu_gfxhub_funcs gfxhub_v2_1_funcs = {
   	.utcl2_harvest = gfxhub_v2_1_utcl2_harvest,
   	.mode2_save_regs = gfxhub_v2_1_save_regs,
   	.mode2_restore_regs = gfxhub_v2_1_restore_regs,
+	.halt = gfxhub_v2_1_halt,
   };
diff --git a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
index 51a5b68f77d3..fead7251292f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
+++ b/drivers/gpu/drm/amd/amdgpu/sienna_cichlid.c
@@ -97,6 +97,8 @@ sienna_cichlid_mode2_prepare_hwcontext(struct amdgpu_reset_control *reset_ctl,
   	if (!amdgpu_sriov_vf(adev)) {
   		if (adev->gfxhub.funcs->mode2_save_regs)
   			adev->gfxhub.funcs->mode2_save_regs(adev);
+		if (adev->gfxhub.funcs->halt)
+			adev->gfxhub.funcs->halt(adev);
   		r = sienna_cichlid_mode2_suspend_ip(adev);
   	}



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux