[PATCH 2/3] drm/amdgpu: don't output mes error message when gfx hang during gpu reset

YiPeng Chai <YiPeng.Chai@xxxxxxx> · Thu, 27 Apr 2023 16:02:18 +0800

This patch suppresses the misleading mes error messages that are
printed when gfx ras poison consumption causes a gpu reset on
gfx v11_0_3.

[Why]:
  Gfx ras poison consumption causes a gfx hang, and a gfx hang
  causes mes to fail to run; gfx cannot be recovered until the
  gpu reset completes. So the mes error messages printed before
  the gpu reset completes are invalid. Since the gpu reset has
  already started, these mes error messages are easily misinterpreted.

[How]:
  Since mes depends on gfx, when mes fails to submit a packet
  during gpu reset, the driver first checks the gfx status to
  decide whether to output an error message. This check is only
  used during gpu reset.

MES error message during gpu reset:
[  389.803015] amdgpu 0000:63:00.0: amdgpu: GPU reset begin!
[  389.913909] [drm:mes_v11_0_submit_pkt_and_poll_completion.constprop.0 [amdgpu]] *ERROR* MES failed to response msg=3
[  389.914104] amdgpu: failed to remove hardware queue from MES, doorbell=0x1000
[  389.914117] amdgpu: MES might be in unrecoverable state, issue a GPU reset
[  389.914128] amdgpu: Failed to evict queue 0
[  389.914136] amdgpu: Failed to evict process queues
[  389.914145] amdgpu: Failed to suspend process 0x8001
[  390.922077] amdgpu 0000:63:00.0: amdgpu: IP block:gfx_v11_0 is hung!
[  390.923519] [drm] kiq ring mec 3 pipe 1 q 0
[  390.924573] amdkcl: cancel_work function is not supported
[  390.931057] amdgpu 0000:63:00.0: amdgpu: recover vram bo from shadow start
[  390.931060] amdgpu 0000:63:00.0: amdgpu: recover vram bo from shadow done
[  390.931067] amdgpu 0000:63:00.0: amdgpu: GPU reset(1) succeeded!

Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h                 |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c          | 10 ++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c             |  2 +-
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c              |  3 +++
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c   | 13 ++++++++-----
 drivers/gpu/drm/amd/amdkfd/kfd_process.c            |  7 +++++--
 6 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 35a0474ccdb0..3ca5716b0e06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1434,6 +1434,8 @@ int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
 			       enum amd_powergating_state state);
 
+bool amdgpu_device_check_gfx_status_in_reset(struct amdgpu_device *adev);
+
 static inline bool amdgpu_device_has_timeouts_enabled(struct amdgpu_device *adev)
 {
 	return amdgpu_gpu_recovery != 0 &&
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 051b9e231cf4..a5086be4d7dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -6059,3 +6059,13 @@ bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
 		return true;
 	}
 }
+
+/* True only if adev is in reset and its RAS context has reset_by_gfx_poison. */
+bool amdgpu_device_check_gfx_status_in_reset(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+	if (!ras || !amdgpu_in_reset(adev))
+		return false;
+	return !!ras->reset_by_gfx_poison;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index f0f00466b59f..403fb3f464f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -818,7 +818,7 @@ int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device *adev,
 	queue_input.trail_fence_data = seq;
 
 	r = adev->mes.funcs->unmap_legacy_queue(&adev->mes, &queue_input);
-	if (r)
+	if (r && (r != -EREMOTEIO))
 		DRM_ERROR("failed to unmap legacy queue\n");
 
 	return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 67f7557d545d..071973a6b0c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -129,6 +129,9 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct amdgpu_mes *mes,
 	r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
 		      timeout);
 	if (r < 1) {
+		if (amdgpu_device_check_gfx_status_in_reset(adev))
+			return -EREMOTEIO;
+
 		DRM_ERROR("MES failed to response msg=%d\n",
 			  x_pkt->header.opcode);
 
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 7a95698d83f7..5106af4bb60d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -257,9 +257,11 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
 	amdgpu_mes_unlock(&adev->mes);
 
 	if (r) {
-		pr_err("failed to remove hardware queue from MES, doorbell=0x%x\n",
-			q->properties.doorbell_off);
-		pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
+		if (r != -EREMOTEIO) {
+			pr_err("failed to remove hardware queue from MES, doorbell=0x%x\n",
+				q->properties.doorbell_off);
+			pr_err("MES might be in unrecoverable state, issue a GPU reset\n");
+		}
 		kfd_hws_hang(dqm);
 	}
 
@@ -996,8 +998,9 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 		if (dqm->dev->shared_resources.enable_mes) {
 			retval = remove_queue_mes(dqm, q, qpd);
 			if (retval) {
-				pr_err("Failed to evict queue %d\n",
-					q->properties.queue_id);
+				if (retval != -EREMOTEIO)
+					pr_err("Failed to evict queue %d\n",
+						q->properties.queue_id);
 				goto out;
 			}
 		}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 95cc63d9f578..6cedfba5b1f4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1821,7 +1821,8 @@ int kfd_process_evict_queues(struct kfd_process *p, uint32_t trigger)
 		 * them been add back since they actually not be saved right now.
 		 */
 		if (r && r != -EIO) {
-			pr_err("Failed to evict process queues\n");
+			if (r != -EREMOTEIO)
+				pr_err("Failed to evict process queues\n");
 			goto fail;
 		}
 		n_evicted++;
@@ -1984,13 +1985,15 @@ void kfd_suspend_all_processes(void)
 	struct kfd_process *p;
 	unsigned int temp;
 	int idx = srcu_read_lock(&kfd_processes_srcu);
+	int ret = 0;
 
 	WARN(debug_evictions, "Evicting all processes");
 	hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
 		cancel_delayed_work_sync(&p->eviction_work);
 		cancel_delayed_work_sync(&p->restore_work);
 
-		if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
+		ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND);
+		if (ret && (ret != -EREMOTEIO))
 			pr_err("Failed to suspend process 0x%x\n", p->pasid);
 		dma_fence_signal(p->ef);
 		dma_fence_put(p->ef);
-- 
2.34.1