Re: [PATCH v2] drm/amdgpu: dequeue mes scheduler during fini


 



On 13.10.22 at 12:39, YuBiao Wang wrote:
Resend to fix a coding style issue.

[Why]
If MES is not dequeued during fini, it is left in an uncleaned state
across a reload. MES then cannot receive some commands, which leads to
reload failure.

[How]
Perform the MES dequeue via MMIO after all unmap jobs have been
completed by MES and before KIQ fini.
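
For reference, the resulting teardown order in gfx_v11_0_hw_fini() looks
roughly like this (a condensed sketch of the hunks below, not a complete
function):

	/* unmap jobs for the KCQs are submitted through MES here */
	if (amdgpu_gfx_disable_kcq(adev))
		DRM_ERROR("KCQ disable failed\n");

	/* then dequeue the MES scheduler pipe itself via MMIO */
	if (adev->mes.ring.sched.ready && adev->mes.kiq_dequeue_sched)
		adev->mes.kiq_dequeue_sched(adev);

	/* and only afterwards tear down the KIQ used by MES */
	amdgpu_mes_kiq_hw_fini(adev);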

Signed-off-by: YuBiao Wang <YuBiao.Wang@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  3 ++
  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  |  3 ++
  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 47 +++++++++++++++++++++++--
  3 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index ad980f4b66e1..ea8efa52503b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -130,6 +130,9 @@ struct amdgpu_mes {
  	int                             (*kiq_hw_init)(struct amdgpu_device *adev);
  	int                             (*kiq_hw_fini)(struct amdgpu_device *adev);
+	/* dequeue sched pipe via kiq */
+	void                            (*kiq_dequeue_sched)(struct amdgpu_device *adev);
+
  	/* ip specific functions */
  	const struct amdgpu_mes_funcs   *funcs;
  };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 257b2e4de600..7c75758f58e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -4406,6 +4406,9 @@ static int gfx_v11_0_hw_fini(void *handle)
  		if (amdgpu_gfx_disable_kcq(adev))
  			DRM_ERROR("KCQ disable failed\n");
+		if (adev->mes.ring.sched.ready && adev->mes.kiq_dequeue_sched)
+			adev->mes.kiq_dequeue_sched(adev);
+
  		amdgpu_mes_kiq_hw_fini(adev);
  	}
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index b48684db2832..837ff485dab6 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -44,6 +44,7 @@ MODULE_FIRMWARE("amdgpu/gc_11_0_3_mes1.bin");
  static int mes_v11_0_hw_fini(void *handle);
  static int mes_v11_0_kiq_hw_init(struct amdgpu_device *adev);
  static int mes_v11_0_kiq_hw_fini(struct amdgpu_device *adev);
+static void mes_v11_0_kiq_dequeue_sched(struct amdgpu_device *adev);
  #define MES_EOP_SIZE			2048

@@ -1078,6 +1079,7 @@ static int mes_v11_0_sw_init(void *handle)
  	adev->mes.funcs = &mes_v11_0_funcs;
  	adev->mes.kiq_hw_init = &mes_v11_0_kiq_hw_init;
  	adev->mes.kiq_hw_fini = &mes_v11_0_kiq_hw_fini;
+	adev->mes.kiq_dequeue_sched = &mes_v11_0_kiq_dequeue_sched;
  	r = amdgpu_mes_init(adev);
  	if (r)
@@ -1151,6 +1153,42 @@ static int mes_v11_0_sw_fini(void *handle)
  	return 0;
  }
+static void mes_v11_0_kiq_dequeue_sched(struct amdgpu_device *adev)
+{
+	uint32_t data;
+	int i;
+
+	mutex_lock(&adev->srbm_mutex);
+	soc21_grbm_select(adev, 3, AMDGPU_MES_SCHED_PIPE, 0, 0);
+
+	/* disable the queue if it's active */
+	if (RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1) {
+		WREG32_SOC15(GC, 0, regCP_HQD_DEQUEUE_REQUEST, 1);
+		for (i = 0; i < adev->usec_timeout; i++) {
+			if (!(RREG32_SOC15(GC, 0, regCP_HQD_ACTIVE) & 1))
+				break;
+			udelay(1);
+		}
+	}
+	data = RREG32_SOC15(GC, 0, regCP_HQD_PQ_DOORBELL_CONTROL);
+	data = REG_SET_FIELD(data, CP_HQD_PQ_DOORBELL_CONTROL,
+				DOORBELL_EN, 0);
+	data = REG_SET_FIELD(data, CP_HQD_PQ_DOORBELL_CONTROL,
+				DOORBELL_HIT, 1);
+	WREG32_SOC15(GC, 0, regCP_HQD_PQ_DOORBELL_CONTROL, data);
+
+	WREG32_SOC15(GC, 0, regCP_HQD_PQ_DOORBELL_CONTROL, 0);
+
+	WREG32_SOC15(GC, 0, regCP_HQD_PQ_WPTR_LO, 0);
+	WREG32_SOC15(GC, 0, regCP_HQD_PQ_WPTR_HI, 0);
+	WREG32_SOC15(GC, 0, regCP_HQD_PQ_RPTR, 0);
+
+	soc21_grbm_select(adev, 0, 0, 0, 0);
+	mutex_unlock(&adev->srbm_mutex);
+
+	adev->mes.ring.sched.ready = false;
+}
+
  static void mes_v11_0_kiq_setting(struct amdgpu_ring *ring)
  {
  	uint32_t tmp;
@@ -1257,9 +1295,12 @@ static int mes_v11_0_hw_init(void *handle)
static int mes_v11_0_hw_fini(void *handle)
  {
-	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
-	adev->mes.ring.sched.ready = false;
+	/*
+	 * Do not disable mes.ring.sched.ready here, since mes
+	 * is still used to unmap other rings.
+	 * We'll set mes.ring.sched.ready to false when mes ring
+	 * is dequeued.
+	 */

Maybe drop that comment. The driver should *NEVER* set ring.sched.ready to false, but only to true.

The background is that this flag indicates whether a ring was correctly initialized from the SW side. It's *not* related to the hardware state, but is only set to true after the hardware has been tested.

We have gotten this wrong so many times and I had to write patches to remove it.

The only exception to this so far is the KIQ ring, because we don't use it for CS there. MES might get the same exception, but not as long as we don't need it somewhere.
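
If MES does need a gate here at some point, a sketch of the cleaner alternative would be a dedicated hardware-state flag instead of touching ring.sched.ready (the mes.sched_hw_active field below is hypothetical and not part of this patch):

	/* hypothetical member in struct amdgpu_mes */
	bool	sched_hw_active;

	/* in mes_v11_0_kiq_dequeue_sched(), after the MMIO dequeue sequence */
	soc21_grbm_select(adev, 0, 0, 0, 0);
	mutex_unlock(&adev->srbm_mutex);
	adev->mes.sched_hw_active = false;	/* leave ring.sched.ready alone */

	/* and in gfx_v11_0_hw_fini() */
	if (adev->mes.sched_hw_active && adev->mes.kiq_dequeue_sched)
		adev->mes.kiq_dequeue_sched(adev);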

Regards,
Christian.

  	return 0;
  }



