- After a whole GPU reset (WGR), the VCN fw_shared data is cleaned up.
  This is a new behavior, and it is considered safer since we do not
  want to keep residual data.

- However, the existing driver code still assumes that the residual
  data survives the reset. For example, in sw_init we set the
  UNIFIED_QUEUE flag bit. This flag bit is never set anywhere else.
  If a WGR happens, sw_init is not called again, so we lose this bit,
  which leads to a crash in the VCN firmware.

- The proper fix is to explicitly set the flag bit in hw_init every
  time, so that the data is repopulated after a WGR instead of
  assuming it can survive the WGR.

Signed-off-by: Bokun Zhang <bokun.zhang@xxxxxxx>
Change-Id: I783ff826f12e12eaf48d046ff9f1cef65558c7b9
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 45 +++++++++++++++++----------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index bf07aa200030..0cf3e639c480 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -111,6 +111,7 @@ static int vcn_v4_0_sw_init(void *handle)
 {
 	struct amdgpu_ring *ring;
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	volatile struct amdgpu_vcn4_fw_shared *fw_shared;
 	int i, r;
 
 	r = amdgpu_vcn_sw_init(adev);
@@ -124,11 +125,12 @@ static int vcn_v4_0_sw_init(void *handle)
 		return r;
 
 	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
-		volatile struct amdgpu_vcn4_fw_shared *fw_shared;
-
 		if (adev->vcn.harvest_config & (1 << i))
 			continue;
 
+		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
+		memset(fw_shared, 0, sizeof(struct amdgpu_vcn4_fw_shared));
+
 		/* Init instance 0 sched_score to 1, so it's scheduled after other instances */
 		if (i == 0)
 			atomic_set(&adev->vcn.inst[i].sched_score, 1);
@@ -161,21 +163,6 @@ static int vcn_v4_0_sw_init(void *handle)
 		if (r)
 			return r;
 
-		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
-		fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
-		fw_shared->sq.is_enabled = 1;
-
-		fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_SMU_DPM_INTERFACE_FLAG);
-		fw_shared->smu_dpm_interface.smu_interface_type = (adev->flags & AMD_IS_APU) ?
-			AMDGPU_VCN_SMU_DPM_INTERFACE_APU : AMDGPU_VCN_SMU_DPM_INTERFACE_DGPU;
-
-		if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
-		    IP_VERSION(4, 0, 2)) {
-			fw_shared->present_flag_0 |= AMDGPU_FW_SHARED_FLAG_0_DRM_KEY_INJECT;
-			fw_shared->drm_key_wa.method =
-				AMDGPU_DRM_KEY_INJECT_WORKAROUND_VCNFW_ASD_HANDSHAKING;
-		}
-
 		if (amdgpu_vcnfw_log)
 			amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
 	}
@@ -245,9 +232,33 @@ static int vcn_v4_0_sw_fini(void *handle)
 static int vcn_v4_0_hw_init(void *handle)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+	volatile struct amdgpu_vcn4_fw_shared *fw_shared;
 	struct amdgpu_ring *ring;
 	int i, r;
 
+	for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
+		if (adev->vcn.harvest_config & (1 << i))
+			continue;
+
+		fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
+		fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
+		fw_shared->sq.is_enabled = 1;
+
+		fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_SMU_DPM_INTERFACE_FLAG);
+		fw_shared->smu_dpm_interface.smu_interface_type = (adev->flags & AMD_IS_APU) ?
+			AMDGPU_VCN_SMU_DPM_INTERFACE_APU : AMDGPU_VCN_SMU_DPM_INTERFACE_DGPU;
+
+		if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
+		    IP_VERSION(4, 0, 2)) {
+			fw_shared->present_flag_0 |= AMDGPU_FW_SHARED_FLAG_0_DRM_KEY_INJECT;
+			fw_shared->drm_key_wa.method =
+				AMDGPU_DRM_KEY_INJECT_WORKAROUND_VCNFW_ASD_HANDSHAKING;
+		}
+
+		if (amdgpu_vcnfw_log)
+			fw_shared->present_flag_0 |= cpu_to_le32(AMDGPU_VCN_FW_LOGGING_FLAG);
+	}
+
 	if (amdgpu_sriov_vf(adev)) {
 		r = vcn_v4_0_start_sriov(adev);
 		if (r)
-- 
2.34.1
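For reviewers unfamiliar with the sw_init/hw_init split: below is a
minimal, standalone user-space sketch of the failure mode, assuming
only that a WGR zeroes fw_shared and re-runs hw_init but not sw_init.
It is not driver code; all names in it (fake_fw_shared, sw_init,
hw_init, whole_gpu_reset, UNIFIED_QUEUE_FLAG) are hypothetical
stand-ins for the amdgpu structures and callbacks.

/*
 * Sketch of the init-once bug: a flag written only in sw_init is
 * lost across a reset that zeroes shared memory, while a flag
 * repopulated in hw_init survives. Hypothetical names throughout.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define UNIFIED_QUEUE_FLAG	(1u << 0)

/* Stand-in for the VCN fw_shared region in GPU-visible memory. */
static struct fake_fw_shared {
	uint32_t present_flag_0;
} shared;

/* Runs once at driver load; with the old code, the only flag writer. */
static void sw_init(int old_behavior)
{
	memset(&shared, 0, sizeof(shared));
	if (old_behavior)
		shared.present_flag_0 |= UNIFIED_QUEUE_FLAG;
}

/* Runs at load *and* after every reset; the fix writes the flag here. */
static void hw_init(int old_behavior)
{
	if (!old_behavior)
		shared.present_flag_0 |= UNIFIED_QUEUE_FLAG;
}

/* A WGR clears fw_shared and re-runs hw_init, but never sw_init. */
static void whole_gpu_reset(int old_behavior)
{
	memset(&shared, 0, sizeof(shared));
	hw_init(old_behavior);
}

int main(void)
{
	int old_behavior;

	for (old_behavior = 1; old_behavior >= 0; old_behavior--) {
		sw_init(old_behavior);
		hw_init(old_behavior);
		whole_gpu_reset(old_behavior);
		printf("%s: UNIFIED_QUEUE after WGR is %s\n",
		       old_behavior ? "old code " : "fixed code",
		       (shared.present_flag_0 & UNIFIED_QUEUE_FLAG) ?
		       "set" : "LOST");
	}
	return 0;
}

Running this prints "LOST" for the old placement and "set" for the
hw_init placement, which is the reason the fw_shared writes are moved
in this patch: hw_init sits on both the load and the reset paths.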