On 19.11.24 at 11:57, Xiang Liu wrote:
In the case of a RAS err_event_athub error, the VCPU buffers are corrupted and
cannot be restored in amdgpu_vcn_resume(), so the buffers are cleared to 0
for good. However, the fw_shared data stored in those buffers needs to be
reinitialized, or the firmware cannot work properly.
v2: Remove redundant code, as vcn_v4_0 did
v2: Refine the commit message
Signed-off-by: Xiang Liu <xiang.liu@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 32 ++++++++++++++++++-------
1 file changed, 23 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
index d011e4678ca1..cf8264bf45c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
@@ -123,6 +123,20 @@ static int vcn_v4_0_3_early_init(struct amdgpu_ip_block *ip_block)
return amdgpu_vcn_early_init(adev);
}
+static int vcn_v4_0_3_fw_shared_init(struct amdgpu_device *adev, int inst_idx)
+{
+ volatile struct amdgpu_vcn4_fw_shared *fw_shared;
Please drop the volatile here; that usually doesn't do what is intended.
When you need to make sure that the hardware sees the written values, you
need to use a memory barrier instead.
Regards,
Christian.
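
Just to illustrate the idea, a rough sketch of the helper without the
volatile, assuming an explicit write barrier such as wmb() is the kind of
thing needed here; whether a barrier is required at all at this point, and
which one, depends on how the firmware consumes the buffer and is for the
author to judge:

static int vcn_v4_0_3_fw_shared_init(struct amdgpu_device *adev, int inst_idx)
{
	struct amdgpu_vcn4_fw_shared *fw_shared;

	fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
	fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
	fw_shared->sq.is_enabled = 1;

	/* Make sure the firmware sees the updated shared buffer before it is
	 * used; wmb() is only a placeholder for whatever barrier is actually
	 * appropriate here.
	 */
	wmb();

	if (amdgpu_vcnfw_log)
		amdgpu_vcn_fwlog_init(&adev->vcn.inst[inst_idx]);

	return 0;
}

The point is just that volatile is not the right tool for ordering writes
against the hardware.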
+
+ fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
+ fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
+ fw_shared->sq.is_enabled = 1;
+
+ if (amdgpu_vcnfw_log)
+ amdgpu_vcn_fwlog_init(&adev->vcn.inst[inst_idx]);
+
+ return 0;
+}
+
/**
* vcn_v4_0_3_sw_init - sw init for VCN block
*
@@ -155,8 +169,6 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
return r;
for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
- volatile struct amdgpu_vcn4_fw_shared *fw_shared;
-
vcn_inst = GET_INST(VCN, i);
ring = &adev->vcn.inst[i].ring_enc[0];
@@ -179,12 +191,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
if (r)
return r;
- fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
- fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
- fw_shared->sq.is_enabled = true;
-
- if (amdgpu_vcnfw_log)
- amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
+ vcn_v4_0_3_fw_shared_init(adev, i);
}
if (amdgpu_sriov_vf(adev)) {
@@ -234,7 +241,7 @@ static int vcn_v4_0_3_sw_fini(struct amdgpu_ip_block *ip_block)
fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
fw_shared->present_flag_0 = 0;
- fw_shared->sq.is_enabled = cpu_to_le32(false);
+ fw_shared->sq.is_enabled = 0;
}
drm_dev_exit(idx);
}
@@ -280,6 +287,8 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block)
}
} else {
for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
+ volatile struct amdgpu_vcn4_fw_shared *fw_shared;
+
vcn_inst = GET_INST(VCN, i);
ring = &adev->vcn.inst[i].ring_enc[0];
@@ -303,6 +312,11 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block)
regVCN_RB1_DB_CTRL);
}
+ /* Re-init fw_shared when RAS err_event_athub corrupts the VCPU buffers */
+ fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
+ if (!fw_shared->sq.is_enabled)
+ vcn_v4_0_3_fw_shared_init(adev, i);
+
r = amdgpu_ring_test_helper(ring);
if (r)
return r;