[AMD Official Use Only - AMD Internal Distribution Only] Reviewed-by: Stanley.Yang <Stanley.Yang@xxxxxxx> Regards, Stanley > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Xiang Liu > Sent: Wednesday, November 20, 2024 8:35 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Deucher, Alexander > <Alexander.Deucher@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>; > Liu, Leo <Leo.Liu@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Cc: Wu, David <David.Wu3@xxxxxxx>; Liu, Xiang(Dean) <Xiang.Liu@xxxxxxx> > Subject: [PATCH] drm/amdgpu/vcn: reset fw_shared when VCPU buffers corrupted > on vcn v4.0.3 > > The VCPU buffers are not necessarily corrupted. When a RAS fatal error occurs, device memory access > is blocked. Hence the VCPU BO cannot be saved to system memory as in a regular > suspend sequence before going for reset. In other full device reset cases, the BO gets > saved and restored during resume. > > v2: Remove redundant code like vcn_v4_0 did > v2: Refine commit message > v3: Drop the volatile > v3: Refine commit message > > Signed-off-by: Xiang Liu <xiang.liu@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 30 ++++++++++++++++++------- > 1 file changed, 22 insertions(+), 8 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c > b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c > index d011e4678ca1..c678631c6887 100644 > --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c > @@ -123,6 +123,20 @@ static int vcn_v4_0_3_early_init(struct amdgpu_ip_block > *ip_block) > return amdgpu_vcn_early_init(adev); > } > > +static int vcn_v4_0_3_fw_shared_init(struct amdgpu_device *adev, int > +inst_idx) { > + struct amdgpu_vcn4_fw_shared *fw_shared; > + > + fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr; > + fw_shared->present_flag_0 = > cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE); > + fw_shared->sq.is_enabled = 1; > + > + if (amdgpu_vcnfw_log) > + 
amdgpu_vcn_fwlog_init(&adev->vcn.inst[inst_idx]); > + > + return 0; > +} > + > /** > * vcn_v4_0_3_sw_init - sw init for VCN block > * > @@ -155,8 +169,6 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block > *ip_block) > return r; > > for (i = 0; i < adev->vcn.num_vcn_inst; i++) { > - volatile struct amdgpu_vcn4_fw_shared *fw_shared; > - > vcn_inst = GET_INST(VCN, i); > > ring = &adev->vcn.inst[i].ring_enc[0]; @@ -179,12 +191,7 @@ static > int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block) > if (r) > return r; > > - fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; > - fw_shared->present_flag_0 = > cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE); > - fw_shared->sq.is_enabled = true; > - > - if (amdgpu_vcnfw_log) > - amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]); > + vcn_v4_0_3_fw_shared_init(adev, i); > } > > if (amdgpu_sriov_vf(adev)) { > @@ -280,6 +287,8 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block > *ip_block) > } > } else { > for (i = 0; i < adev->vcn.num_vcn_inst; ++i) { > + struct amdgpu_vcn4_fw_shared *fw_shared; > + > vcn_inst = GET_INST(VCN, i); > ring = &adev->vcn.inst[i].ring_enc[0]; > > @@ -303,6 +312,11 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block > *ip_block) > regVCN_RB1_DB_CTRL); > } > > + /* Re-init fw_shared when RAS fatal error occurred */ > + fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr; > + if (!fw_shared->sq.is_enabled) > + vcn_v4_0_3_fw_shared_init(adev, i); > + > r = amdgpu_ring_test_helper(ring); > if (r) > return r; > -- > 2.34.1