RE: [PATCH] drm/amdgpu/vcn: reset fw_shared when VCPU buffers corrupted on vcn v4.0.3

"Yang, Stanley" <Stanley.Yang@xxxxxxx> · Thu, 21 Nov 2024 02:34:10 +0000



[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Stanley.Yang <Stanley.Yang@xxxxxxx>

Regards,
Stanley
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Xiang Liu
> Sent: Wednesday, November 20, 2024 8:35 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Deucher, Alexander
> <Alexander.Deucher@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx>;
> Liu, Leo <Leo.Liu@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>
> Cc: Wu, David <David.Wu3@xxxxxxx>; Liu, Xiang(Dean) <Xiang.Liu@xxxxxxx>
> Subject: [PATCH] drm/amdgpu/vcn: reset fw_shared when VCPU buffers corrupted
> on vcn v4.0.3
>
> It is not necessarily corrupted. When there is a RAS fatal error, device memory access
> is blocked. Hence the vcpu bo cannot be saved to system memory as in a regular
> suspend sequence before going for reset. In other full device reset cases, it gets
> saved and restored during resume.
>
> v2: Remove redundant code like vcn_v4_0 did
> v2: Refine commit message
> v3: Drop the volatile
> v3: Refine commit message
>
> Signed-off-by: Xiang Liu <xiang.liu@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 30 ++++++++++++++++++-------
>  1 file changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> index d011e4678ca1..c678631c6887 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c
> @@ -123,6 +123,20 @@ static int vcn_v4_0_3_early_init(struct amdgpu_ip_block *ip_block)
>       return amdgpu_vcn_early_init(adev);
>  }
>
> +static int vcn_v4_0_3_fw_shared_init(struct amdgpu_device *adev, int inst_idx)
> +{
> +     struct amdgpu_vcn4_fw_shared *fw_shared;
> +
> +     fw_shared = adev->vcn.inst[inst_idx].fw_shared.cpu_addr;
> +     fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
> +     fw_shared->sq.is_enabled = 1;
> +
> +     if (amdgpu_vcnfw_log)
> +             amdgpu_vcn_fwlog_init(&adev->vcn.inst[inst_idx]);
> +
> +     return 0;
> +}
> +
>  /**
>   * vcn_v4_0_3_sw_init - sw init for VCN block
>   *
> @@ -155,8 +169,6 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>               return r;
>
>       for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
> -             volatile struct amdgpu_vcn4_fw_shared *fw_shared;
> -
>               vcn_inst = GET_INST(VCN, i);
>
>               ring = &adev->vcn.inst[i].ring_enc[0];
> @@ -179,12 +191,7 @@ static int vcn_v4_0_3_sw_init(struct amdgpu_ip_block *ip_block)
>               if (r)
>                       return r;
>
> -             fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
> -             fw_shared->present_flag_0 = cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
> -             fw_shared->sq.is_enabled = true;
> -
> -             if (amdgpu_vcnfw_log)
> -                     amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
> +             vcn_v4_0_3_fw_shared_init(adev, i);
>       }
>
>       if (amdgpu_sriov_vf(adev)) {
> @@ -280,6 +287,8 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block)
>               }
>       } else {
>               for (i = 0; i < adev->vcn.num_vcn_inst; ++i) {
> +                     struct amdgpu_vcn4_fw_shared *fw_shared;
> +
>                       vcn_inst = GET_INST(VCN, i);
>                       ring = &adev->vcn.inst[i].ring_enc[0];
>
> @@ -303,6 +312,11 @@ static int vcn_v4_0_3_hw_init(struct amdgpu_ip_block *ip_block)
>                                       regVCN_RB1_DB_CTRL);
>                       }
>
> +                     /* Re-init fw_shared when RAS fatal error occurred */
> +                     fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
> +                     if (!fw_shared->sq.is_enabled)
> +                             vcn_v4_0_3_fw_shared_init(adev, i);
> +
>                       r = amdgpu_ring_test_helper(ring);
>                       if (r)
>                               return r;
> --
> 2.34.1