RE: [PATCH] drm/amd/amdgpu: Clean up VCN fw_shared and set flag bits during hw_init

"Liu, Leo" <Leo.Liu@xxxxxxx> · Wed, 29 Nov 2023 20:49:43 +0000



[AMD Official Use Only - General]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of
> Bokun Zhang
> Sent: Tuesday, November 28, 2023 4:25 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Zhang, Bokun <Bokun.Zhang@xxxxxxx>
> Subject: [PATCH] drm/amd/amdgpu: Clean up VCN fw_shared and set flag
> bits during hw_init
>
> - After whole GPU reset, the VCN fw_shared data is cleaned up.
>   This seems like a new behavior and it is considered safer since
>   we do not want to keep the residual data
>
> - However, the existing driver code still assumes the residual data.
>   For example, in sw_init, we will set the UNIFIED_QUEUE flag bit.
>   This flag bit is never set anywhere else.
>
>   Then if a WGR happens, sw_init will not be called and therefore,
>   we will lose this bit and it leads to a crash in VCN FW.
>
> - A proper fix is that we explicitly set the flag bit in hw_init
>   every time and the data is repopulated after a WGR instead of
>   assuming the data can survive the WGR.
>
I think this is part of sw_init, along with loading fw. Should not be in the hw_init. I think you probably can try to save it to a saved_bo when suspend, and copy back when resume, same way as for FW.

Regards,
Leo


> Signed-off-by: Bokun Zhang <bokun.zhang@xxxxxxx>
> Change-Id: I783ff826f12e12eaf48d046ff9f1cef65558c7b9
> ---
>  drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 45 +++++++++++++++++----------
>  1 file changed, 28 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> index bf07aa200030..0cf3e639c480 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
> @@ -111,6 +111,7 @@ static int vcn_v4_0_sw_init(void *handle)  {
>       struct amdgpu_ring *ring;
>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +     volatile struct amdgpu_vcn4_fw_shared *fw_shared;
>       int i, r;
>
>       r = amdgpu_vcn_sw_init(adev);
> @@ -124,11 +125,12 @@ static int vcn_v4_0_sw_init(void *handle)
>               return r;
>
>       for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
> -             volatile struct amdgpu_vcn4_fw_shared *fw_shared;
> -
>               if (adev->vcn.harvest_config & (1 << i))
>                       continue;
>
> +             fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
> +             memset(fw_shared, 0, sizeof(struct
> amdgpu_vcn4_fw_shared));
> +
>               /* Init instance 0 sched_score to 1, so it's scheduled after
> other instances */
>               if (i == 0)
>                       atomic_set(&adev->vcn.inst[i].sched_score, 1); @@ -
> 161,21 +163,6 @@ static int vcn_v4_0_sw_init(void *handle)
>               if (r)
>                       return r;
>
> -             fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
> -             fw_shared->present_flag_0 =
> cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
> -             fw_shared->sq.is_enabled = 1;
> -
> -             fw_shared->present_flag_0 |=
> cpu_to_le32(AMDGPU_VCN_SMU_DPM_INTERFACE_FLAG);
> -             fw_shared->smu_dpm_interface.smu_interface_type = (adev-
> >flags & AMD_IS_APU) ?
> -                     AMDGPU_VCN_SMU_DPM_INTERFACE_APU :
> AMDGPU_VCN_SMU_DPM_INTERFACE_DGPU;
> -
> -             if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
> -                 IP_VERSION(4, 0, 2)) {
> -                     fw_shared->present_flag_0 |=
> AMDGPU_FW_SHARED_FLAG_0_DRM_KEY_INJECT;
> -                     fw_shared->drm_key_wa.method =
> -
>       AMDGPU_DRM_KEY_INJECT_WORKAROUND_VCNFW_ASD_HANDSH
> AKING;
> -             }
> -
>               if (amdgpu_vcnfw_log)
>                       amdgpu_vcn_fwlog_init(&adev->vcn.inst[i]);
>       }
> @@ -245,9 +232,33 @@ static int vcn_v4_0_sw_fini(void *handle)  static int
> vcn_v4_0_hw_init(void *handle)  {
>       struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> +     volatile struct amdgpu_vcn4_fw_shared *fw_shared;
>       struct amdgpu_ring *ring;
>       int i, r;
>
> +     for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
> +             if (adev->vcn.harvest_config & (1 << i))
> +                     continue;
> +
> +             fw_shared = adev->vcn.inst[i].fw_shared.cpu_addr;
> +             fw_shared->present_flag_0 |=
> cpu_to_le32(AMDGPU_FW_SHARED_FLAG_0_UNIFIED_QUEUE);
> +             fw_shared->sq.is_enabled = 1;
> +
> +             fw_shared->present_flag_0 |=
> cpu_to_le32(AMDGPU_VCN_SMU_DPM_INTERFACE_FLAG);
> +             fw_shared->smu_dpm_interface.smu_interface_type = (adev-
> >flags & AMD_IS_APU) ?
> +                     AMDGPU_VCN_SMU_DPM_INTERFACE_APU :
> +AMDGPU_VCN_SMU_DPM_INTERFACE_DGPU;
> +
> +             if (amdgpu_ip_version(adev, VCN_HWIP, 0) ==
> +                 IP_VERSION(4, 0, 2)) {
> +                     fw_shared->present_flag_0 |=
> AMDGPU_FW_SHARED_FLAG_0_DRM_KEY_INJECT;
> +                     fw_shared->drm_key_wa.method =
> +
>       AMDGPU_DRM_KEY_INJECT_WORKAROUND_VCNFW_ASD_HANDSH
> AKING;
> +             }
> +
> +             if (amdgpu_vcnfw_log)
> +                     fw_shared->present_flag_0 |=
> cpu_to_le32(AMDGPU_VCN_FW_LOGGING_FLAG);
> +     }
> +
>       if (amdgpu_sriov_vf(adev)) {
>               r = vcn_v4_0_start_sriov(adev);
>               if (r)
> --
> 2.34.1