RE: [PATCH] drm/amdgpu: Update programming for boot error reporting

"Zhou1, Tao" <Tao.Zhou1@xxxxxxx> · Thu, 30 May 2024 09:16:48 +0000

[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx>

One more question, do we need to consider the compatible with old FW?

> -----Original Message-----
> From: Hawking Zhang <Hawking.Zhang@xxxxxxx>
> Sent: Thursday, May 30, 2024 4:41 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>
> Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
> Subject: [PATCH] drm/amdgpu: Update programming for boot error reporting
>
> AMDGPU_RAS_GPU_ERR_BOOT_STATUS field is no longer valid.
> The polling sequence is also simplifed according to the latest firmware change.
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 99 +++++++++++--------------
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  4 +-
>  2 files changed, 46 insertions(+), 57 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index eedf2b613ac2..2c338d39cd45 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -4416,64 +4416,74 @@ int amdgpu_ras_error_statistic_de_count(struct
> ras_err_data *err_data,
>  #define mmMP0_SMN_C2PMSG_92  0x1609C
>  #define mmMP0_SMN_C2PMSG_126 0x160BE
>  static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device
> *adev,
> -                                              u32 instance, u32 boot_error)
> +                                              u32 instance)
>  {
>       u32 socket_id, aid_id, hbm_id;
> -     u32 reg_data;
> +     u32 fw_status;
> +     u32 boot_error;
>       u64 reg_addr;
>
> -     socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
> -     aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
> -     hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
> -
>       /* The pattern for smn addressing in other SOC could be different from
>        * the one for aqua_vanjaram. We should revisit the code if the pattern
>        * is changed. In such case, replace the aqua_vanjaram implementation
>        * with more common helper */
>       reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
>                  aqua_vanjaram_encode_ext_smn_addressing(instance);
> +     fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> +
> +     reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
> +                aqua_vanjaram_encode_ext_smn_addressing(instance);
> +     boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
>
> -     reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> -     dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status
> is 0x%x\n",
> -             socket_id, aid_id, reg_data);
> +     socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
> +     aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
> +     hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
>
>       if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
> -             dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory
> training failed\n",
> -                      socket_id, aid_id, hbm_id);
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x,
> memory training failed\n",
> +                      socket_id, aid_id, hbm_id, fw_status);
>
>       if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
> -             dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at
> boot time\n",
> -                      socket_id, aid_id);
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, fw_status: 0x%x, firmware load
> failed at boot time\n",
> +                      socket_id, aid_id, fw_status);
>
>       if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
> -             dev_info(adev->dev, "socket: %d, aid: %d, wafl link training
> failed\n",
> -                      socket_id, aid_id);
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, fw_status: 0x%x, wafl link training
> failed\n",
> +                      socket_id, aid_id, fw_status);
>
>       if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
> -             dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training
> failed\n",
> -                      socket_id, aid_id);
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training
> failed\n",
> +                      socket_id, aid_id, fw_status);
>
>       if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
> -             dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training
> failed\n",
> -                      socket_id, aid_id);
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, fw_status: 0x%x, usr cp link
> training failed\n",
> +                      socket_id, aid_id, fw_status);
>
>       if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
> -             dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training
> failed\n",
> -                      socket_id, aid_id);
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, fw_status: 0x%x, usr dp link
> training failed\n",
> +                      socket_id, aid_id, fw_status);
>
>       if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
> -             dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm
> memory test failed\n",
> -                      socket_id, aid_id, hbm_id);
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm
> memory test failed\n",
> +                      socket_id, aid_id, hbm_id, fw_status);
>
>       if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
> -             dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist
> test failed\n",
> -                      socket_id, aid_id, hbm_id);
> +             dev_info(adev->dev,
> +                      "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm
> bist test failed\n",
> +                      socket_id, aid_id, hbm_id, fw_status);
>  }
>
> -static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
> -                                          u32 instance, u32 *boot_error)
> +static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
> +                                        u32 instance)
>  {
> -     u32 reg_addr;
> +     u64 reg_addr;
>       u32 reg_data;
>       int retry_loop;
>
> @@ -4482,41 +4492,22 @@ static int
> amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
>
>       for (retry_loop = 0; retry_loop <
> AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
>               reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> -             if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) ==
> AMDGPU_RAS_BOOT_STEADY_STATUS) {
> -                     *boot_error = AMDGPU_RAS_BOOT_SUCEESS;
> -                     return 0;
> -             }
> -             msleep(1);
> -     }
> -
> -     /* The pattern for smn addressing in other SOC could be different from
> -      * the one for aqua_vanjaram. We should revisit the code if the pattern
> -      * is changed. In such case, replace the aqua_vanjaram implementation
> -      * with more common helper */
> -     reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
> -                aqua_vanjaram_encode_ext_smn_addressing(instance);
> -
> -     for (retry_loop = 0; retry_loop <
> AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
> -             reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> -             if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
> -                     *boot_error = reg_data;
> -                     return 0;
> -             }
> -             msleep(1);
> +             if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) ==
> AMDGPU_RAS_BOOT_STEADY_STATUS)
> +                     return false;
> +             else
> +                     msleep(1);
>       }
>
> -     *boot_error = reg_data;
> -     return -ETIME;
> +     return true;
>  }
>
>  void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32
> num_instances)  {
> -     u32 boot_error = 0;
>       u32 i;
>
>       for (i = 0; i < num_instances; i++) {
> -             if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error))
> -                     amdgpu_ras_boot_time_error_reporting(adev, i,
> boot_error);
> +             if (amdgpu_ras_boot_error_detected(adev, i))
> +                     amdgpu_ras_boot_time_error_reporting(adev, i);
>       }
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 7021c4a66fb5..d0a125743d3a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -47,12 +47,10 @@ struct amdgpu_iv_entry;
>  #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)
>       AMDGPU_GET_REG_FIELD(x, 10, 8)
>  #define AMDGPU_RAS_GPU_ERR_AID_ID(x)
>       AMDGPU_GET_REG_FIELD(x, 12, 11)
>  #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)
>       AMDGPU_GET_REG_FIELD(x, 14, 13)
> -#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)
>       AMDGPU_GET_REG_FIELD(x, 31, 31)
>
> -#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000
> +#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 100
>  #define AMDGPU_RAS_BOOT_STEADY_STATUS                0xBA
>  #define AMDGPU_RAS_BOOT_STATUS_MASK          0xFF
> -#define AMDGPU_RAS_BOOT_SUCEESS                      0x80000000
>
>  #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS                (0x1 << 0)
>  /* position of instance value in sub_block_index of
> --
> 2.17.1