RE: [PATCH v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

"Zhang, Hawking" <Hawking.Zhang@xxxxxxx> · Sun, 7 Jan 2024 15:31:31 +0000

[AMD Official Use Only - General]

Please check my comments inline.

Regards,

Hawking

-----Original Message-----

From: Zhang, Morris <Shiwu.Zhang@xxxxxxx> 

Sent: Wednesday, January 3, 2024 17:46

To: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>

Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Ma, Le <Le.Ma@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>

Subject: RE: [PATCH v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

[AMD Official Use Only - General]

--Brs,
Morris Zhang
MLSE Linux  ML SRDC
Ext. 25147

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of 
> Hawking Zhang
> Sent: Tuesday, January 2, 2024 10:08 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; 
> Yang, Stanley <Stanley.Yang@xxxxxxx>; Wang, Yang(Kevin) 
> <KevinYang.Wang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>; Li, 
> Candice <Candice.Li@xxxxxxx>
> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Ma, Le 
> <Le.Ma@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Zhang, Hawking 
> <Hawking.Zhang@xxxxxxx>
> Subject: [PATCH v2 3/5] drm/amdgpu: Add ras helper to query boot 
> errors v2
>
> Add ras helper function to query boot time gpu errors.
> v2: use aqua_vanjaram smn addressing pattern
>
> Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95
> +++++++++++++++++++++++++  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |
> 15 +++-
>  3 files changed, 110 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 616b6c911767..cd91533d641c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1328,6 +1328,7 @@ int emu_soc_asic_init(struct amdgpu_device 
> *adev);  #define WREG32_FIELD_OFFSET(reg, offset, field, val) \
>       WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
> ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
>
> +#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >>
> +(l))
>  /*
>   * BIOS helpers.
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index fc42fb6ee191..a901b00d4949 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3763,3 +3763,98 @@ int amdgpu_ras_error_statistic_ce_count(struct
> ras_err_data *err_data,
>
>       return 0;
>  }
> +
> +#define mmMP0_SMN_C2PMSG_92  0x1609C
> +#define mmMP0_SMN_C2PMSG_126 0x160BE
> +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device
> *adev,
> +                                              u32 instance, u32 
> +boot_error) {
> +     u32 socket_id, aid_id, hbm_id;
> +     u32 reg_data;
> +     u64 reg_addr;
> +
> +     socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
> +     aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
> +     hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
> +
> +     /* The pattern for smn addressing in other SOC could be different from
> +      * the one for aqua_vanjaram. We should revisit the code if the pattern
> +      * is changed. In such case, replace the aqua_vanjaram implementation
> +      * with more common helper */
> +     reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
> +                aqua_vanjaram_encode_ext_smn_addressing(instance);
> +
> +     reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> +     dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, 
> + fw status
> is 0x%x\n",
> +             socket_id, aid_id, reg_data);
> +
> +     if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
> +             dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, 
> + memory
> training failed\n",
> +                      socket_id, aid_id, hbm_id);
> +
> +     if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
> +             dev_info(adev->dev, "socket: %d, aid: %d, firmware load 
> + failed at
> boot time\n",
> +                      socket_id, aid_id);
> +
> +     if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
> +             dev_info(adev->dev, "socket: %d, aid: %d, wafl link 
> + training
> failed\n",
> +                      socket_id, aid_id);
> +
> +     if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
> +             dev_info(adev->dev, "socket: %d, aid: %d, xgmi link 
> + training
> failed\n",
> +                      socket_id, aid_id);
> +
> +     if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
> +             dev_info(adev->dev, "socket: %d, aid: %d, usr cp link 
> + training
> failed\n",
> +                      socket_id, aid_id);
> +
> +     if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
> +             dev_info(adev->dev, "socket: %d, aid: %d, usr dp link 
> + training
> failed\n",
> +                      socket_id, aid_id);
> +
> +     if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
> +             dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm
> memory test failed\n",
> +                      socket_id, aid_id, hbm_id);
> +
> +     if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
> +             dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm 
> + bist
> test failed\n",
> +                      socket_id, aid_id, hbm_id); }
> +
> +static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
> +                                          u32 instance, u32 *boot_error) {
> +     u32 reg_addr;
> +     u32 reg_data;
> +     int retry_loop;
> +
> +     /* The pattern for smn addressing in other SOC could be different from
> +      * the one for aqua_vanjaram. We should revisit the code if the pattern
> +      * is changed. In such case, replace the aqua_vanjaram implementation
> +      * with more common helper */
> +     reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
> +                aqua_vanjaram_encode_ext_smn_addressing(instance);
> +
> +     for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
> +             reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> +             if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
> +                     *boot_error = reg_data;
> +                     return 0;
> +             }
> +             msleep(1);
> +     }
> +
> +     *boot_error = reg_data;
> +     return -ETIME;
> +}
> +
> +void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32
> +num_instances) {
> +     u32 boot_error = 0;
> +     u32 i;
> +
> +     for (i = 0; i < num_instances; i++) {
> +             if (amdgpu_ras_wait_for_boot_complete(adev, i, 
> + &boot_error))
I suppose we need to check the boot_error value returned to report them in case that before timeout an error happens, right?
[Hawking]: We only need to dump the boot_error value if boot status bit is not set to 1 till the time out expired.

> +                     amdgpu_ras_boot_time_error_reporting(adev, i,
> boot_error);
> +     }
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 76fb85628716..5785b705c692 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -32,6 +32,19 @@
>
>  struct amdgpu_iv_entry;
>
> +#define AMDGPU_RAS_GPU_ERR_MEM_TRAINING(x)
>       AMDGPU_GET_REG_FIELD(x, 0, 0)
> +#define AMDGPU_RAS_GPU_ERR_FW_LOAD(x)
>       AMDGPU_GET_REG_FIELD(x, 1, 1)
> +#define AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(x)
>       AMDGPU_GET_REG_FIELD(x, 2, 2)
> +#define AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(x)
>       AMDGPU_GET_REG_FIELD(x, 3, 3)
> +#define AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(x)
>       AMDGPU_GET_REG_FIELD(x, 4, 4)
> +#define AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(x)
>       AMDGPU_GET_REG_FIELD(x, 5, 5)
> +#define AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(x)
>       AMDGPU_GET_REG_FIELD(x, 6, 6)
> +#define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x)
>       AMDGPU_GET_REG_FIELD(x, 7, 7)
> +#define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)
>       AMDGPU_GET_REG_FIELD(x, 10, 8)
> +#define AMDGPU_RAS_GPU_ERR_AID_ID(x)
>       AMDGPU_GET_REG_FIELD(x, 12, 11)
> +#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)
>       AMDGPU_GET_REG_FIELD(x, 13, 13)
> +#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)
>       AMDGPU_GET_REG_FIELD(x, 31, 31)
> +
>  #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS                (0x1 << 0)
>  /* position of instance value in sub_block_index of
>   * ta_ras_trigger_error_input, the sub block uses lower 12 bits @@ 
> -818,5
> +831,5 @@ int amdgpu_ras_error_statistic_ce_count(struct ras_err_data
> *err_data,  int amdgpu_ras_error_statistic_ue_count(struct 
> ras_err_data *err_data,
>               struct amdgpu_smuio_mcm_config_info *mcm_info,
>               struct ras_err_addr *err_addr, u64 count);
> -
> +void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 
> +num_instances);
>  #endif
> --
> 2.17.1