[AMD Official Use Only - AMD Internal Distribution Only] Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> One more question, do we need to consider the compatible with old FW? > -----Original Message----- > From: Hawking Zhang <Hawking.Zhang@xxxxxxx> > Sent: Thursday, May 30, 2024 4:41 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Subject: [PATCH] drm/amdgpu: Update programming for boot error reporting > > AMDGPU_RAS_GPU_ERR_BOOT_STATUS field is no longer valid. > The polling sequence is also simplifed according to the latest firmware change. > > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 99 +++++++++++-------------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +- > 2 files changed, 46 insertions(+), 57 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index eedf2b613ac2..2c338d39cd45 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -4416,64 +4416,74 @@ int amdgpu_ras_error_statistic_de_count(struct > ras_err_data *err_data, > #define mmMP0_SMN_C2PMSG_92 0x1609C > #define mmMP0_SMN_C2PMSG_126 0x160BE > static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device > *adev, > - u32 instance, u32 boot_error) > + u32 instance) > { > u32 socket_id, aid_id, hbm_id; > - u32 reg_data; > + u32 fw_status; > + u32 boot_error; > u64 reg_addr; > > - socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); > - aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); > - hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); > - > /* The pattern for smn addressing in other SOC could be different from > * the one for aqua_vanjaram. We should revisit the code if the pattern > * is changed. In such case, replace the aqua_vanjaram implementation > * with more common helper */ > reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) + > aqua_vanjaram_encode_ext_smn_addressing(instance); > + fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr); > + > + reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + > + aqua_vanjaram_encode_ext_smn_addressing(instance); > + boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr); > > - reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); > - dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status > is 0x%x\n", > - socket_id, aid_id, reg_data); > + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error); > + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error); > + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error); > > if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error)) > - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory > training failed\n", > - socket_id, aid_id, hbm_id); > + dev_info(adev->dev, > + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, > memory training failed\n", > + socket_id, aid_id, hbm_id, fw_status); > > if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error)) > - dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at > boot time\n", > - socket_id, aid_id); > + dev_info(adev->dev, > + "socket: %d, aid: %d, fw_status: 0x%x, firmware load > failed at boot time\n", > + socket_id, aid_id, fw_status); > > if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error)) > - dev_info(adev->dev, "socket: %d, aid: %d, wafl link training > failed\n", > - socket_id, aid_id); > + dev_info(adev->dev, > + "socket: %d, aid: %d, fw_status: 0x%x, wafl link training > failed\n", > + socket_id, aid_id, fw_status); > > if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error)) > - dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training > failed\n", > - socket_id, aid_id); > + dev_info(adev->dev, > + "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training > failed\n", > + socket_id, aid_id, fw_status); > > if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error)) > - dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training > failed\n", > - socket_id, aid_id); > + dev_info(adev->dev, > + "socket: %d, aid: %d, fw_status: 0x%x, usr cp link > training failed\n", > + socket_id, aid_id, fw_status); > > if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error)) > - dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training > failed\n", > - socket_id, aid_id); > + dev_info(adev->dev, > + "socket: %d, aid: %d, fw_status: 0x%x, usr dp link > training failed\n", > + socket_id, aid_id, fw_status); > > if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error)) > - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm > memory test failed\n", > - socket_id, aid_id, hbm_id); > + dev_info(adev->dev, > + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm > memory test failed\n", > + socket_id, aid_id, hbm_id, fw_status); > > if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error)) > - dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist > test failed\n", > - socket_id, aid_id, hbm_id); > + dev_info(adev->dev, > + "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm > bist test failed\n", > + socket_id, aid_id, hbm_id, fw_status); > } > > -static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, > - u32 instance, u32 *boot_error) > +static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev, > + u32 instance) > { > - u32 reg_addr; > + u64 reg_addr; > u32 reg_data; > int retry_loop; > > @@ -4482,41 +4492,22 @@ static int > amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev, > > for (retry_loop = 0; retry_loop < > AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { > reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); > - if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == > AMDGPU_RAS_BOOT_STEADY_STATUS) { > - *boot_error = AMDGPU_RAS_BOOT_SUCEESS; > - return 0; > - } > - msleep(1); > - } > - > - /* The pattern for smn addressing in other SOC could be different from > - * the one for aqua_vanjaram. We should revisit the code if the pattern > - * is changed. In such case, replace the aqua_vanjaram implementation > - * with more common helper */ > - reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) + > - aqua_vanjaram_encode_ext_smn_addressing(instance); > - > - for (retry_loop = 0; retry_loop < > AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) { > - reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr); > - if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) { > - *boot_error = reg_data; > - return 0; > - } > - msleep(1); > + if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == > AMDGPU_RAS_BOOT_STEADY_STATUS) > + return false; > + else > + msleep(1); > } > > - *boot_error = reg_data; > - return -ETIME; > + return true; > } > > void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 > num_instances) { > - u32 boot_error = 0; > u32 i; > > for (i = 0; i < num_instances; i++) { > - if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error)) > - amdgpu_ras_boot_time_error_reporting(adev, i, > boot_error); > + if (amdgpu_ras_boot_error_detected(adev, i)) > + amdgpu_ras_boot_time_error_reporting(adev, i); > } > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 7021c4a66fb5..d0a125743d3a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -47,12 +47,10 @@ struct amdgpu_iv_entry; > #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) > AMDGPU_GET_REG_FIELD(x, 10, 8) > #define AMDGPU_RAS_GPU_ERR_AID_ID(x) > AMDGPU_GET_REG_FIELD(x, 12, 11) > #define AMDGPU_RAS_GPU_ERR_HBM_ID(x) > AMDGPU_GET_REG_FIELD(x, 14, 13) > -#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x) > AMDGPU_GET_REG_FIELD(x, 31, 31) > > -#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000 > +#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 100 > #define AMDGPU_RAS_BOOT_STEADY_STATUS 0xBA > #define AMDGPU_RAS_BOOT_STATUS_MASK 0xFF > -#define AMDGPU_RAS_BOOT_SUCEESS 0x80000000 > > #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) > /* position of instance value in sub_block_index of > -- > 2.17.1