From: Hawking Zhang <Hawking.Zhang@xxxxxxx> It turns out STATUS_VALID_FLAG needs to be checked ahead of any other fields. ADDRESS_VALID_FLAG and ERR_INFO_VALID_FLAG only manage the ADDRESS and ERR_INFO fields respectively. The driver should continue to poll the ERR_CNT field even if ERR_INFO_VALID_FLAG is not set. Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5ae89602a116..64f80e8cbd63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -3164,7 +3164,8 @@ bool amdgpu_ras_inst_get_err_cnt_field(struct amdgpu_device *adev, if ((reg_entry->flags & AMDGPU_RAS_ERR_INFO_VALID) && !REG_GET_FIELD(err_status_hi_data, ERR_STATUS_HI, ERR_INFO_VALID_FLAG)) - return false; + /* keep the check here in case we need to refer to the result later */ + dev_dbg(adev->dev, "Invalid err_info field\n"); /* read err count */ *err_cnt = REG_GET_FIELD(err_status_hi_data, ERR_STATUS, ERR_CNT); @@ -3187,17 +3188,17 @@ void amdgpu_ras_inst_query_ras_error_count(struct amdgpu_device *adev, uint32_t i, j; for (i = 0; i < reg_list_size; i++) { + /* query memory_id from err_status_lo */ + if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i], + instance, &memory_id)) + continue; + /* query err_cnt from err_status_hi */ if (!amdgpu_ras_inst_get_err_cnt_field(adev, &reg_list[i], instance, &err_cnt) || !err_cnt) continue; - /* query memory_id from err_status_lo */ - if (!amdgpu_ras_inst_get_memory_id_field(adev, &reg_list[i], - instance, &memory_id)) - continue; - *err_count += err_cnt; /* log the errors */ -- 2.40.1