[AMD Official Use Only - AMD Internal Distribution Only] Shall we check (deferred || poison) bit to identify deferred errors during either poison creation or poison consumption events, and differentiate them from CE? Regards, Hawking -----Original Message----- From: Liu, Xiang(Dean) <Xiang.Liu@xxxxxxx> Sent: Wednesday, February 26, 2025 11:49 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Liu, Xiang(Dean) <Xiang.Liu@xxxxxxx> Subject: [PATCH] drm/amdgpu: Decode deferred error type in aca bank parser In the case of poison consumption's inband log, the error type need to be specified by checking the poison bit of status register. Signed-off-by: Xiang Liu <xiang.liu@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h | 5 +++++ drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c | 4 ++-- 7 files changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h index b84a3489b116..a8c8457e4160 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h @@ -76,6 +76,11 @@ struct ras_query_context; #define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */ #define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */ +#define ACA_BANK_ERR_CE_DE_DECODE(bank) \ + (ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) ? \ + ACA_ERROR_TYPE_DEFERRED : \ + ACA_ERROR_TYPE_CE) + enum aca_reg_idx { ACA_REG_IDX_CTL = 0, ACA_REG_IDX_STATUS = 1, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 49da137d42c9..c313c2cf6969 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -1169,8 +1169,8 @@ static int xgmi_v6_4_0_aca_bank_parser(struct aca_handle *handle, struct aca_ban break; case ACA_SMU_TYPE_CE: count = ext_error_code == 6 ? count : 0ULL; - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, count); + bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank); + ret = aca_error_cache_log_bank_error(handle, &info, +bank->aca_err_type, count); break; default: return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 50eb856249d5..87add6274b98 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -883,9 +883,10 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle, ACA_ERROR_TYPE_UE, 1ULL); break; case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; + bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank); ret = aca_error_cache_log_bank_error(handle, &info, - ACA_ERROR_TYPE_CE, ACA_REG__MISC0__ERRCNT(misc0)); + bank->aca_err_type, + ACA_REG__MISC0__ERRCNT(misc0)); break; default: return -EINVAL; diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c index b8f06e9c9e62..1fcab0ef21c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c @@ -1332,8 +1332,8 @@ static int jpeg_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_ban 1ULL); break; case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, + bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank); + ret = aca_error_cache_log_bank_error(handle, &info, +bank->aca_err_type, ACA_REG__MISC0__ERRCNT(misc0)); break; default: diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c index 58d22f0d5a68..a54e7b929295 100644 --- a/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c +++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v1_8.c @@ -751,8 +751,8 @@ static int mmhub_v1_8_aca_bank_parser(struct aca_handle *handle, struct aca_bank 1ULL); break; case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, + bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank); + ret = aca_error_cache_log_bank_error(handle, &info, +bank->aca_err_type, ACA_REG__MISC0__ERRCNT(misc0)); break; default: diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index 4fa688e00f5e..d5d3298adfb0 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -2494,8 +2494,8 @@ static int sdma_v4_4_2_aca_bank_parser(struct aca_handle *handle, struct aca_ban 1ULL); break; case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, + bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank); + ret = aca_error_cache_log_bank_error(handle, &info, +bank->aca_err_type, ACA_REG__MISC0__ERRCNT(misc0)); break; default: diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c index dda5ee187948..5971715495a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c +++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0_3.c @@ -1930,8 +1930,8 @@ static int vcn_v4_0_3_aca_bank_parser(struct aca_handle *handle, struct aca_bank 1ULL); break; case ACA_SMU_TYPE_CE: - bank->aca_err_type = ACA_ERROR_TYPE_CE; - ret = aca_error_cache_log_bank_error(handle, &info, ACA_ERROR_TYPE_CE, + bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank); + ret = aca_error_cache_log_bank_error(handle, &info, +bank->aca_err_type, ACA_REG__MISC0__ERRCNT(misc0)); break; default: -- 2.34.1