[AMD Official Use Only - General] Please ignore my first comment. It doesn't necessarily associated with socket id in UMC MCA status log at this stage. Regards, Hawking -----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Zhang, Hawking Sent: Wednesday, January 17, 2024 19:12 To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx> Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6 [AMD Official Use Only - General] [AMD Official Use Only - General] + dev_info(adev->dev, + "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, PCC:%llu, UC:%llu, TCC:%llu\n", + mc_umc_status, Please also print out socket id for UMC MCA status. + dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n", + (msg == SMU_MSG_QueryValidMcaCeCount) ? + "SMU_MSG_QueryValidMcaCeCount" : "SMU_MSG_QueryValidMcaCount", + msg, + (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE", + *count); + This seems redundant or was added for debugging purpose. We can drop this print since there is log to cover failures. Regards, Hawking -----Original Message----- From: Chai, Thomas <YiPeng.Chai@xxxxxxx> Sent: Tuesday, January 16, 2024 16:21 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx> Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6 Add log info for umc_v12_0 and smu_v13_0_6. Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 11 +++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_events.c | 6 +++++- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 13 +++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 6423dca5b777..fa2168f1d3bf 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device *adev) bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status) { + dev_info(adev->dev, + "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, PCC:%llu, UC:%llu, TCC:%llu\n", + mc_umc_status, + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val), + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Poison), + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred), + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC), + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC), + REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) + ); + return (amdgpu_ras_is_poison_mode_supported(adev) && (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 11923964ce9a..51bb98db5d7a 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID; int user_gpu_id; - if (!p) + if (!p) { + dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", pasid); return; /* Presumably process exited. */ + } user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id); if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid) } } + dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n", + p->lead_thread->comm, pasid); rcu_read_unlock(); /* user application will handle SIGBUS signal */ diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 952a983da49a..cee8ee5afcb6 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct smu_context *smu, enum amdgpu_ ret = smu_cmn_send_smc_msg(smu, msg, count); if (ret) { + dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, ret:%d\n", + (msg == SMU_MSG_QueryValidMcaCeCount) ? + "SMU_MSG_QueryValidMcaCeCount" : "SMU_MSG_QueryValidMcaCount", + msg, + (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE", + ret); *count = 0; return ret; } + dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n", + (msg == SMU_MSG_QueryValidMcaCeCount) ? + "SMU_MSG_QueryValidMcaCeCount" : "SMU_MSG_QueryValidMcaCount", + msg, + (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE", + *count); + return 0; } -- 2.34.1