[AMD Official Use Only] Might be better call the function is_poison_mode_supported. Other than that the series is Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Wednesday, September 22, 2021 18:33 To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Clements, John <John.Clements@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx> Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Subject: [PATCH 2/3] drm/amdgpu: set poison supported flag for RAS (v2) Add RAS poison supported flag and tell PSP RAS TA about the info. v2: rename poison_mode to poison_supported, we can also disable poison mode even we support it. print poison_supported value if ras feature enablement fails. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 32 +++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++++ 3 files changed, 37 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index 7d09b28889af..c5cf84829ea8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -1442,9 +1442,9 @@ static int psp_ras_initialize(struct psp_context *psp) ras_cmd = (struct ta_ras_shared_memory *)psp->ras_context.context.mem_context.shared_buf; memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory)); - if (psp->adev->gmc.xgmi.connected_to_cpu) + if (amdgpu_ras_is_poison_supported(adev)) ras_cmd->ras_in_message.init_flags.poison_mode_en = 1; - else + if (!adev->gmc.xgmi.connected_to_cpu) ras_cmd->ras_in_message.init_flags.dgpu_mode = 1; ret = psp_ras_load(psp); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 912ea1f9fd04..5b362e944541 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -710,10 +710,10 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, if (!amdgpu_ras_intr_triggered()) { ret = psp_ras_enable_features(&adev->psp, info, enable); if (ret) { - dev_err(adev->dev, "ras %s %s failed %d\n", + dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", enable ? "enable":"disable", get_ras_block_str(head), - ret); + amdgpu_ras_is_poison_supported(adev), ret); goto out; } } @@ -2251,6 +2251,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); int r; + bool df_poison, umc_poison; if (con) return 0; @@ -2321,6 +2322,23 @@ int amdgpu_ras_init(struct amdgpu_device *adev) goto release_con; } + /* Init poison supported flag, the default value is false */ + if (adev->df.funcs && + adev->df.funcs->query_ras_poison_mode && + adev->umc.ras_funcs && + adev->umc.ras_funcs->query_ras_poison_mode) { + df_poison = + adev->df.funcs->query_ras_poison_mode(adev); + umc_poison = + adev->umc.ras_funcs->query_ras_poison_mode(adev); + /* Only poison is set in both DF and UMC, we can support it */ + if (df_poison && umc_poison) + con->poison_supported = true; + else if (df_poison != umc_poison) + dev_warn(adev->dev, "Poison setting is inconsistent in DF/UMC(%d:%d)!\n", + df_poison, umc_poison); + } + if (amdgpu_ras_fs_init(adev)) { r = -EINVAL; goto release_con; @@ -2364,6 +2382,16 @@ static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, return 0; } +bool amdgpu_ras_is_poison_supported(struct amdgpu_device *adev) { + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!con) + return false; + + return con->poison_supported; +} + /* helper function to handle common stuff in ip late init phase */ int amdgpu_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ec42e9873aaa..d6377e1ad20a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -352,6 +352,9 @@ struct amdgpu_ras { /* disable ras error count harvest in recovery */ bool disable_ras_err_cnt_harvest; + /* is poison mode supported */ + bool poison_supported; + /* RAS count errors delayed work */ struct delayed_work ras_counte_delay_work; atomic_t ras_ue_count; @@ -649,4 +652,6 @@ int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev); const char *get_ras_block_str(struct ras_common_if *ras_block); +bool amdgpu_ras_is_poison_supported(struct amdgpu_device *adev); + #endif -- 2.17.1