[AMD Official Use Only - General] The series is: Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Hawking > Zhang > Sent: Tuesday, January 2, 2024 11:45 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Yang, > Stanley <Stanley.Yang@xxxxxxx>; Wang, Yang(Kevin) > <KevinYang.Wang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>; Li, > Candice <Candice.Li@xxxxxxx> > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Ma, Le > <Le.Ma@xxxxxxx>; Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx> > Subject: [PATCH 3/3] drm/amdgpu: Centralize ras cap query to > amdgpu_ras_check_supported > > Move ras capability check to amdgpu_ras_check_supported. > Driver will query ras capability through psp interface, or vbios interface, or specific > ip callbacks. > > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 170 +++++++++++++----------- > 1 file changed, 93 insertions(+), 77 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 5f302b7693b3..72b6e41329b0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -39,6 +39,7 @@ > #include "nbio_v7_9.h" > #include "atom.h" > #include "amdgpu_reset.h" > +#include "amdgpu_psp.h" > > #ifdef CONFIG_X86_MCE_AMD > #include <asm/mce.h> > @@ -2680,6 +2681,87 @@ static void amdgpu_ras_get_quirks(struct > amdgpu_device *adev) > adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } > > +/* Query ras capablity via atomfirmware interface */ static void > +amdgpu_ras_query_ras_capablity_from_vbios(struct amdgpu_device *adev) { > + /* mem_ecc cap */ > + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { > + dev_info(adev->dev, "MEM ECC is active.\n"); > + adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__UMC | > + 1 << AMDGPU_RAS_BLOCK__DF); > + } 
else { > + dev_info(adev->dev, "MEM ECC is not presented.\n"); > + } > + > + /* sram_ecc cap */ > + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { > + dev_info(adev->dev, "SRAM ECC is active.\n"); > + if (!amdgpu_sriov_vf(adev)) > + adev->ras_hw_enabled |= ~(1 << > AMDGPU_RAS_BLOCK__UMC | > + 1 << > AMDGPU_RAS_BLOCK__DF); > + else > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__PCIE_BIF | > + 1 << > AMDGPU_RAS_BLOCK__SDMA | > + 1 << > AMDGPU_RAS_BLOCK__GFX); > + > + /* > + * VCN/JPEG RAS can be supported on both bare metal and > + * SRIOV environment > + */ > + if (amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(2, 6, > 0) || > + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, > 0) || > + amdgpu_ip_version(adev, VCN_HWIP, 0) == IP_VERSION(4, 0, > 3)) > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__VCN | > + 1 << > AMDGPU_RAS_BLOCK__JPEG); > + else > + adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__VCN | > + 1 << > AMDGPU_RAS_BLOCK__JPEG); > + > + /* > + * XGMI RAS is not supported if xgmi num physical nodes > + * is zero > + */ > + if (!adev->gmc.xgmi.num_physical_nodes) > + adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__XGMI_WAFL); > + } else { > + dev_info(adev->dev, "SRAM ECC is not presented.\n"); > + } > +} > + > +/* Query poison mode from umc/df IP callbacks */ static void > +amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + bool df_poison, umc_poison; > + > + /* poison setting is useless on SRIOV guest */ > + if (amdgpu_sriov_vf(adev) || !con) > + return; > + > + /* Init poison supported flag, the default value is false */ > + if (adev->gmc.xgmi.connected_to_cpu || > + adev->gmc.is_app_apu) { > + /* enabled by default when GPU is connected to CPU */ > + con->poison_supported = true; > + } else if (adev->df.funcs && > + adev->df.funcs->query_ras_poison_mode && > + adev->umc.ras && > + adev->umc.ras->query_ras_poison_mode) { > + df_poison = > + 
adev->df.funcs->query_ras_poison_mode(adev); > + umc_poison = > + adev->umc.ras->query_ras_poison_mode(adev); > + > + /* Only poison is set in both DF and UMC, we can support it */ > + if (df_poison && umc_poison) > + con->poison_supported = true; > + else if (df_poison != umc_poison) > + dev_warn(adev->dev, > + "Poison setting is inconsistent in > DF/UMC(%d:%d)!\n", > + df_poison, umc_poison); > + } > +} > + > /* > * check hardware's ras ability which will be saved in hw_supported. > * if hardware does not support ras, we can skip some ras initializtion and @@ - > 2696,49 +2778,13 @@ static void amdgpu_ras_check_supported(struct > amdgpu_device *adev) > if (!amdgpu_ras_asic_supported(adev)) > return; > > - if (!adev->gmc.xgmi.connected_to_cpu && !adev- > >gmc.is_app_apu) { > - if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { > - dev_info(adev->dev, "MEM ECC is active.\n"); > - adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__UMC | > - 1 << > AMDGPU_RAS_BLOCK__DF); > - } else { > - dev_info(adev->dev, "MEM ECC is not presented.\n"); > - } > - > - if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { > - dev_info(adev->dev, "SRAM ECC is active.\n"); > - if (!amdgpu_sriov_vf(adev)) > - adev->ras_hw_enabled |= ~(1 << > AMDGPU_RAS_BLOCK__UMC | > - 1 << > AMDGPU_RAS_BLOCK__DF); > - else > - adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__PCIE_BIF | > - 1 << > AMDGPU_RAS_BLOCK__SDMA | > - 1 << > AMDGPU_RAS_BLOCK__GFX); > - > - /* VCN/JPEG RAS can be supported on both bare metal > and > - * SRIOV environment > - */ > - if (amdgpu_ip_version(adev, VCN_HWIP, 0) == > - IP_VERSION(2, 6, 0) || > - amdgpu_ip_version(adev, VCN_HWIP, 0) == > - IP_VERSION(4, 0, 0) || > - amdgpu_ip_version(adev, VCN_HWIP, 0) == > - IP_VERSION(4, 0, 3)) > - adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__VCN | > - 1 << > AMDGPU_RAS_BLOCK__JPEG); > - else > - adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__VCN | > - 1 << > AMDGPU_RAS_BLOCK__JPEG); > + /* query ras capability from psp */ > + 
if (amdgpu_psp_get_ras_capability(&adev->psp)) > + goto init_ras_enabled_flag; > > - /* > - * XGMI RAS is not supported if xgmi num physical nodes > - * is zero > - */ > - if (!adev->gmc.xgmi.num_physical_nodes) > - adev->ras_hw_enabled &= ~(1 << > AMDGPU_RAS_BLOCK__XGMI_WAFL); > - } else { > - dev_info(adev->dev, "SRAM ECC is not presented.\n"); > - } > + /* query ras capablity from bios */ > + if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) { > + amdgpu_ras_query_ras_capablity_from_vbios(adev); > } else { > /* driver only manages a few IP blocks RAS feature > * when GPU is connected cpu through XGMI */ @@ -2747,8 > +2793,13 @@ static void amdgpu_ras_check_supported(struct amdgpu_device > *adev) > 1 << > AMDGPU_RAS_BLOCK__MMHUB); > } > > + /* apply asic specific settings (vega20 only for now) */ > amdgpu_ras_get_quirks(adev); > > + /* query poison mode from umc/df ip callback */ > + amdgpu_ras_query_poison_mode(adev); > + > +init_ras_enabled_flag: > /* hw_supported needs to be aligned with RAS block mask. 
*/ > adev->ras_hw_enabled &= AMDGPU_RAS_BLOCK_MASK; > > @@ -2781,39 +2832,6 @@ static void amdgpu_ras_counte_dw(struct > work_struct *work) > pm_runtime_put_autosuspend(dev->dev); > } > > -static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev) -{ > - struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > - bool df_poison, umc_poison; > - > - /* poison setting is useless on SRIOV guest */ > - if (amdgpu_sriov_vf(adev) || !con) > - return; > - > - /* Init poison supported flag, the default value is false */ > - if (adev->gmc.xgmi.connected_to_cpu || > - adev->gmc.is_app_apu) { > - /* enabled by default when GPU is connected to CPU */ > - con->poison_supported = true; > - } else if (adev->df.funcs && > - adev->df.funcs->query_ras_poison_mode && > - adev->umc.ras && > - adev->umc.ras->query_ras_poison_mode) { > - df_poison = > - adev->df.funcs->query_ras_poison_mode(adev); > - umc_poison = > - adev->umc.ras->query_ras_poison_mode(adev); > - > - /* Only poison is set in both DF and UMC, we can support it */ > - if (df_poison && umc_poison) > - con->poison_supported = true; > - else if (df_poison != umc_poison) > - dev_warn(adev->dev, > - "Poison setting is inconsistent in > DF/UMC(%d:%d)!\n", > - df_poison, umc_poison); > - } > -} > - > static int amdgpu_get_ras_schema(struct amdgpu_device *adev) { > return amdgpu_ras_is_poison_mode_supported(adev) ? > AMDGPU_RAS_ERROR__POISON : 0 | @@ -2918,8 +2936,6 @@ int > amdgpu_ras_init(struct amdgpu_device *adev) > goto release_con; > } > > - amdgpu_ras_query_poison_mode(adev); > - > /* Get RAS schema for particular SOC */ > con->schema = amdgpu_get_ras_schema(adev); > > -- > 2.17.1