[AMD Official Use Only - General] > -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Sent: Sunday, June 11, 2023 6:46 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Yang, Stanley <Stanley.Yang@xxxxxxx>; > Li, Candice <Candice.Li@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>; > Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Subject: [PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode is > supported > > GFX v11_0_3 ras needs to be enabled if poison mode is supported. Driver > doesn't need issue an feature enable call in gfx_v11_0 late init phase. The ras > late init call is already centralized to amdgpu_ras_late_init. > In addition, move poison_mode check out of common helper like > amdgpu_ras_is_supported and amdgpu_ras_is_feature_allowed ensure only > GFX RAS is enabled when poison mode is supported. > > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 ++++++++----------------- > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 26 ------------- > 2 files changed, 16 insertions(+), 59 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index dd7cdc234d7e..35e70860d628 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -126,6 +126,7 @@ static bool > amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, > uint64_t addr); > static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, > uint64_t addr); > +static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev); > #ifdef CONFIG_X86_MCE_AMD > static void amdgpu_register_bad_pages_mca_notifier(struct amdgpu_device > *adev); struct mce_notifier_adev_list { @@ -757,16 +758,6 @@ static int > __amdgpu_ras_feature_enable(struct amdgpu_device *adev, > return 0; > } > > -static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev, > - struct ras_common_if *head) > -{ > - if (amdgpu_ras_is_feature_allowed(adev, head) || > - amdgpu_ras_is_poison_mode_supported(adev)) > - return 1; > - else > - return 0; > -} > - > /* wrapper of psp_ras_enable_features */ int > amdgpu_ras_feature_enable(struct amdgpu_device *adev, > struct ras_common_if *head, bool enable) @@ -797,7 +788,7 > @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, > } > > /* Do not enable if it is not allowed. */ > - if (enable && !amdgpu_ras_check_feature_allowed(adev, head)) > + if (enable && !amdgpu_ras_is_feature_allowed(adev, head)) > goto out; > > /* Only enable ras feature operation handle on host side */ @@ - > 2420,9 +2411,9 @@ static bool amdgpu_ras_asic_supported(struct > amdgpu_device *adev) } > > /* > - * this is workaround for vega20 workstation sku, > - * force enable gfx ras, ignore vbios gfx ras flag > - * due to GC EDC can not write > + * Common helpers for device or IP specific RAS quirks including > + * a). Enable gfx ras on D16406 or D36002 board > + * b). Enable gfx ras in gfx_v11_0_3 if poison mode is supported > */ > static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) { @@ - > 2431,10 +2422,16 @@ static void amdgpu_ras_get_quirks(struct > amdgpu_device *adev) > if (!ctx) > return; > > + /* Enable gfx ras on specific board */ > if (strnstr(ctx->vbios_version, "D16406", > sizeof(ctx->vbios_version)) || > - strnstr(ctx->vbios_version, "D36002", > - sizeof(ctx->vbios_version))) > + strnstr(ctx->vbios_version, "D36002", > + sizeof(ctx->vbios_version))) > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__GFX); > + > + /* Enable gfx ras on gfx_v11_0_3 if poison mode is supported */ > + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3) && > + amdgpu_ras_is_poison_mode_supported(adev)) > adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__GFX); } [Stanley]: For GC 11.0.3, it's better not expose AMDGPU_RAS_BLOCK__GFX to kfd, may be with below is more reasonable. { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 8e4124dcb6e4..84030289ac96 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1996,9 +1996,10 @@ int kfd_topology_add_device(struct kfd_dev *gpu) } /* kfd only concerns sram ecc on GFX and HBM ecc on UMC */ - dev->node_props.capability |= - ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ? - HSA_CAP_SRAM_EDCSUPPORTED : 0; + if (KFD_GC_VERSION(dev->gpu) != IP_VERSION(11, 0, 3)) + dev->node_props.capability |= + ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ? + HSA_CAP_SRAM_EDCSUPPORTED : 0; dev->node_props.capability |= ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ? HSA_CAP_MEM_EDCSUPPORTED : 0; -- } Regards, Stanley > > @@ -2502,6 +2499,8 @@ static void amdgpu_ras_check_supported(struct > amdgpu_device *adev) > 1 << > AMDGPU_RAS_BLOCK__MMHUB); > } > > + amdgpu_ras_query_poison_mode(adev); > + > amdgpu_ras_get_quirks(adev); > > /* hw_supported needs to be aligned with RAS block mask. */ @@ - > 2659,8 +2658,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > goto release_con; > } > > - amdgpu_ras_query_poison_mode(adev); > - > if (amdgpu_ras_fs_init(adev)) { > r = -EINVAL; > goto release_con; > @@ -3115,26 +3112,12 @@ int amdgpu_ras_set_context(struct > amdgpu_device *adev, struct amdgpu_ras *ras_co int > amdgpu_ras_is_supported(struct amdgpu_device *adev, > unsigned int block) > { > - int ret = 0; > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > if (block >= AMDGPU_RAS_BLOCK_COUNT) > return 0; > > - ret = ras && (adev->ras_enabled & (1 << block)); > - > - /* For the special asic with mem ecc enabled but sram ecc > - * not enabled, even if the ras block is not supported on > - * .ras_enabled, if the asic supports poison mode and the > - * ras block has ras configuration, it can be considered > - * that the ras block supports ras function. > - */ > - if (!ret && > - amdgpu_ras_is_poison_mode_supported(adev) && > - amdgpu_ras_get_ras_block(adev, block, 0)) > - ret = 1; > - > - return ret; > + return (ras && (adev->ras_enabled & (1 << block))); > } > > int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) diff --git > a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > index 690e121d9dda..11e0c574b9f7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > @@ -4650,26 +4650,6 @@ static int gfx_v11_0_early_init(void *handle) > return gfx_v11_0_init_microcode(adev); } > > -static int gfx_v11_0_ras_late_init(void *handle) -{ > - struct amdgpu_device *adev = (struct amdgpu_device *)handle; > - struct ras_common_if *gfx_common_if; > - int ret; > - > - gfx_common_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL); > - if (!gfx_common_if) > - return -ENOMEM; > - > - gfx_common_if->block = AMDGPU_RAS_BLOCK__GFX; > - > - ret = amdgpu_ras_feature_enable(adev, gfx_common_if, true); > - if (ret) > - dev_warn(adev->dev, "Failed to enable gfx11 ras feature\n"); > - > - kfree(gfx_common_if); > - return 0; > -} > - > static int gfx_v11_0_late_init(void *handle) { > struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ - > 4683,12 +4663,6 @@ static int gfx_v11_0_late_init(void *handle) > if (r) > return r; > > - if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) { > - r = gfx_v11_0_ras_late_init(handle); > - if (r) > - return r; > - } > - > return 0; > } > > -- > 2.17.1