[AMD Official Use Only - General] Good point. gfx v11_0_3 doesn't support sram ecc. We should *not* let KFD report SRAMEDC cap to user mode - it will result to wrong shader compiler with ecc enabled for gfx v11_0_3. I'm leaning toward creating per IP feature matrix to deal with the cases that hardware just supports partial RAS features. Let me think about it more and get back to you. Regards, Hawking -----Original Message----- From: Yang, Stanley <Stanley.Yang@xxxxxxx> Sent: Monday, June 12, 2023 16:30 To: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Li, Candice <Candice.Li@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx> Subject: RE: [PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode is supported [AMD Official Use Only - General] > -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Sent: Sunday, June 11, 2023 6:46 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Yang, Stanley > <Stanley.Yang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Chai, Thomas > <YiPeng.Chai@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Subject: [PATCH 2/2] drm/amdgpu: Enable gfx v11_0_3 ras if poison mode > is supported > > GFX v11_0_3 ras needs to be enabled if poison mode is supported. > Driver doesn't need issue an feature enable call in gfx_v11_0 late > init phase. The ras late init call is already centralized to amdgpu_ras_late_init. > In addition, move poison_mode check out of common helper like > amdgpu_ras_is_supported and amdgpu_ras_is_feature_allowed ensure only > GFX RAS is enabled when poison mode is supported. > > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 49 > ++++++++----------------- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 26 > ------------- > 2 files changed, 16 insertions(+), 59 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index dd7cdc234d7e..35e70860d628 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -126,6 +126,7 @@ static bool > amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con, > uint64_t addr); static bool > amdgpu_ras_check_bad_page(struct amdgpu_device *adev, > uint64_t addr); > +static void amdgpu_ras_query_poison_mode(struct amdgpu_device *adev); > #ifdef CONFIG_X86_MCE_AMD > static void amdgpu_register_bad_pages_mca_notifier(struct > amdgpu_device *adev); struct mce_notifier_adev_list { @@ -757,16 > +758,6 @@ static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev, > return 0; > } > > -static int amdgpu_ras_check_feature_allowed(struct amdgpu_device *adev, > - struct ras_common_if *head) > -{ > - if (amdgpu_ras_is_feature_allowed(adev, head) || > - amdgpu_ras_is_poison_mode_supported(adev)) > - return 1; > - else > - return 0; > -} > - > /* wrapper of psp_ras_enable_features */ int > amdgpu_ras_feature_enable(struct amdgpu_device *adev, > struct ras_common_if *head, bool enable) @@ -797,7 > +788,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, > } > > /* Do not enable if it is not allowed. */ > - if (enable && !amdgpu_ras_check_feature_allowed(adev, head)) > + if (enable && !amdgpu_ras_is_feature_allowed(adev, head)) > goto out; > > /* Only enable ras feature operation handle on host side */ @@ - > 2420,9 +2411,9 @@ static bool amdgpu_ras_asic_supported(struct > amdgpu_device *adev) } > > /* > - * this is workaround for vega20 workstation sku, > - * force enable gfx ras, ignore vbios gfx ras flag > - * due to GC EDC can not write > + * Common helpers for device or IP specific RAS quirks including > + * a). Enable gfx ras on D16406 or D36002 board > + * b). Enable gfx ras in gfx_v11_0_3 if poison mode is supported > */ > static void amdgpu_ras_get_quirks(struct amdgpu_device *adev) { @@ - > 2431,10 +2422,16 @@ static void amdgpu_ras_get_quirks(struct > amdgpu_device *adev) > if (!ctx) > return; > > + /* Enable gfx ras on specific board */ > if (strnstr(ctx->vbios_version, "D16406", > sizeof(ctx->vbios_version)) || > - strnstr(ctx->vbios_version, "D36002", > - sizeof(ctx->vbios_version))) > + strnstr(ctx->vbios_version, "D36002", > + sizeof(ctx->vbios_version))) > + adev->ras_hw_enabled |= (1 << > AMDGPU_RAS_BLOCK__GFX); > + > + /* Enable gfx ras on gfx_v11_0_3 if poison mode is supported */ > + if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3) && > + amdgpu_ras_is_poison_mode_supported(adev)) > adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__GFX); } [Stanley]: For GC 11.0.3, it's better not expose AMDGPU_RAS_BLOCK__GFX to kfd, may be with below is more reasonable. { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index 8e4124dcb6e4..84030289ac96 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -1996,9 +1996,10 @@ int kfd_topology_add_device(struct kfd_dev *gpu) } /* kfd only concerns sram ecc on GFX and HBM ecc on UMC */ - dev->node_props.capability |= - ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ? - HSA_CAP_SRAM_EDCSUPPORTED : 0; + if (KFD_GC_VERSION(dev->gpu) != IP_VERSION(11, 0, 3)) + dev->node_props.capability |= + ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__GFX)) != 0) ? + HSA_CAP_SRAM_EDCSUPPORTED : 0; dev->node_props.capability |= ((dev->gpu->adev->ras_enabled & BIT(AMDGPU_RAS_BLOCK__UMC)) != 0) ? HSA_CAP_MEM_EDCSUPPORTED : 0; -- } Regards, Stanley > > @@ -2502,6 +2499,8 @@ static void amdgpu_ras_check_supported(struct > amdgpu_device *adev) > 1 << > AMDGPU_RAS_BLOCK__MMHUB); > } > > + amdgpu_ras_query_poison_mode(adev); > + > amdgpu_ras_get_quirks(adev); > > /* hw_supported needs to be aligned with RAS block mask. */ @@ - > 2659,8 +2658,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > goto release_con; > } > > - amdgpu_ras_query_poison_mode(adev); > - > if (amdgpu_ras_fs_init(adev)) { > r = -EINVAL; > goto release_con; > @@ -3115,26 +3112,12 @@ int amdgpu_ras_set_context(struct > amdgpu_device *adev, struct amdgpu_ras *ras_co int > amdgpu_ras_is_supported(struct amdgpu_device *adev, > unsigned int block) > { > - int ret = 0; > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > if (block >= AMDGPU_RAS_BLOCK_COUNT) > return 0; > > - ret = ras && (adev->ras_enabled & (1 << block)); > - > - /* For the special asic with mem ecc enabled but sram ecc > - * not enabled, even if the ras block is not supported on > - * .ras_enabled, if the asic supports poison mode and the > - * ras block has ras configuration, it can be considered > - * that the ras block supports ras function. > - */ > - if (!ret && > - amdgpu_ras_is_poison_mode_supported(adev) && > - amdgpu_ras_get_ras_block(adev, block, 0)) > - ret = 1; > - > - return ret; > + return (ras && (adev->ras_enabled & (1 << block))); > } > > int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) diff --git > a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > index 690e121d9dda..11e0c574b9f7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > @@ -4650,26 +4650,6 @@ static int gfx_v11_0_early_init(void *handle) > return gfx_v11_0_init_microcode(adev); } > > -static int gfx_v11_0_ras_late_init(void *handle) -{ > - struct amdgpu_device *adev = (struct amdgpu_device *)handle; > - struct ras_common_if *gfx_common_if; > - int ret; > - > - gfx_common_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL); > - if (!gfx_common_if) > - return -ENOMEM; > - > - gfx_common_if->block = AMDGPU_RAS_BLOCK__GFX; > - > - ret = amdgpu_ras_feature_enable(adev, gfx_common_if, true); > - if (ret) > - dev_warn(adev->dev, "Failed to enable gfx11 ras feature\n"); > - > - kfree(gfx_common_if); > - return 0; > -} > - > static int gfx_v11_0_late_init(void *handle) { > struct amdgpu_device *adev = (struct amdgpu_device *)handle; @@ > - > 4683,12 +4663,6 @@ static int gfx_v11_0_late_init(void *handle) > if (r) > return r; > > - if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) { > - r = gfx_v11_0_ras_late_init(handle); > - if (r) > - return r; > - } > - > return 0; > } > > -- > 2.17.1