[AMD Official Use Only - General] Reviewed-by: Stanley.Yang <Stanley.Yang@xxxxxxx> Regards, Stanley > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of > Hawking Zhang > Sent: Friday, October 14, 2022 2:19 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; > Yang, Stanley <Stanley.Yang@xxxxxxx> > Cc: Russell, Kent <Kent.Russell@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx> > Subject: [PATCH] drm/amdgpu: move convert_error_address out of umc_ras > > RAS error address translation algorithm is common across dGPU and A + A > platform as long as the SOC integrates the same generation of UMC IP. > > UMC RAS is managed by x86 MCA on A + A platform, umc_ras in GPU driver > is not initialized at all on A + A platform. In such a case, any umc_ras callback > implemented for dGPU config shouldn't be invoked from A + A specific > callback. > > The change moves convert_error_address out of dGPU umc_ras structure > and makes it shared between A + A and dGPU config. 
> > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 +++++++++++---- > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 --- > drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 7 +++---- > drivers/gpu/drm/amd/amdgpu/umc_v6_7.h | 4 +++- > 4 files changed, 17 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 75f1402101f4..ff92ea99d513 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -36,6 +36,7 @@ > #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" > #include "atom.h" > #include "amdgpu_reset.h" > +#include "umc_v6_7.h" > > #ifdef CONFIG_X86_MCE_AMD > #include <asm/mce.h> > @@ -2885,10 +2886,16 @@ static int amdgpu_bad_page_notifier(struct > notifier_block *nb, > /* > * Translate UMC channel address to Physical address > */ > - if (adev->umc.ras && > - adev->umc.ras->convert_ras_error_address) > - adev->umc.ras->convert_ras_error_address(adev, > - &err_data, m->addr, ch_inst, umc_inst); > + switch (adev->ip_versions[UMC_HWIP][0]) { > + case IP_VERSION(6, 7, 0): > + umc_v6_7_convert_error_address(adev, > + &err_data, m->addr, ch_inst, umc_inst); > + break; > + default: > + dev_warn(adev->dev, > + "UMC address to Physical address translation is not > supported\n"); > + return NOTIFY_DONE; > + } > > if (amdgpu_bad_page_threshold != 0) { > amdgpu_ras_add_bad_pages(adev, err_data.err_addr, diff -- > git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > index e46439274f3a..3629d8f292ef 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > @@ -51,9 +51,6 @@ struct amdgpu_umc_ras { > struct amdgpu_ras_block_object ras_block; > void (*err_cnt_init)(struct amdgpu_device *adev); > bool (*query_ras_poison_mode)(struct amdgpu_device *adev); > - void (*convert_ras_error_address)(struct amdgpu_device *adev, > - 
struct ras_err_data *err_data, uint64_t > err_addr, > - uint32_t ch_inst, uint32_t umc_inst); > void (*ecc_info_query_ras_error_count)(struct amdgpu_device > *adev, > void *ras_error_status); > void (*ecc_info_query_ras_error_address)(struct amdgpu_device > *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > index 5d5d031c9e7d..72fd963f178b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > @@ -187,9 +187,9 @@ static void > umc_v6_7_ecc_info_query_ras_error_count(struct amdgpu_device *adev, > } > } > > -static void umc_v6_7_convert_error_address(struct amdgpu_device *adev, > - struct ras_err_data *err_data, > uint64_t err_addr, > - uint32_t ch_inst, uint32_t umc_inst) > +void umc_v6_7_convert_error_address(struct amdgpu_device *adev, > + struct ras_err_data *err_data, uint64_t > err_addr, > + uint32_t ch_inst, uint32_t umc_inst) > { > uint32_t channel_index; > uint64_t soc_pa, retired_page, column; @@ -553,5 +553,4 @@ struct > amdgpu_umc_ras umc_v6_7_ras = { > .query_ras_poison_mode = umc_v6_7_query_ras_poison_mode, > .ecc_info_query_ras_error_count = > umc_v6_7_ecc_info_query_ras_error_count, > .ecc_info_query_ras_error_address = > umc_v6_7_ecc_info_query_ras_error_address, > - .convert_ras_error_address = umc_v6_7_convert_error_address, > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h > b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h > index fe41ed2f5945..105245d5b6e5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.h > @@ -71,5 +71,7 @@ extern const uint32_t > > umc_v6_7_channel_idx_tbl_second[UMC_V6_7_UMC_INSTANCE_NU > M][UMC_V6_7_CHANNEL_INSTANCE_NUM]; > extern const uint32_t > > umc_v6_7_channel_idx_tbl_first[UMC_V6_7_UMC_INSTANCE_NUM] > [UMC_V6_7_CHANNEL_INSTANCE_NUM]; > - > +void umc_v6_7_convert_error_address(struct amdgpu_device *adev, > + struct ras_err_data *err_data, uint64_t err_addr, > + uint32_t 
ch_inst, uint32_t > +umc_inst); > #endif > -- > 2.17.1