[AMD Official Use Only - General] The series is: Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > -----Original Message----- > From: Li, Candice <Candice.Li@xxxxxxx> > Sent: Monday, September 4, 2023 3:20 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Li, Candice <Candice.Li@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: [PATCH 3/3] drm/amdgpu: Add umc v12_0 ras functions > > Add umc v12_0 ras error querying. > > Signed-off-by: Candice Li <candice.li@xxxxxxx> > Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 16 +- > drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 256 > +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | > 56 ++++++ > 4 files changed, 327 insertions(+), 3 deletions(-) create mode 100644 > drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > create mode 100644 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h > > diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile > b/drivers/gpu/drm/amd/amdgpu/Makefile > index ce0188b329cdeb..adf5470aa81020 100644 > --- a/drivers/gpu/drm/amd/amdgpu/Makefile > +++ b/drivers/gpu/drm/amd/amdgpu/Makefile > @@ -121,7 +121,7 @@ amdgpu-y += \ > > # add UMC block > amdgpu-y += \ > - umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o > + umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o > umc_v12_0.o > > # add IH block > amdgpu-y += \ > diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > index 8447fcada8bb92..41e1759b5f1eaa 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c > @@ -56,6 +56,7 @@ > #include "umc_v6_1.h" > #include "umc_v6_0.h" > #include "umc_v6_7.h" > +#include "umc_v12_0.h" > #include "hdp_v4_0.h" > #include "mca_v3_0.h" > > @@ -737,7 +738,8 @@ static void gmc_v9_0_set_irq_funcs(struct > amdgpu_device *adev) > adev->gmc.vm_fault.funcs = &gmc_v9_0_irq_funcs; > > if (!amdgpu_sriov_vf(adev) && > - !adev->gmc.xgmi.connected_to_cpu) { > + 
!adev->gmc.xgmi.connected_to_cpu && > + !adev->gmc.is_app_apu) { > adev->gmc.ecc_irq.num_types = 1; > adev->gmc.ecc_irq.funcs = &gmc_v9_0_ecc_funcs; > } > @@ -1487,6 +1489,15 @@ static void gmc_v9_0_set_umc_funcs(struct > amdgpu_device *adev) > else > adev->umc.channel_idx_tbl = > &umc_v6_7_channel_idx_tbl_second[0][0]; > break; > + case IP_VERSION(12, 0, 0): > + adev->umc.max_ras_err_cnt_per_query = > UMC_V12_0_TOTAL_CHANNEL_NUM(adev); > + adev->umc.channel_inst_num = > UMC_V12_0_CHANNEL_INSTANCE_NUM; > + adev->umc.umc_inst_num = > UMC_V12_0_UMC_INSTANCE_NUM; > + adev->umc.node_inst_num /= > UMC_V12_0_UMC_INSTANCE_NUM; > + adev->umc.channel_offs = > UMC_V12_0_PER_CHANNEL_OFFSET; > + adev->umc.active_mask = adev->aid_mask; > + if (!adev->gmc.xgmi.connected_to_cpu && !adev- > >gmc.is_app_apu) > + adev->umc.ras = &umc_v12_0_ras; > default: > break; > } > @@ -2131,7 +2142,8 @@ static int gmc_v9_0_sw_init(void *handle) > return r; > > if (!amdgpu_sriov_vf(adev) && > - !adev->gmc.xgmi.connected_to_cpu) { > + !adev->gmc.xgmi.connected_to_cpu && > + !adev->gmc.is_app_apu) { > /* interrupt sent to DF. */ > r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_DF, 0, > &adev->gmc.ecc_irq); > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > new file mode 100644 > index 00000000000000..b3d6db14b351f1 > --- /dev/null > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > @@ -0,0 +1,256 @@ > +/* > + * Copyright 2023 Advanced Micro Devices, Inc. 
> + * > + * Permission is hereby granted, free of charge, to any person > +obtaining a > + * copy of this software and associated documentation files (the > +"Software"), > + * to deal in the Software without restriction, including without > +limitation > + * the rights to use, copy, modify, merge, publish, distribute, > +sublicense, > + * and/or sell copies of the Software, and to permit persons to whom > +the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be > +included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > +EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF > +MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO > EVENT > +SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, > +DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR > +OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE > USE > +OR > + * OTHER DEALINGS IN THE SOFTWARE. > + * > + */ > +#include "umc_v12_0.h" > +#include "amdgpu_ras.h" > +#include "amdgpu_umc.h" > +#include "amdgpu.h" > +#include "umc/umc_12_0_0_offset.h" > +#include "umc/umc_12_0_0_sh_mask.h" > + > +static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev, > + uint32_t node_inst, > + uint32_t umc_inst, > + uint32_t ch_inst) > +{ > + uint32_t index = umc_inst * adev->umc.channel_inst_num + ch_inst; > + uint64_t cross_node_offset = (node_inst == 0) ? 
0 : > +UMC_V12_0_CROSS_NODE_OFFSET; > + > + umc_inst = index / 4; > + ch_inst = index % 4; > + > + return adev->umc.channel_offs * ch_inst + UMC_V12_0_INST_DIST * > umc_inst + > + UMC_V12_0_NODE_DIST * node_inst + cross_node_offset; } > + > +static int umc_v12_0_reset_error_count_per_channel(struct amdgpu_device > *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > +{ > + uint64_t odecc_err_cnt_addr; > + uint64_t umc_reg_offset = > + get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); > + > + odecc_err_cnt_addr = > + SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt); > + > + /* clear error count */ > + WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, > + UMC_V12_0_CE_CNT_INIT); > + > + return 0; > +} > + > +static void umc_v12_0_reset_error_count(struct amdgpu_device *adev) { > + amdgpu_umc_loop_channels(adev, > + umc_v12_0_reset_error_count_per_channel, NULL); } > + > +static void umc_v12_0_query_correctable_error_count(struct amdgpu_device > *adev, > + uint64_t umc_reg_offset, > + unsigned long *error_count) > +{ > + uint64_t mc_umc_status; > + uint64_t mc_umc_status_addr; > + > + mc_umc_status_addr = > + SOC15_REG_OFFSET(UMC, 0, > regMCA_UMC_UMC0_MCUMC_STATUST0); > + > + /* Rely on MCUMC_STATUS for correctable error counter > + * MCUMC_STATUS is a 64 bit register > + */ > + mc_umc_status = > + RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * > 4); > + > + if (REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && > + REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1) > + *error_count += 1; > +} > + > +static void umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device > *adev, > + uint64_t umc_reg_offset, > + unsigned long *error_count) > { > + uint64_t mc_umc_status; > + uint64_t mc_umc_status_addr; > + > + mc_umc_status_addr = > + SOC15_REG_OFFSET(UMC, 0, > regMCA_UMC_UMC0_MCUMC_STATUST0); > + > + /* Check the MCUMC_STATUS. 
*/ > + mc_umc_status = > + RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * > 4); > + > + if ((REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) && > + (REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 || > + REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 || > + REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 || > + REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 || > + REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1)) > + *error_count += 1; > +} > + > +static int umc_v12_0_query_error_count(struct amdgpu_device *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > +{ > + struct ras_err_data *err_data = (struct ras_err_data *)data; > + uint64_t umc_reg_offset = > + get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); > + > + umc_v12_0_query_correctable_error_count(adev, > + umc_reg_offset, > + &(err_data->ce_count)); > + umc_v12_0_query_uncorrectable_error_count(adev, > + umc_reg_offset, > + &(err_data->ue_count)); > + > + return 0; > +} > + > +static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev, > + void *ras_error_status) > +{ > + amdgpu_umc_loop_channels(adev, > + umc_v12_0_query_error_count, ras_error_status); > + > + umc_v12_0_reset_error_count(adev); > +} > + > +static void umc_v12_0_convert_error_address(struct amdgpu_device *adev, > + struct ras_err_data *err_data, > uint64_t err_addr, > + uint32_t ch_inst, uint32_t umc_inst, > + uint32_t node_inst, uint64_t > mc_umc_status) { > + > +} > + > +static int umc_v12_0_query_error_address(struct amdgpu_device *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > +{ > + uint64_t mc_umc_status_addr; > + uint64_t mc_umc_status, err_addr; > + uint64_t mc_umc_addrt0; > + struct ras_err_data *err_data = (struct ras_err_data *)data; > + uint64_t umc_reg_offset = > + 
get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); > + > + mc_umc_status_addr = > + SOC15_REG_OFFSET(UMC, 0, > regMCA_UMC_UMC0_MCUMC_STATUST0); > + > + mc_umc_status = RREG64_PCIE_EXT((mc_umc_status_addr + > umc_reg_offset) > +* 4); > + > + if (mc_umc_status == 0) > + return 0; > + > + if (!err_data->err_addr) { > + /* clear umc status */ > + WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * > 4, 0x0ULL); > + > + return 0; > + } > + > + /* calculate error address if ue error is detected */ > + if (REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && > + REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 1 && > + REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == > +1) { > + > + mc_umc_addrt0 = > + SOC15_REG_OFFSET(UMC, 0, > regMCA_UMC_UMC0_MCUMC_ADDRT0); > + > + err_addr = RREG64_PCIE_EXT((mc_umc_addrt0 + > umc_reg_offset) * 4); > + > + err_addr = REG_GET_FIELD(err_addr, > MCA_UMC_UMC0_MCUMC_ADDRT0, > +ErrorAddr); > + > + umc_v12_0_convert_error_address(adev, err_data, err_addr, > + ch_inst, umc_inst, node_inst); > + } > + > + /* clear umc status */ > + WREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4, > 0x0ULL); > + > + return 0; > +} > + > +static void umc_v12_0_query_ras_error_address(struct amdgpu_device *adev, > + void *ras_error_status) > +{ > + amdgpu_umc_loop_channels(adev, > + umc_v12_0_query_error_address, ras_error_status); } > + > +static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > +{ > + uint32_t odecc_cnt_sel; > + uint64_t odecc_cnt_sel_addr, odecc_err_cnt_addr; > + uint64_t umc_reg_offset = > + get_umc_v12_0_reg_offset(adev, node_inst, umc_inst, ch_inst); > + > + odecc_cnt_sel_addr = > + SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccCntSel); > + odecc_err_cnt_addr = > + SOC15_REG_OFFSET(UMC, 0, regUMCCH0_OdEccErrCnt); > + > + odecc_cnt_sel = 
RREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4); > + > + /* set ce error interrupt type to APIC based interrupt */ > + odecc_cnt_sel = REG_SET_FIELD(odecc_cnt_sel, UMCCH0_OdEccCntSel, > + OdEccErrInt, 0x1); > + WREG32_PCIE_EXT((odecc_cnt_sel_addr + umc_reg_offset) * 4, odecc_cnt_sel); > + > + /* set error count to initial value */ > + WREG32_PCIE_EXT((odecc_err_cnt_addr + umc_reg_offset) * 4, UMC_V12_0_CE_CNT_INIT); > + > + return 0; > +} > + > +static void umc_v12_0_err_cnt_init(struct amdgpu_device *adev) { > + amdgpu_umc_loop_channels(adev, > + umc_v12_0_err_cnt_init_per_channel, NULL); } > + > +static bool umc_v12_0_query_ras_poison_mode(struct amdgpu_device *adev) > +{ > + /* > + * Force return true, because regUMCCH0_EccCtrl > + * is not accessible from host side > + */ > + return true; > +} > + > +const struct amdgpu_ras_block_hw_ops umc_v12_0_ras_hw_ops = { > + .query_ras_error_count = umc_v12_0_query_ras_error_count, > + .query_ras_error_address = umc_v12_0_query_ras_error_address, > +}; > + > +struct amdgpu_umc_ras umc_v12_0_ras = { > + .ras_block = { > + .hw_ops = &umc_v12_0_ras_hw_ops, > + }, > + .err_cnt_init = umc_v12_0_err_cnt_init, > + .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode, }; > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h > new file mode 100644 > index 00000000000000..2e63cc30766bc3 > --- /dev/null > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h > @@ -0,0 +1,56 @@ > +/* > + * Copyright 2023 Advanced Micro Devices, Inc.
> + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > + * OTHER DEALINGS IN THE SOFTWARE.
> + * > + */ > +#ifndef __UMC_V12_0_H__ > +#define __UMC_V12_0_H__ > + > +#include "soc15_common.h" > +#include "amdgpu.h" > + > +#define UMC_V12_0_NODE_DIST 0x40000000 > +#define UMC_V12_0_INST_DIST 0x40000 > + > +/* UMC register per channel offset */ > +#define UMC_V12_0_PER_CHANNEL_OFFSET 0x400 > + > +/* UMC cross node offset */ > +#define UMC_V12_0_CROSS_NODE_OFFSET 0x100000000 > + > +/* OdEccErrCnt max value */ > +#define UMC_V12_0_CE_CNT_MAX 0xffff > +/* umc ce interrupt threshold */ > +#define UMC_V12_0_CE_INT_THRESHOLD 0xffff > +/* umc ce count initial value */ > +#define UMC_V12_0_CE_CNT_INIT (UMC_V12_0_CE_CNT_MAX - UMC_V12_0_CE_INT_THRESHOLD) > + > +/* number of umc channel instance with memory map register access */ > +#define UMC_V12_0_CHANNEL_INSTANCE_NUM 8 > +/* number of umc instance with memory map register access */ > +#define UMC_V12_0_UMC_INSTANCE_NUM 4 > + > +/* Total channel instances for all available umc nodes */ > +#define UMC_V12_0_TOTAL_CHANNEL_NUM(adev) \ > + (UMC_V12_0_CHANNEL_INSTANCE_NUM * (adev)->gmc.num_umc) > + > +extern struct amdgpu_umc_ras umc_v12_0_ras; > + > +#endif > -- > 2.25.1