[AMD Official Use Only - General] ----------------- Best Regards, Thomas -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Monday, April 3, 2023 11:45 AM To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx> Subject: RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in umc_v8_10 [AMD Official Use Only - General] > -----Original Message----- > From: Chai, Thomas <YiPeng.Chai@xxxxxxx> > Sent: Monday, April 3, 2023 9:59 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice > <Candice.Li@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, > Thomas <YiPeng.Chai@xxxxxxx> > Subject: [PATCH 1/2] drm/amdgpu: optimize redundant code in umc_v8_10 > > Optimize redundant code in umc_v8_10 > > Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 31 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 7 + > drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 197 > +++++++++--------------- > 3 files changed, 115 insertions(+), 120 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > index 9e2e97207e53..734442315cf6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > @@ -302,3 +302,34 @@ void amdgpu_umc_fill_error_record(struct > ras_err_data *err_data, > > err_data->err_addr_cnt++; > } > + > +int amdgpu_umc_scan_all_umc_channels(struct amdgpu_device *adev, > + umc_func func, void *data) > +{ > + uint32_t node_inst = 0; > + uint32_t umc_inst = 0; > + uint32_t ch_inst = 0; > + int ret = 0; > + > + if (adev->umc.node_inst_num) { > + LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, > ch_inst) { > + ret = func(adev, node_inst, umc_inst, ch_inst, data); > + if (ret) { > + dev_err(adev->dev, "Node %d umc %d ch %d > func returns %d\n", > + node_inst, umc_inst, ch_inst, ret); > + return ret; > + } > + } > + } else { > + LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) { >[Tao] for ASIC which doesn't support node, can we set its node_inst_num to 1 and retire the macro LOOP_UMC_INST_AND_CH? [Thomas] I am afraid not. " #define LOOP_UMC_NODE_INST(node_inst) \ for_each_set_bit((node_inst), &(adev->umc.active_mask), adev->umc.node_inst_num) " The node instance loop of LOOP_UMC_EACH_NODE_INST_AND_CH supports node harvest, so node_inst_num is not the real node instance number. > + ret = func(adev, 0, umc_inst, ch_inst, data); > + if (ret) { > + dev_err(adev->dev, "Umc %d ch %d func > returns %d\n", > + umc_inst, ch_inst, ret); > + return ret; > + } > + } > + } > + > + return 0; > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > index d7f1229ff11f..f279c8057f96 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > @@ -47,6 +47,10 @@ > #define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > \ > LOOP_UMC_NODE_INST((node_inst)) > LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst)) > > + > +typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst, > + uint32_t umc_inst, uint32_t ch_inst, void *data); > + > struct amdgpu_umc_ras { > struct amdgpu_ras_block_object ras_block; > void (*err_cnt_init)(struct amdgpu_device *adev); @@ -104,4 +108,7 > @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, > struct amdgpu_iv_entry *entry); > int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, > uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst); > + > +int amdgpu_umc_scan_all_umc_channels(struct amdgpu_device *adev, > + umc_func func, void *data); > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c > b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c > index fb55e8cb9967..6dff313ac04c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c > @@ -76,10 +76,13 @@ static inline uint32_t > get_umc_v8_10_reg_offset(struct amdgpu_device *adev, > UMC_8_NODE_DIST * node_inst; > } > > -static void umc_v8_10_clear_error_count_per_channel(struct > amdgpu_device *adev, > - uint32_t umc_reg_offset) > +static int umc_v8_10_clear_error_count_per_channel(struct > +amdgpu_device > *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > { > uint32_t ecc_err_cnt_addr; > + uint32_t umc_reg_offset = > + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); > > ecc_err_cnt_addr = > SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCnt); @@ > -87,24 +90,14 @@ static void > umc_v8_10_clear_error_count_per_channel(struct amdgpu_device *adev, > /* clear error count */ > WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, > UMC_V8_10_CE_CNT_INIT); > + > + return 0; > } > > static void umc_v8_10_clear_error_count(struct amdgpu_device *adev) { > - uint32_t node_inst = 0; > - uint32_t umc_inst = 0; > - uint32_t ch_inst = 0; > - uint32_t umc_reg_offset = 0; > - > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > { > - umc_reg_offset = get_umc_v8_10_reg_offset(adev, > - node_inst, > - umc_inst, > - ch_inst); > - > - umc_v8_10_clear_error_count_per_channel(adev, > - umc_reg_offset); > - } > + amdgpu_umc_scan_all_umc_channels(adev, > + umc_v8_10_clear_error_count_per_channel, NULL); > } > > static void umc_v8_10_query_correctable_error_count(struct > amdgpu_device *adev, @@ -147,29 +140,29 @@ static void > umc_v8_10_query_uncorrectable_error_count(struct amdgpu_device *adev > *error_count += 1; > } > > +static int umc_v8_10_query_ecc_error_count(struct amdgpu_device *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > +{ > + struct ras_err_data *err_data = (struct ras_err_data *)data; > + uint32_t umc_reg_offset = > + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); > + > + umc_v8_10_query_correctable_error_count(adev, > + umc_reg_offset, > + &(err_data->ce_count)); > + umc_v8_10_query_uncorrectable_error_count(adev, > + umc_reg_offset, > + &(err_data->ue_count)); > + > + return 0; > +} > + > static void umc_v8_10_query_ras_error_count(struct amdgpu_device *adev, > void *ras_error_status) > { > - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > - > - uint32_t node_inst = 0; > - uint32_t umc_inst = 0; > - uint32_t ch_inst = 0; > - uint32_t umc_reg_offset = 0; > - > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > { > - umc_reg_offset = get_umc_v8_10_reg_offset(adev, > - node_inst, > - umc_inst, > - ch_inst); > - > - umc_v8_10_query_correctable_error_count(adev, > - umc_reg_offset, > - &(err_data->ce_count)); > - umc_v8_10_query_uncorrectable_error_count(adev, > - umc_reg_offset, > - &(err_data->ue_count)); > - } > + amdgpu_umc_scan_all_umc_channels(adev, > + umc_v8_10_query_ecc_error_count, ras_error_status); > > umc_v8_10_clear_error_count(adev); > } > @@ -248,28 +241,28 @@ static void > umc_v8_10_convert_error_address(struct > amdgpu_device *adev, > } > } > > -static void umc_v8_10_query_error_address(struct amdgpu_device *adev, > - struct ras_err_data *err_data, > - uint32_t umc_reg_offset, > - uint32_t node_inst, > - uint32_t ch_inst, > - uint32_t umc_inst) > +static int umc_v8_10_query_error_address(struct amdgpu_device *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > { > uint64_t mc_umc_status_addr; > uint64_t mc_umc_status, err_addr; > uint64_t mc_umc_addrt0; > + struct ras_err_data *err_data = (struct ras_err_data *)data; > + uint32_t umc_reg_offset = > + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); > > mc_umc_status_addr = > SOC15_REG_OFFSET(UMC, 0, > regMCA_UMC_UMC0_MCUMC_STATUST0); > mc_umc_status = RREG64_PCIE((mc_umc_status_addr + > umc_reg_offset) * 4); > > if (mc_umc_status == 0) > - return; > + return 0; > > if (!err_data->err_addr) { > /* clear umc status */ > WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); > - return; > + return 0; > } > > /* calculate error address if ue error is detected */ @@ -287,37 > +280,25 @@ static void umc_v8_10_query_error_address(struct > amdgpu_device *adev, > > /* clear umc status */ > WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); > + > + return 0; > } > > static void umc_v8_10_query_ras_error_address(struct amdgpu_device *adev, > void *ras_error_status) > { > - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > - uint32_t node_inst = 0; > - uint32_t umc_inst = 0; > - uint32_t ch_inst = 0; > - uint32_t umc_reg_offset = 0; > - > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > { > - umc_reg_offset = get_umc_v8_10_reg_offset(adev, > - node_inst, > - umc_inst, > - ch_inst); > - > - umc_v8_10_query_error_address(adev, > - err_data, > - umc_reg_offset, > - node_inst, > - ch_inst, > - umc_inst); > - } > + amdgpu_umc_scan_all_umc_channels(adev, > + umc_v8_10_query_error_address, ras_error_status); > } > > -static void umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev, > - uint32_t umc_reg_offset) > +static int umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > { > uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; > uint32_t ecc_err_cnt_addr; > + uint32_t umc_reg_offset = > + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); > > ecc_err_cnt_sel_addr = > SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCntSel); @@ -332,23 > +313,14 @@ static void umc_v8_10_err_cnt_init_per_channel(struct > amdgpu_device *adev, > WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, > ecc_err_cnt_sel); > /* set error count to initial value */ > WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, > UMC_V8_10_CE_CNT_INIT); > + > + return 0; > } > > static void umc_v8_10_err_cnt_init(struct amdgpu_device *adev) { > - uint32_t node_inst = 0; > - uint32_t umc_inst = 0; > - uint32_t ch_inst = 0; > - uint32_t umc_reg_offset = 0; > - > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > { > - umc_reg_offset = get_umc_v8_10_reg_offset(adev, > - node_inst, > - umc_inst, > - ch_inst); > - > - umc_v8_10_err_cnt_init_per_channel(adev, umc_reg_offset); > - } > + amdgpu_umc_scan_all_umc_channels(adev, > + umc_v8_10_err_cnt_init_per_channel, NULL); > } > > static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device > *adev) @@ -406,37 +378,35 @@ static void > umc_v8_10_ecc_info_query_uncorrectable_error_count(struct amdgpu_dev > } > } > > +static int umc_v8_10_ecc_info_query_ecc_error_count(struct > +amdgpu_device > *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > +{ > + struct ras_err_data *err_data = (struct ras_err_data *)data; > + > + umc_v8_10_ecc_info_query_correctable_error_count(adev, > + node_inst, umc_inst, ch_inst, > + &(err_data->ce_count)); > + umc_v8_10_ecc_info_query_uncorrectable_error_count(adev, > + node_inst, umc_inst, ch_inst, > + &(err_data->ue_count)); > + return 0; > +} > + > static void umc_v8_10_ecc_info_query_ras_error_count(struct > amdgpu_device *adev, > void *ras_error_status) > { > - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > - > - uint32_t node_inst = 0; > - uint32_t umc_inst = 0; > - uint32_t ch_inst = 0; > - > - /* TODO: driver needs to toggle DF Cstate to ensure > - * safe access of UMC registers. Will add the protection > - */ > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > { > - umc_v8_10_ecc_info_query_correctable_error_count(adev, > - node_inst, umc_inst, > ch_inst, > - &(err_data- > >ce_count)); > - umc_v8_10_ecc_info_query_uncorrectable_error_count(adev, > - node_inst, umc_inst, > ch_inst, > - &(err_data- > >ue_count)); > - } > + amdgpu_umc_scan_all_umc_channels(adev, > + umc_v8_10_ecc_info_query_ecc_error_count, > ras_error_status); > } > > -static void umc_v8_10_ecc_info_query_error_address(struct > amdgpu_device *adev, > - struct ras_err_data *err_data, > - uint32_t ch_inst, > - uint32_t umc_inst, > - uint32_t node_inst) > +static int umc_v8_10_ecc_info_query_error_address(struct > +amdgpu_device > *adev, > + uint32_t node_inst, uint32_t umc_inst, > + uint32_t ch_inst, void *data) > { > uint32_t eccinfo_table_idx; > uint64_t mc_umc_status, err_addr; > - > + struct ras_err_data *err_data = (struct ras_err_data *)data; > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > eccinfo_table_idx = node_inst * adev->umc.umc_inst_num * @@ - > 447,10 +417,10 @@ static void > umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev, > mc_umc_status = ras- > >umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; > > if (mc_umc_status == 0) > - return; > + return 0; > > if (!err_data->err_addr) > - return; > + return 0; > > /* calculate error address if ue error is detected */ > if (REG_GET_FIELD(mc_umc_status, > MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && @@ -463,28 +433,15 @@ static > void umc_v8_10_ecc_info_query_error_address(struct amdgpu_device > *adev, > umc_v8_10_convert_error_address(adev, err_data, err_addr, > ch_inst, umc_inst, node_inst, > mc_umc_status); > } > + > + return 0; > } > > static void umc_v8_10_ecc_info_query_ras_error_address(struct > amdgpu_device *adev, > void *ras_error_status) > { > - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > - > - uint32_t node_inst = 0; > - uint32_t umc_inst = 0; > - uint32_t ch_inst = 0; > - > - /* TODO: driver needs to toggle DF Cstate to ensure > - * safe access of UMC resgisters. Will add the protection > - * when firmware interface is ready > - */ > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > { > - umc_v8_10_ecc_info_query_error_address(adev, > - err_data, > - ch_inst, > - umc_inst, > - node_inst); > - } > + amdgpu_umc_scan_all_umc_channels(adev, > + umc_v8_10_ecc_info_query_error_address, ras_error_status); > } > > const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = { > -- > 2.34.1