[AMD Official Use Only - General] OK, Will do. ----------------- Best Regards, Thomas -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Monday, April 3, 2023 3:21 PM To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx> Subject: RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in umc_v8_10 [AMD Official Use Only - General] > -----Original Message----- > From: Chai, Thomas <YiPeng.Chai@xxxxxxx> > Sent: Monday, April 3, 2023 3:00 PM > To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice > <Candice.Li@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx> > Subject: RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in > umc_v8_10 > > [AMD Official Use Only - General] > > > > > ----------------- > Best Regards, > Thomas > > -----Original Message----- > From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Sent: Monday, April 3, 2023 11:45 AM > To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice > <Candice.Li@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx> > Subject: RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in > umc_v8_10 > > [AMD Official Use Only - General] > > > > > -----Original Message----- > > From: Chai, Thomas <YiPeng.Chai@xxxxxxx> > > Sent: Monday, April 3, 2023 9:59 AM > > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > > Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking > > <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, > Candice > > <Candice.Li@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, > > Thomas <YiPeng.Chai@xxxxxxx> > > Subject: [PATCH 1/2] drm/amdgpu: optimize redundant code in > > umc_v8_10 > > > > Optimize redundant code in umc_v8_10 > > > > Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 31 ++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 7 + > > drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 197 > > +++++++++--------------- > > 3 files changed, 115 insertions(+), 120 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > index 9e2e97207e53..734442315cf6 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c > > @@ -302,3 +302,34 @@ void amdgpu_umc_fill_error_record(struct > > ras_err_data *err_data, > > > > err_data->err_addr_cnt++; > > } > > + > > +int amdgpu_umc_scan_all_umc_channels(struct amdgpu_device *adev, > > + umc_func func, void *data) > > +{ > > + uint32_t node_inst = 0; > > + uint32_t umc_inst = 0; > > + uint32_t ch_inst = 0; > > + int ret = 0; > > + > > + if (adev->umc.node_inst_num) { > > + LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, > > ch_inst) { > > + ret = func(adev, node_inst, umc_inst, ch_inst, data); > > + if (ret) { > > + dev_err(adev->dev, "Node %d umc %d ch %d > > func returns %d\n", > > + node_inst, umc_inst, ch_inst, ret); > > + return ret; > > + } > > + } > > + } else { > > + LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) { > > >[Tao] for ASIC which doesn't support node, can we set its > >node_inst_num to 1 > and retire the macro LOOP_UMC_INST_AND_CH? > > [Thomas] I am afraid not. > > " #define LOOP_UMC_NODE_INST(node_inst) \ > for_each_set_bit((node_inst), &(adev->umc.active_mask), > adev->umc.node_inst_num) " > > The node instance loop of LOOP_UMC_EACH_NODE_INST_AND_CH supports > node harvest, so node_inst_num is not the real node instance number. [Tao] we can set both node_inst_num and active_mask to 1, but either way is fine for me. BTW, I think amdgpu_umc_loop_channels is simple than amdgpu_umc_scan_all_umc_channels, with this fixed, the series is: Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> > > > > + ret = func(adev, 0, umc_inst, ch_inst, data); > > + if (ret) { > > + dev_err(adev->dev, "Umc %d ch %d func > > returns %d\n", > > + umc_inst, ch_inst, ret); > > + return ret; > > + } > > + } > > + } > > + > > + return 0; > > +} > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > > index d7f1229ff11f..f279c8057f96 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h > > @@ -47,6 +47,10 @@ > > #define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, > ch_inst) > > \ > > LOOP_UMC_NODE_INST((node_inst)) > > LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst)) > > > > + > > +typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst, > > + uint32_t umc_inst, uint32_t ch_inst, void *data); > > + > > struct amdgpu_umc_ras { > > struct amdgpu_ras_block_object ras_block; > > void (*err_cnt_init)(struct amdgpu_device *adev); @@ -104,4 +108,7 > > @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, > > struct amdgpu_iv_entry *entry); > > int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, > > uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst); > > + > > +int amdgpu_umc_scan_all_umc_channels(struct amdgpu_device *adev, > > + umc_func func, void *data); > > #endif > > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c > > b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c > > index fb55e8cb9967..6dff313ac04c 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c > > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c > > @@ -76,10 +76,13 @@ static inline uint32_t > > get_umc_v8_10_reg_offset(struct amdgpu_device *adev, > > UMC_8_NODE_DIST * node_inst; > > } > > > > -static void umc_v8_10_clear_error_count_per_channel(struct > > amdgpu_device *adev, > > - uint32_t umc_reg_offset) > > +static int umc_v8_10_clear_error_count_per_channel(struct > > +amdgpu_device > > *adev, > > + uint32_t node_inst, uint32_t umc_inst, > > + uint32_t ch_inst, void *data) > > { > > uint32_t ecc_err_cnt_addr; > > + uint32_t umc_reg_offset = > > + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); > > > > ecc_err_cnt_addr = > > SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCnt); @@ > > -87,24 +90,14 @@ static void > > umc_v8_10_clear_error_count_per_channel(struct amdgpu_device *adev, > > /* clear error count */ > > WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, > > UMC_V8_10_CE_CNT_INIT); > > + > > + return 0; > > } > > > > static void umc_v8_10_clear_error_count(struct amdgpu_device *adev) { > > - uint32_t node_inst = 0; > > - uint32_t umc_inst = 0; > > - uint32_t ch_inst = 0; > > - uint32_t umc_reg_offset = 0; > > - > > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > > { > > - umc_reg_offset = get_umc_v8_10_reg_offset(adev, > > - node_inst, > > - umc_inst, > > - ch_inst); > > - > > - umc_v8_10_clear_error_count_per_channel(adev, > > - umc_reg_offset); > > - } > > + amdgpu_umc_scan_all_umc_channels(adev, > > + umc_v8_10_clear_error_count_per_channel, NULL); > > } > > > > static void umc_v8_10_query_correctable_error_count(struct > > amdgpu_device *adev, @@ -147,29 +140,29 @@ static void > > umc_v8_10_query_uncorrectable_error_count(struct amdgpu_device *adev > > *error_count += 1; > > } > > > > +static int umc_v8_10_query_ecc_error_count(struct amdgpu_device *adev, > > + uint32_t node_inst, uint32_t umc_inst, > > + uint32_t ch_inst, void *data) > > +{ > > + struct ras_err_data *err_data = (struct ras_err_data *)data; > > + uint32_t umc_reg_offset = > > + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); > > + > > + umc_v8_10_query_correctable_error_count(adev, > > + umc_reg_offset, > > + &(err_data->ce_count)); > > + umc_v8_10_query_uncorrectable_error_count(adev, > > + umc_reg_offset, > > + &(err_data->ue_count)); > > + > > + return 0; > > +} > > + > > static void umc_v8_10_query_ras_error_count(struct amdgpu_device *adev, > > void *ras_error_status) > > { > > - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > > - > > - uint32_t node_inst = 0; > > - uint32_t umc_inst = 0; > > - uint32_t ch_inst = 0; > > - uint32_t umc_reg_offset = 0; > > - > > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > > { > > - umc_reg_offset = get_umc_v8_10_reg_offset(adev, > > - node_inst, > > - umc_inst, > > - ch_inst); > > - > > - umc_v8_10_query_correctable_error_count(adev, > > - umc_reg_offset, > > - &(err_data->ce_count)); > > - umc_v8_10_query_uncorrectable_error_count(adev, > > - umc_reg_offset, > > - &(err_data->ue_count)); > > - } > > + amdgpu_umc_scan_all_umc_channels(adev, > > + umc_v8_10_query_ecc_error_count, ras_error_status); > > > > umc_v8_10_clear_error_count(adev); > > } > > @@ -248,28 +241,28 @@ static void > > umc_v8_10_convert_error_address(struct > > amdgpu_device *adev, > > } > > } > > > > -static void umc_v8_10_query_error_address(struct amdgpu_device *adev, > > - struct ras_err_data *err_data, > > - uint32_t umc_reg_offset, > > - uint32_t node_inst, > > - uint32_t ch_inst, > > - uint32_t umc_inst) > > +static int umc_v8_10_query_error_address(struct amdgpu_device *adev, > > + uint32_t node_inst, uint32_t umc_inst, > > + uint32_t ch_inst, void *data) > > { > > uint64_t mc_umc_status_addr; > > uint64_t mc_umc_status, err_addr; > > uint64_t mc_umc_addrt0; > > + struct ras_err_data *err_data = (struct ras_err_data *)data; > > + uint32_t umc_reg_offset = > > + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); > > > > mc_umc_status_addr = > > SOC15_REG_OFFSET(UMC, 0, > > regMCA_UMC_UMC0_MCUMC_STATUST0); > > mc_umc_status = RREG64_PCIE((mc_umc_status_addr + > > umc_reg_offset) * 4); > > > > if (mc_umc_status == 0) > > - return; > > + return 0; > > > > if (!err_data->err_addr) { > > /* clear umc status */ > > WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, > 0x0ULL); > > - return; > > + return 0; > > } > > > > /* calculate error address if ue error is detected */ @@ -287,37 > > +280,25 @@ static void umc_v8_10_query_error_address(struct > > amdgpu_device *adev, > > > > /* clear umc status */ > > WREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4, 0x0ULL); > > + > > + return 0; > > } > > > > static void umc_v8_10_query_ras_error_address(struct amdgpu_device *adev, > > void *ras_error_status) > > { > > - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > > - uint32_t node_inst = 0; > > - uint32_t umc_inst = 0; > > - uint32_t ch_inst = 0; > > - uint32_t umc_reg_offset = 0; > > - > > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > > { > > - umc_reg_offset = get_umc_v8_10_reg_offset(adev, > > - node_inst, > > - umc_inst, > > - ch_inst); > > - > > - umc_v8_10_query_error_address(adev, > > - err_data, > > - umc_reg_offset, > > - node_inst, > > - ch_inst, > > - umc_inst); > > - } > > + amdgpu_umc_scan_all_umc_channels(adev, > > + umc_v8_10_query_error_address, ras_error_status); > > } > > > > -static void umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device > *adev, > > - uint32_t umc_reg_offset) > > +static int umc_v8_10_err_cnt_init_per_channel(struct amdgpu_device *adev, > > + uint32_t node_inst, uint32_t umc_inst, > > + uint32_t ch_inst, void *data) > > { > > uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; > > uint32_t ecc_err_cnt_addr; > > + uint32_t umc_reg_offset = > > + get_umc_v8_10_reg_offset(adev, node_inst, umc_inst, ch_inst); > > > > ecc_err_cnt_sel_addr = > > SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCntSel); > @@ -332,23 > > +313,14 @@ static void umc_v8_10_err_cnt_init_per_channel(struct > > amdgpu_device *adev, > > WREG32_PCIE((ecc_err_cnt_sel_addr + umc_reg_offset) * 4, > > ecc_err_cnt_sel); > > /* set error count to initial value */ > > WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4, > > UMC_V8_10_CE_CNT_INIT); > > + > > + return 0; > > } > > > > static void umc_v8_10_err_cnt_init(struct amdgpu_device *adev) { > > - uint32_t node_inst = 0; > > - uint32_t umc_inst = 0; > > - uint32_t ch_inst = 0; > > - uint32_t umc_reg_offset = 0; > > - > > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > > { > > - umc_reg_offset = get_umc_v8_10_reg_offset(adev, > > - node_inst, > > - umc_inst, > > - ch_inst); > > - > > - umc_v8_10_err_cnt_init_per_channel(adev, umc_reg_offset); > > - } > > + amdgpu_umc_scan_all_umc_channels(adev, > > + umc_v8_10_err_cnt_init_per_channel, NULL); > > } > > > > static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device > > *adev) @@ -406,37 +378,35 @@ static void > > umc_v8_10_ecc_info_query_uncorrectable_error_count(struct amdgpu_dev > > } > > } > > > > +static int umc_v8_10_ecc_info_query_ecc_error_count(struct > > +amdgpu_device > > *adev, > > + uint32_t node_inst, uint32_t umc_inst, > > + uint32_t ch_inst, void *data) > > +{ > > + struct ras_err_data *err_data = (struct ras_err_data *)data; > > + > > + umc_v8_10_ecc_info_query_correctable_error_count(adev, > > + node_inst, umc_inst, ch_inst, > > + &(err_data->ce_count)); > > + umc_v8_10_ecc_info_query_uncorrectable_error_count(adev, > > + node_inst, umc_inst, ch_inst, > > + &(err_data->ue_count)); > > + return 0; > > +} > > + > > static void umc_v8_10_ecc_info_query_ras_error_count(struct > > amdgpu_device *adev, > > void *ras_error_status) > > { > > - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > > - > > - uint32_t node_inst = 0; > > - uint32_t umc_inst = 0; > > - uint32_t ch_inst = 0; > > - > > - /* TODO: driver needs to toggle DF Cstate to ensure > > - * safe access of UMC registers. Will add the protection > > - */ > > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > > { > > - umc_v8_10_ecc_info_query_correctable_error_count(adev, > > - node_inst, umc_inst, > > ch_inst, > > - &(err_data- > > >ce_count)); > > - umc_v8_10_ecc_info_query_uncorrectable_error_count(adev, > > - node_inst, umc_inst, > > ch_inst, > > - &(err_data- > > >ue_count)); > > - } > > + amdgpu_umc_scan_all_umc_channels(adev, > > + umc_v8_10_ecc_info_query_ecc_error_count, > > ras_error_status); > > } > > > > -static void umc_v8_10_ecc_info_query_error_address(struct > > amdgpu_device *adev, > > - struct ras_err_data *err_data, > > - uint32_t ch_inst, > > - uint32_t umc_inst, > > - uint32_t node_inst) > > +static int umc_v8_10_ecc_info_query_error_address(struct > > +amdgpu_device > > *adev, > > + uint32_t node_inst, uint32_t umc_inst, > > + uint32_t ch_inst, void *data) > > { > > uint32_t eccinfo_table_idx; > > uint64_t mc_umc_status, err_addr; > > - > > + struct ras_err_data *err_data = (struct ras_err_data *)data; > > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > > > eccinfo_table_idx = node_inst * adev->umc.umc_inst_num * @@ - > > 447,10 +417,10 @@ static void > > umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev, > > mc_umc_status = ras- > > >umc_ecc.ecc[eccinfo_table_idx].mca_umc_status; > > > > if (mc_umc_status == 0) > > - return; > > + return 0; > > > > if (!err_data->err_addr) > > - return; > > + return 0; > > > > /* calculate error address if ue error is detected */ > > if (REG_GET_FIELD(mc_umc_status, > > MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 && @@ -463,28 +433,15 > @@ static > > void umc_v8_10_ecc_info_query_error_address(struct amdgpu_device > > *adev, > > umc_v8_10_convert_error_address(adev, err_data, err_addr, > > ch_inst, umc_inst, node_inst, > > mc_umc_status); > > } > > + > > + return 0; > > } > > > > static void umc_v8_10_ecc_info_query_ras_error_address(struct > > amdgpu_device *adev, > > void *ras_error_status) > > { > > - struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; > > - > > - uint32_t node_inst = 0; > > - uint32_t umc_inst = 0; > > - uint32_t ch_inst = 0; > > - > > - /* TODO: driver needs to toggle DF Cstate to ensure > > - * safe access of UMC resgisters. Will add the protection > > - * when firmware interface is ready > > - */ > > - LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) > > { > > - umc_v8_10_ecc_info_query_error_address(adev, > > - err_data, > > - ch_inst, > > - umc_inst, > > - node_inst); > > - } > > + amdgpu_umc_scan_all_umc_channels(adev, > > + umc_v8_10_ecc_info_query_error_address, ras_error_status); > > } > > > > const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = { > > -- > > 2.34.1