[AMD Official Use Only - AMD Internal Distribution Only] ----------------- Best Regards, Thomas -----Original Message----- From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Sent: Thursday, July 18, 2024 10:51 AM To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx> Subject: RE: [PATCH 1/3] drm/amdgpu: optimize umc v12 address conversion function [AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Chai, Thomas <YiPeng.Chai@xxxxxxx> > Sent: Wednesday, July 17, 2024 4:16 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao > <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, > Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley > <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx> > Subject: [PATCH 1/3] drm/amdgpu: optimize umc v12 address conversion > function > > Split into 3 parts: > 1. Convert soc physical address via ras ta. > 2. Expand bad pages from soc physical address. > 3. Dump bad address info. > > Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 116 > ++++++++++++++++--------- > 1 file changed, 77 insertions(+), 39 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > index 9dbb13adb661..eca5ac6a0532 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c > @@ -225,26 +225,16 @@ static void > umc_v12_0_convert_error_address(struct > amdgpu_device *adev, > } > } > > -static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev, > - struct ta_ras_query_address_input *addr_in, > - uint64_t *pfns, int len) > +static void umc_v12_0_dump_addr_info(struct amdgpu_device *adev, > + struct ta_ras_query_address_output *addr_out, > + uint64_t err_addr) > { > uint32_t col, row, row_xor, bank, channel_index; > - uint64_t soc_pa, retired_page, column, err_addr; > - struct ta_ras_query_address_output addr_out; > - uint32_t pos = 0; > - > - err_addr = addr_in->ma.err_addr; > - addr_in->addr_type = TA_RAS_MCA_TO_PA; > - if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) { > - dev_warn(adev->dev, "Failed to query RAS physical address for > 0x%llx", > - err_addr); > - return 0; > - } > + uint64_t soc_pa, retired_page, column; > > - soc_pa = addr_out.pa.pa; > - bank = addr_out.pa.bank; > - channel_index = addr_out.pa.channel_idx; > + soc_pa = addr_out->pa.pa; > + bank = addr_out->pa.bank; > + channel_index = addr_out->pa.channel_idx; > > col = (err_addr >> 1) & 0x1fULL; > row = (err_addr >> 10) & 0x3fffULL; @@ -258,11 +248,6 @@ static > int umc_v12_0_convert_err_addr(struct amdgpu_device *adev, > for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { > retired_page = soc_pa | ((column & 0x3) << > UMC_V12_0_PA_C2_BIT); > retired_page |= (((column & 0x4) >> 2) << > UMC_V12_0_PA_C4_BIT); > - > - if (pos >= len) > - return 0; > - pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; > - > /* include column bit 0 and 1 */ > col &= 0x3; > col |= (column << 2); > @@ -270,6 +255,35 @@ static int umc_v12_0_convert_err_addr(struct > amdgpu_device *adev, > "Error Address(PA):0x%-10llx Row:0x%-4x > Col:0x%-2x Bank:0x%x Channel:0x%x\n", > retired_page, row, col, bank, channel_index); > > + /* shift R13 bit */ > + retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); > + dev_info(adev->dev, > + "Error Address(PA):0x%-10llx Row:0x%-4x > + Col:0x%-2x > Bank:0x%x Channel:0x%x\n", > + retired_page, row_xor, col, bank, channel_index); > + } > +} > + > +static int umc_v12_0_expand_addr_to_bad_pages(struct amdgpu_device > *adev, > + uint64_t pa_addr, uint64_t *pfns, int len) { > + uint64_t soc_pa, retired_page, column; > + uint32_t pos = 0; > + > + soc_pa = pa_addr; > + /* clear [C3 C2] in soc physical address */ > + soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT); > + /* clear [C4] in soc physical address */ > + soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT); >[Tao] these bits are already cleared via UMC_V12_ADDR_MASK_BAD_COLS in patch #2, is the clear here redundant? [Thomas] I think these code are like checking input parameters and cannot rely on external call data. > + > + /* loop for all possibilities of [C4 C3 C2] */ > + for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) { > + retired_page = soc_pa | ((column & 0x3) << > UMC_V12_0_PA_C2_BIT); > + retired_page |= (((column & 0x4) >> 2) << > UMC_V12_0_PA_C4_BIT); > + > + if (pos >= len) > + return 0; > + pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; > + > /* shift R13 bit */ > retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT); > > @@ -277,14 +291,40 @@ static int umc_v12_0_convert_err_addr(struct > amdgpu_device *adev, > return 0; > pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT; > > - dev_info(adev->dev, > - "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x > Bank:0x%x Channel:0x%x\n", > - retired_page, row_xor, col, bank, channel_index); > } > > return pos; > } > > +static int umc_v12_0_convert_mca_to_addr(struct amdgpu_device *adev, > + uint64_t err_addr, uint32_t ch, uint32_t umc, > + uint32_t node, uint32_t socket, > + uint64_t *addr, bool dump_addr) { > + struct ta_ras_query_address_input addr_in; > + struct ta_ras_query_address_output addr_out; > + > + memset(&addr_in, 0, sizeof(addr_in)); > + addr_in.ma.err_addr = err_addr; > + addr_in.ma.ch_inst = ch; > + addr_in.ma.umc_inst = umc; > + addr_in.ma.node_inst = node; > + addr_in.ma.socket_id = socket; > + addr_in.addr_type = TA_RAS_MCA_TO_PA; > + if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out)) { > + dev_warn(adev->dev, "Failed to query RAS physical > + address for > 0x%llx", > + err_addr); > + return -EINVAL; > + } > + > + if (dump_addr) > + umc_v12_0_dump_addr_info(adev, &addr_out, err_addr); > + > + *addr = addr_out.pa.pa; > + > + return 0; > +} > + > static int umc_v12_0_query_error_address(struct amdgpu_device *adev, > uint32_t node_inst, uint32_t umc_inst, > uint32_t ch_inst, void *data) @@ > -483,12 +523,10 @@ static int umc_v12_0_update_ecc_status(struct > amdgpu_device *adev, { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > uint16_t hwid, mcatype; > - struct ta_ras_query_address_input addr_in; > uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; > - uint64_t err_addr, hash_val = 0; > + uint64_t err_addr, hash_val = 0, pa_addr = 0; > struct ras_ecc_err *ecc_err; > - int count; > - int ret; > + int count, ret; > > hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); > mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); @@ -514,17 > +552,17 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device > *adev, > MCA_IPID_2_UMC_CH(ipid), > err_addr); > > - memset(page_pfn, 0, sizeof(page_pfn)); > - > - memset(&addr_in, 0, sizeof(addr_in)); > - addr_in.ma.err_addr = err_addr; > - addr_in.ma.ch_inst = MCA_IPID_2_UMC_CH(ipid); > - addr_in.ma.umc_inst = MCA_IPID_2_UMC_INST(ipid); > - addr_in.ma.node_inst = MCA_IPID_2_DIE_ID(ipid); > - addr_in.ma.socket_id = MCA_IPID_2_SOCKET_ID(ipid); > + ret = umc_v12_0_convert_mca_to_addr(adev, > + err_addr, MCA_IPID_2_UMC_CH(ipid), > + MCA_IPID_2_UMC_INST(ipid), > MCA_IPID_2_DIE_ID(ipid), > + MCA_IPID_2_SOCKET_ID(ipid), &pa_addr, true); > [Tao] I prefer to using addr_in as input parameter for convert_mca_to_addr to simplify the function, and it's also convenient to make the function as a common interface for all ASICs in the future. [Thomas] convert_mca_to_addr will be called in multiple places: 1. Detecting a newly ecc error; 2. Loading ecc data from EEPROM (when using new data format). 3. Switch NPS? If using addr_in as input parameter, a lot of redundant code must be added to convert IPID to add_in in each place before calling convert_mca_to_addr. Each ASIC may have different address translation, umc_v12_0_convert_mca_to_addr is only for umc v12, if address translation has a common interface in future, it only need to change umc_v12_0_convert_mca_to_addr. > + if (ret) > + return ret; > > - count = umc_v12_0_convert_err_addr(adev, > - &addr_in, page_pfn, ARRAY_SIZE(page_pfn)); > + memset(page_pfn, 0, sizeof(page_pfn)); > + count = umc_v12_0_expand_addr_to_bad_pages(adev, > + pa_addr, > + page_pfn, ARRAY_SIZE(page_pfn)); > if (count <= 0) { > dev_warn(adev->dev, "Fail to convert error address! > count:%d\n", count); > return 0; > -- > 2.34.1