RE: [PATCH 1/3] drm/amdgpu: optimize umc v12 address conversion function

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[AMD Official Use Only - AMD Internal Distribution Only]

-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhou1, Tao <Tao.Zhou1@xxxxxxx>
Sent: Thursday, July 18, 2024 10:51 AM
To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>
Subject: RE: [PATCH 1/3] drm/amdgpu: optimize umc v12 address conversion function

[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: Chai, Thomas <YiPeng.Chai@xxxxxxx>
> Sent: Wednesday, July 17, 2024 4:16 PM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao
> <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang,
> Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley
> <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
> Subject: [PATCH 1/3] drm/amdgpu: optimize umc v12 address conversion
> function
>
> Split into 3 parts:
> 1. Convert soc physical address via ras ta.
> 2. Expand bad pages from soc physical address.
> 3. Dump bad address info.
>
> Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 116
> ++++++++++++++++---------
>  1 file changed, 77 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 9dbb13adb661..eca5ac6a0532 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -225,26 +225,16 @@ static void
> umc_v12_0_convert_error_address(struct
> amdgpu_device *adev,
>       }
>  }
>
> -static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
> -                             struct ta_ras_query_address_input *addr_in,
> -                             uint64_t *pfns, int len)
> +static void umc_v12_0_dump_addr_info(struct amdgpu_device *adev,
> +                             struct ta_ras_query_address_output *addr_out,
> +                             uint64_t err_addr)
>  {
>       uint32_t col, row, row_xor, bank, channel_index;
> -     uint64_t soc_pa, retired_page, column, err_addr;
> -     struct ta_ras_query_address_output addr_out;
> -     uint32_t pos = 0;
> -
> -     err_addr = addr_in->ma.err_addr;
> -     addr_in->addr_type = TA_RAS_MCA_TO_PA;
> -     if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
> -             dev_warn(adev->dev, "Failed to query RAS physical address for
> 0x%llx",
> -                     err_addr);
> -             return 0;
> -     }
> +     uint64_t soc_pa, retired_page, column;
>
> -     soc_pa = addr_out.pa.pa;
> -     bank = addr_out.pa.bank;
> -     channel_index = addr_out.pa.channel_idx;
> +     soc_pa = addr_out->pa.pa;
> +     bank = addr_out->pa.bank;
> +     channel_index = addr_out->pa.channel_idx;
>
>       col = (err_addr >> 1) & 0x1fULL;
>       row = (err_addr >> 10) & 0x3fffULL; @@ -258,11 +248,6 @@ static
> int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
>       for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
>               retired_page = soc_pa | ((column & 0x3) <<
> UMC_V12_0_PA_C2_BIT);
>               retired_page |= (((column & 0x4) >> 2) <<
> UMC_V12_0_PA_C4_BIT);
> -
> -             if (pos >= len)
> -                     return 0;
> -             pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
> -
>               /* include column bit 0 and 1 */
>               col &= 0x3;
>               col |= (column << 2);
> @@ -270,6 +255,35 @@ static int umc_v12_0_convert_err_addr(struct
> amdgpu_device *adev,
>                       "Error Address(PA):0x%-10llx Row:0x%-4x
> Col:0x%-2x Bank:0x%x Channel:0x%x\n",
>                       retired_page, row, col, bank, channel_index);
>
> +             /* shift R13 bit */
> +             retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
> +             dev_info(adev->dev,
> +                     "Error Address(PA):0x%-10llx Row:0x%-4x
> + Col:0x%-2x
> Bank:0x%x Channel:0x%x\n",
> +                     retired_page, row_xor, col, bank, channel_index);
> +     }
> +}
> +
> +static int umc_v12_0_expand_addr_to_bad_pages(struct amdgpu_device
> *adev,
> +                     uint64_t pa_addr, uint64_t *pfns, int len) {
> +     uint64_t soc_pa, retired_page, column;
> +     uint32_t pos = 0;
> +
> +     soc_pa = pa_addr;
> +     /* clear [C3 C2] in soc physical address */
> +     soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
> +     /* clear [C4] in soc physical address */
> +     soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);

>[Tao] these bits are already cleared via UMC_V12_ADDR_MASK_BAD_COLS in patch #2, is the clear here redundant?

[Thomas] I think these code are like checking input parameters and cannot rely on external call data.

> +
> +     /* loop for all possibilities of [C4 C3 C2] */
> +     for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
> +             retired_page = soc_pa | ((column & 0x3) <<
> UMC_V12_0_PA_C2_BIT);
> +             retired_page |= (((column & 0x4) >> 2) <<
> UMC_V12_0_PA_C4_BIT);
> +
> +             if (pos >= len)
> +                     return 0;
> +             pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
> +
>               /* shift R13 bit */
>               retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
>
> @@ -277,14 +291,40 @@ static int umc_v12_0_convert_err_addr(struct
> amdgpu_device *adev,
>                       return 0;
>               pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
>
> -             dev_info(adev->dev,
> -                     "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x
> Bank:0x%x Channel:0x%x\n",
> -                     retired_page, row_xor, col, bank, channel_index);
>       }
>
>       return pos;
>  }
>
> +static int umc_v12_0_convert_mca_to_addr(struct amdgpu_device *adev,
> +                     uint64_t err_addr, uint32_t ch, uint32_t umc,
> +                     uint32_t node, uint32_t socket,
> +                     uint64_t *addr, bool dump_addr) {
> +     struct ta_ras_query_address_input addr_in;
> +     struct ta_ras_query_address_output addr_out;
> +
> +     memset(&addr_in, 0, sizeof(addr_in));
> +     addr_in.ma.err_addr = err_addr;
> +     addr_in.ma.ch_inst = ch;
> +     addr_in.ma.umc_inst = umc;
> +     addr_in.ma.node_inst = node;
> +     addr_in.ma.socket_id = socket;
> +     addr_in.addr_type = TA_RAS_MCA_TO_PA;
> +     if (psp_ras_query_address(&adev->psp, &addr_in, &addr_out)) {
> +             dev_warn(adev->dev, "Failed to query RAS physical
> + address for
> 0x%llx",
> +                     err_addr);
> +             return -EINVAL;
> +     }
> +
> +     if (dump_addr)
> +             umc_v12_0_dump_addr_info(adev, &addr_out, err_addr);
> +
> +     *addr = addr_out.pa.pa;
> +
> +     return 0;
> +}
> +
>  static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
>                                       uint32_t node_inst, uint32_t umc_inst,
>                                       uint32_t ch_inst, void *data) @@
> -483,12 +523,10 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,  {
>       struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>       uint16_t hwid, mcatype;
> -     struct ta_ras_query_address_input addr_in;
>       uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
> -     uint64_t err_addr, hash_val = 0;
> +     uint64_t err_addr, hash_val = 0, pa_addr = 0;
>       struct ras_ecc_err *ecc_err;
> -     int count;
> -     int ret;
> +     int count, ret;
>
>       hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
>       mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); @@ -514,17
> +552,17 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device
> *adev,
>               MCA_IPID_2_UMC_CH(ipid),
>               err_addr);
>
> -     memset(page_pfn, 0, sizeof(page_pfn));
> -
> -     memset(&addr_in, 0, sizeof(addr_in));
> -     addr_in.ma.err_addr = err_addr;
> -     addr_in.ma.ch_inst = MCA_IPID_2_UMC_CH(ipid);
> -     addr_in.ma.umc_inst = MCA_IPID_2_UMC_INST(ipid);
> -     addr_in.ma.node_inst = MCA_IPID_2_DIE_ID(ipid);
> -     addr_in.ma.socket_id = MCA_IPID_2_SOCKET_ID(ipid);
> +     ret = umc_v12_0_convert_mca_to_addr(adev,
> +                     err_addr, MCA_IPID_2_UMC_CH(ipid),
> +                     MCA_IPID_2_UMC_INST(ipid),
> MCA_IPID_2_DIE_ID(ipid),
> +                     MCA_IPID_2_SOCKET_ID(ipid), &pa_addr, true);

> [Tao] I prefer to using addr_in as input parameter for convert_mca_to_addr to simplify the function, and it's also convenient to make the function as a common interface for all ASICs in the future.

[Thomas]  convert_mca_to_addr will be called in multiple places: 1. Detecting a newly ecc error; 2. Loading ecc data from EEPROM (when using new data format). 3. Switch NPS?
      If using addr_in as input parameter,  a lot of  redundant  code must be added to convert IPID to add_in in  each place before calling convert_mca_to_addr.
    Each ASIC may have different address translation,  umc_v12_0_convert_mca_to_addr is only for umc v12,  if address translation has a common interface in future,  it only need to change umc_v12_0_convert_mca_to_addr.

> +     if (ret)
> +             return ret;
>
> -     count = umc_v12_0_convert_err_addr(adev,
> -                             &addr_in, page_pfn, ARRAY_SIZE(page_pfn));
> +     memset(page_pfn, 0, sizeof(page_pfn));
> +     count = umc_v12_0_expand_addr_to_bad_pages(adev,
> +                             pa_addr,
> +                             page_pfn, ARRAY_SIZE(page_pfn));
>       if (count <= 0) {
>               dev_warn(adev->dev, "Fail to convert error address!
> count:%d\n", count);
>               return 0;
> --
> 2.34.1






[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux