Save the global channel index returned by RAS TA to eeprom. We can get memory physical address by MCA address and channel index. Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 7 ++----- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 2 +- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 13 ++++++++----- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 871b2d6278e0..252e360c6416 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -481,6 +481,8 @@ struct ras_ecc_err { uint64_t ipid; uint64_t addr; uint64_t pa_pfn; + /* save global channel index across all UMC instances */ + uint32_t channel_idx; struct ras_err_pages err_pages; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 11f7299f773f..17659fa4450a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -498,10 +498,9 @@ int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev, int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev, uint64_t err_addr, uint32_t ch, uint32_t umc, uint32_t node, uint32_t socket, - uint64_t *addr, bool dump_addr) + struct ta_ras_query_address_output *addr_out, bool dump_addr) { struct ta_ras_query_address_input addr_in; - struct ta_ras_query_address_output addr_out; int ret; memset(&addr_in, 0, sizeof(addr_in)); @@ -513,14 +512,12 @@ int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev, if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { ret = adev->umc.ras->convert_ras_err_addr(adev, NULL, &addr_in, - &addr_out, dump_addr); + addr_out, dump_addr); if (ret) return ret; } else { return 0; } - *addr = addr_out.pa.pa; - return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index ce1e4fb385b5..2f71194d5da8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -146,5 +146,5 @@ int amdgpu_umc_lookup_bad_pages_in_a_row(struct amdgpu_device *adev, int amdgpu_umc_mca_to_addr(struct amdgpu_device *adev, uint64_t err_addr, uint32_t ch, uint32_t umc, uint32_t node, uint32_t socket, - uint64_t *addr, bool dump_addr); + struct ta_ras_query_address_output *addr_out, bool dump_addr); #endif diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index 17ef9a6743f5..cce93b4ffb58 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -180,7 +180,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, bool dump_addr) { uint32_t col, col_lower, row, row_lower, bank; - uint32_t channel_index, umc_inst = 0; + uint32_t channel_index = 0, umc_inst = 0; uint32_t i, loop_bits[UMC_V12_0_RETIRE_LOOP_BITS]; uint64_t soc_pa, column, err_addr; struct ta_ras_query_address_output addr_out_tmp; @@ -193,7 +193,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, else paddr_out = addr_out; - err_addr = bank = channel_index = 0; + err_addr = bank = 0; if (addr_in) { err_addr = addr_in->ma.err_addr; addr_in->addr_type = TA_RAS_MCA_TO_PA; @@ -206,7 +206,6 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, } bank = paddr_out->pa.bank; - channel_index = paddr_out->pa.channel_idx; /* no need to care about umc inst if addr_in is NULL */ umc_inst = addr_in->ma.umc_inst; } @@ -228,6 +227,7 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, } soc_pa = paddr_out->pa.pa; + channel_index = paddr_out->pa.channel_idx; /* clear loop bits in soc physical address */ for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++) soc_pa &= ~BIT_ULL(loop_bits[i]); @@ -466,6 +466,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; uint64_t err_addr, pa_addr = 0; struct ras_ecc_err *ecc_err; + struct ta_ras_query_address_output addr_out; int count, ret, i; hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); @@ -495,7 +496,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, ret = amdgpu_umc_mca_to_addr(adev, err_addr, MCA_IPID_2_UMC_CH(ipid), MCA_IPID_2_UMC_INST(ipid), MCA_IPID_2_DIE_ID(ipid), - MCA_IPID_2_SOCKET_ID(ipid), &pa_addr, true); + MCA_IPID_2_SOCKET_ID(ipid), &addr_out, true); if (ret) return ret; @@ -503,10 +504,12 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, if (!ecc_err) return -ENOMEM; + pa_addr = addr_out.pa.pa; ecc_err->status = status; ecc_err->ipid = ipid; ecc_err->addr = addr; ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT; + ecc_err->channel_idx = addr_out.pa.channel_idx; /* If converted pa_pfn is 0, use pa C4 pfn. */ if (!ecc_err->pa_pfn) @@ -577,7 +580,7 @@ static int umc_v12_0_fill_error_record(struct amdgpu_device *adev, ret = amdgpu_umc_fill_error_record(err_data, ecc_err->addr, page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT, - MCA_IPID_2_UMC_CH(ecc_err->ipid), + ecc_err->channel_idx, MCA_IPID_2_UMC_INST(ecc_err->ipid)); if (ret) break; -- 2.34.1