1. Use pa_pfn as the radix-tree key index to log deferred error info. 2. Use local array to store expanded bad pages. Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 14 ++---- drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 65 ++++++++++++------------- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 5 ++ 4 files changed, 40 insertions(+), 46 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index dcf1f3dbb5c4..f607ff620015 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -476,10 +476,10 @@ struct ras_err_pages { }; struct ras_ecc_err { - u64 hash_index; uint64_t status; uint64_t ipid; uint64_t addr; + uint64_t pa_pfn; struct ras_err_pages err_pages; }; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c index 5d08c03fe543..2fc90799bf8d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c @@ -523,18 +523,10 @@ int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev, ecc_log = &con->umc_ecc_log; mutex_lock(&ecc_log->lock); - ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err); - if (!ret) { - struct ras_err_pages *err_pages = &ecc_err->err_pages; - int i; - - /* Reserve memory */ - for (i = 0; i < err_pages->count; i++) - amdgpu_ras_reserve_page(adev, err_pages->pfn[i]); - + ret = radix_tree_insert(ecc_tree, ecc_err->pa_pfn, ecc_err); + if (!ret) radix_tree_tag_set(ecc_tree, - ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG); - } + ecc_err->pa_pfn, UMC_ECC_NEW_DETECTED_TAG); mutex_unlock(&ecc_log->lock); return ret; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index eca5ac6a0532..f2235c9ead29 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -524,9 +524,9 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, struct amdgpu_ras *con = amdgpu_ras_get_context(adev); uint16_t hwid, mcatype; uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; - uint64_t err_addr, hash_val = 0, pa_addr = 0; + uint64_t err_addr, pa_addr = 0; struct ras_ecc_err *ecc_err; - int count, ret; + int count, ret, i; hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); @@ -559,39 +559,18 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, if (ret) return ret; - memset(page_pfn, 0, sizeof(page_pfn)); - count = umc_v12_0_expand_addr_to_bad_pages(adev, - pa_addr, - page_pfn, ARRAY_SIZE(page_pfn)); - if (count <= 0) { - dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count); - return 0; - } - - ret = amdgpu_umc_build_pages_hash(adev, - page_pfn, count, &hash_val); - if (ret) { - dev_err(adev->dev, "Fail to build error pages hash\n"); - return ret; - } - ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL); if (!ecc_err) return -ENOMEM; - ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), GFP_KERNEL); - if (!ecc_err->err_pages.pfn) { - kfree(ecc_err); - return -ENOMEM; - } - - memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err->err_pages.pfn)); - ecc_err->err_pages.count = count; - - ecc_err->hash_index = hash_val; ecc_err->status = status; ecc_err->ipid = ipid; ecc_err->addr = addr; + ecc_err->pa_pfn = UMC_V12_ADDR_MASK_BAD_COLS(pa_addr) >> AMDGPU_GPU_PAGE_SHIFT; + + /* If converted pa_pfn is 0, use pa xor pfn. */ + if (!ecc_err->pa_pfn) + ecc_err->pa_pfn = BIT_ULL(UMC_V12_0_PA_R13_BIT) >> AMDGPU_GPU_PAGE_SHIFT; ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err); if (ret) { @@ -600,13 +579,25 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev, else dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret); - kfree(ecc_err->err_pages.pfn); kfree(ecc_err); return ret; } con->umc_ecc_log.de_queried_count++; + memset(page_pfn, 0, sizeof(page_pfn)); + count = umc_v12_0_expand_addr_to_bad_pages(adev, + pa_addr, + page_pfn, ARRAY_SIZE(page_pfn)); + if (count <= 0) { + dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count); + return 0; + } + + /* Reserve memory */ + for (i = 0; i < count; i++) + amdgpu_ras_reserve_page(adev, page_pfn[i]); + /* The problem case is as follows: * 1. GPU A triggers a gpu ras reset, and GPU A drives * GPU B to also perform a gpu ras reset. @@ -631,16 +622,21 @@ static int umc_v12_0_fill_error_record(struct amdgpu_device *adev, struct ras_ecc_err *ecc_err, void *ras_error_status) { struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; - uint32_t i = 0; - int ret = 0; + uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL]; + int ret, i, count; if (!err_data || !ecc_err) return -EINVAL; - for (i = 0; i < ecc_err->err_pages.count; i++) { + memset(page_pfn, 0, sizeof(page_pfn)); + count = umc_v12_0_expand_addr_to_bad_pages(adev, + ecc_err->pa_pfn << AMDGPU_GPU_PAGE_SHIFT, + page_pfn, ARRAY_SIZE(page_pfn)); + + for (i = 0; i < count; i++) { ret = amdgpu_umc_fill_error_record(err_data, ecc_err->addr, - ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT, + page_pfn[i] << AMDGPU_GPU_PAGE_SHIFT, MCA_IPID_2_UMC_CH(ecc_err->ipid), MCA_IPID_2_UMC_INST(ecc_err->ipid)); if (ret) @@ -674,7 +670,8 @@ static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev, dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret); break; } - radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG); + radix_tree_tag_clear(ecc_tree, + entries[i]->pa_pfn, UMC_ECC_NEW_DETECTED_TAG); } mutex_unlock(&con->umc_ecc_log.lock); } diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index b4974793850b..be5598d76c1d 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -81,6 +81,11 @@ (((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \ (REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03)) +#define UMC_V12_ADDR_MASK_BAD_COLS(addr) \ + ((addr) & ~((0x3ULL << UMC_V12_0_PA_C2_BIT) | \ + (0x1ULL << UMC_V12_0_PA_C4_BIT) | \ + (0x1ULL << UMC_V12_0_PA_R13_BIT))) + bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status); bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status); bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status); -- 2.34.1