add nps info into eeprom records, and refine adding bad page logic based on nps info. Signed-off-by: ganglxie <ganglxie@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 244 +++++++++--------- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 +- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 20 +- drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 7 + 4 files changed, 153 insertions(+), 143 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5420e2d6d244..3945dda54b3f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2801,20 +2801,101 @@ static int amdgpu_ras_mca2pa(struct amdgpu_device *adev, return -EINVAL; } +static int __amdgpu_ras_restore_bad_pages(struct amdgpu_device *adev, + struct eeprom_table_record *bps, int count) +{ + int j; + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data = con->eh_data; + + for (j = 0; j < count; j++) { + if (amdgpu_ras_check_bad_page_unlock(con, + bps[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) + continue; + + if (!data->space_left && + amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { + return -ENOMEM; + } + + amdgpu_ras_reserve_page(adev, bps[j].retired_page); + + memcpy(&data->bps[data->count], &(bps[j]), + sizeof(struct eeprom_table_record)); + data->count++; + data->space_left--; + } + + return 0; +} + +static int __amdgpu_ras_convert_unit_rec_from_rom(struct amdgpu_device *adev, + struct eeprom_table_record *bps, struct ras_err_data *err_data, + enum amdgpu_memory_partition nps) +{ + int i = 0; + int ret = 0; + enum amdgpu_memory_partition save_nps; + + save_nps = (bps[0].retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; + + for (i = 0; i < adev->umc.retire_unit; i++) + bps[i].retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); + + if (save_nps) { + if (save_nps == nps) { + if (amdgpu_umc_pages_in_a_row(adev, err_data, + bps[0].retired_page << AMDGPU_GPU_PAGE_SHIFT)) + return -EINVAL; + } else { + if (amdgpu_ras_mca2pa_by_idx(adev, &bps[0], err_data)) + return -EINVAL; + } + } else { + if (amdgpu_ras_mca2pa(adev, &bps[0], err_data)) { + if (nps == AMDGPU_NPS1_PARTITION_MODE) + memcpy(err_data->err_addr, bps, + sizeof(struct eeprom_table_record) * adev->umc.retire_unit); + else + return -EOPNOTSUPP; + } + } + + return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, adev->umc.retire_unit); +} + +static int __amdgpu_ras_convert_single_rec_from_rom(struct amdgpu_device *adev, + struct eeprom_table_record *bps, struct ras_err_data *err_data, + enum amdgpu_memory_partition nps) +{ + enum amdgpu_memory_partition save_nps; + + save_nps = (bps->retired_page >> UMC_NPS_SHIFT) & UMC_NPS_MASK; + bps->retired_page &= ~(UMC_NPS_MASK << UMC_NPS_SHIFT); + + if (save_nps == nps) { + if (amdgpu_umc_pages_in_a_row(adev, err_data, + bps->retired_page << AMDGPU_GPU_PAGE_SHIFT)) + return -EINVAL; + } else { + if (amdgpu_ras_mca2pa_by_idx(adev, bps, err_data)) + return -EINVAL; + } + return __amdgpu_ras_restore_bad_pages(adev, err_data->err_addr, + adev->umc.retire_unit); +} + /* it deal with vram only. */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct eeprom_table_record *bps, int pages, bool from_rom) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - struct ras_err_handler_data *data; struct ras_err_data err_data; - struct eeprom_table_record *err_rec; struct amdgpu_ras_eeprom_control *control = &adev->psp.ras_context.ras->eeprom_control; enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; int ret = 0; - uint32_t i, j, loop_cnt = 1; - bool find_pages_per_pa = false; + uint32_t i; if (!con || !con->eh_data || !bps || pages <= 0) return 0; @@ -2825,108 +2906,41 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, sizeof(struct eeprom_table_record), GFP_KERNEL); if (!err_data.err_addr) { dev_warn(adev->dev, "Failed to alloc UMC error address record in mca2pa conversion!\n"); - ret = -ENOMEM; - goto out; + return -ENOMEM; } - err_rec = err_data.err_addr; - loop_cnt = adev->umc.retire_unit; if (adev->gmc.gmc_funcs->query_mem_partition_mode) nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); } mutex_lock(&con->recovery_lock); - data = con->eh_data; - if (!data) { - /* Returning 0 as the absence of eh_data is acceptable */ - goto free; - } - - for (i = 0; i < pages; i++) { - if (from_rom && - control->rec_type == AMDGPU_RAS_EEPROM_REC_MCA) { - if (!find_pages_per_pa) { - if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) { - if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) { - /* may use old RAS TA, use PA to find pages in - * one row - */ - if (amdgpu_umc_pages_in_a_row(adev, &err_data, - bps[i].retired_page << - AMDGPU_GPU_PAGE_SHIFT)) { - ret = -EINVAL; - goto free; - } else { - find_pages_per_pa = true; - } - } else { - /* unsupported cases */ - ret = -EOPNOTSUPP; - goto free; - } - } - } else { - if (amdgpu_umc_pages_in_a_row(adev, &err_data, - bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT)) { - ret = -EINVAL; - goto free; - } - } - } else { - if (from_rom && !find_pages_per_pa) { - if (bps[i].retired_page & UMC_CHANNEL_IDX_V2) { - /* bad page in any NPS mode in eeprom */ - if (amdgpu_ras_mca2pa_by_idx(adev, &bps[i], &err_data)) { - ret = -EINVAL; + + if (from_rom) { + for (i = 0; i < pages; i++) { + if (control->ras_num_recs - i >= adev->umc.retire_unit) { + if ((bps[i].address == bps[i + 1].address) && + (bps[i].mem_channel == bps[i + 1].mem_channel)) { + //deal retire_unit records a time + ret = __amdgpu_ras_convert_unit_rec_from_rom(adev, + &bps[i], &err_data, nps); + if (ret) goto free; - } + i += (adev->umc.retire_unit - 1); } else { - /* legacy bad page in eeprom, generated only in - * NPS1 mode - */ - if (amdgpu_ras_mca2pa(adev, &bps[i], &err_data)) { - /* old RAS TA or ASICs which don't support to - * convert addrss via mca address - */ - if (!i && nps == AMDGPU_NPS1_PARTITION_MODE) { - find_pages_per_pa = true; - err_rec = &bps[i]; - loop_cnt = 1; - } else { - /* non-nps1 mode, old RAS TA - * can't support it - */ - ret = -EOPNOTSUPP; - goto free; - } - } + break; } - - if (!find_pages_per_pa) - i += (adev->umc.retire_unit - 1); } else { - err_rec = &bps[i]; + break; } } - - for (j = 0; j < loop_cnt; j++) { - if (amdgpu_ras_check_bad_page_unlock(con, - err_rec[j].retired_page << AMDGPU_GPU_PAGE_SHIFT)) - continue; - - if (!data->space_left && - amdgpu_ras_realloc_eh_data_space(adev, data, 256)) { - ret = -ENOMEM; + for (; i < pages; i++) { + ret = __amdgpu_ras_convert_single_rec_from_rom(adev, + &bps[i], &err_data, nps); + if (ret) goto free; - } - - amdgpu_ras_reserve_page(adev, err_rec[j].retired_page); - - memcpy(&data->bps[data->count], &(err_rec[j]), - sizeof(struct eeprom_table_record)); - data->count++; - data->space_left--; } + } else { + ret = __amdgpu_ras_restore_bad_pages(adev, bps, pages); } free: @@ -2971,24 +2985,14 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev, /* only new entries are saved */ if (save_count > 0) { - if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) { + for (i = 0; i < unit_num; i++) { if (amdgpu_ras_eeprom_append(control, - &data->bps[control->ras_num_recs], - save_count)) { + &data->bps[bad_page_num + i * adev->umc.retire_unit], + 1)) { dev_err(adev->dev, "Failed to save EEPROM table data!"); return -EIO; } - } else { - for (i = 0; i < unit_num; i++) { - if (amdgpu_ras_eeprom_append(control, - &data->bps[bad_page_num + i * adev->umc.retire_unit], - 1)) { - dev_err(adev->dev, "Failed to save EEPROM table data!"); - return -EIO; - } - } } - dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", save_count); } @@ -3005,6 +3009,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) &adev->psp.ras_context.ras->eeprom_control; struct eeprom_table_record *bps; int ret; + int i = 0; /* no bad page record, skip eeprom access */ if (control->ras_num_recs == 0 || amdgpu_bad_page_threshold == 0) @@ -3018,13 +3023,23 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) if (ret) { dev_err(adev->dev, "Failed to load EEPROM table records!"); } else { - if (control->ras_num_recs > 1 && - adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { - if ((bps[0].address == bps[1].address) && - (bps[0].mem_channel == bps[1].mem_channel)) - control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; - else - control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; + if (adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { + for (i = 0; i < control->ras_num_recs; i++) { + if ((control->ras_num_recs - i) >= adev->umc.retire_unit) { + if ((bps[i].address == bps[i + 1].address) && + (bps[i].mem_channel == bps[i + 1].mem_channel)) { + control->ras_num_pa_recs += adev->umc.retire_unit; + i += (adev->umc.retire_unit - 1); + } else { + control->ras_num_mca_recs += + (control->ras_num_recs - i); + break; + } + } else { + control->ras_num_mca_recs += (control->ras_num_recs - i); + break; + } + } } ret = amdgpu_ras_eeprom_check(control); @@ -3438,12 +3453,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) return ret; if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) - control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; - - /* default status is MCA storage */ - if (control->ras_num_recs <= 1 && - adev->umc.ras && adev->umc.ras->convert_ras_err_addr) - control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; + control->ras_num_pa_recs = control->ras_num_recs; if (control->ras_num_recs) { ret = amdgpu_ras_load_bad_pages(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 83b54efcaa87..ab27cecb5519 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -727,11 +727,9 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control, - control->ras_fri) % control->ras_max_record_count; - if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) - control->ras_num_bad_pages = control->ras_num_recs; - else - control->ras_num_bad_pages = - control->ras_num_recs * adev->umc.retire_unit; + control->ras_num_mca_recs += num; + control->ras_num_bad_pages += num * adev->umc.retire_unit; + Out: kfree(buf); return res; @@ -852,6 +850,7 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, { struct amdgpu_device *adev = to_amdgpu_device(control); int res, i; + uint64_t nps = AMDGPU_NPS1_PARTITION_MODE; if (!__is_ras_eeprom_supported(adev)) return 0; @@ -865,9 +864,12 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, return -EINVAL; } + if (adev->gmc.gmc_funcs->query_mem_partition_mode) + nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); + /* set the new channel index flag */ for (i = 0; i < num; i++) - record[i].retired_page |= UMC_CHANNEL_IDX_V2; + record[i].retired_page |= (nps << UMC_NPS_SHIFT); mutex_lock(&control->ras_tbl_mutex); @@ -881,7 +883,7 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control, /* clear channel index flag, the flag is only saved on eeprom */ for (i = 0; i < num; i++) - record[i].retired_page &= ~UMC_CHANNEL_IDX_V2; + record[i].retired_page &= ~(nps << UMC_NPS_SHIFT); return res; } @@ -1392,6 +1394,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) } control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset); + control->ras_num_mca_recs = 0; + control->ras_num_pa_recs = 0; return 0; } @@ -1412,11 +1416,8 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) if (!__get_eeprom_i2c_addr(adev, control)) return -EINVAL; - if (control->rec_type == AMDGPU_RAS_EEPROM_REC_PA) - control->ras_num_bad_pages = control->ras_num_recs; - else - control->ras_num_bad_pages = - control->ras_num_recs * adev->umc.retire_unit; + control->ras_num_bad_pages = control->ras_num_pa_recs + + control->ras_num_mca_recs * adev->umc.retire_unit; if (hdr->header == RAS_TABLE_HDR_VAL) { DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 81d55cb7b397..13f7eda9a696 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -43,19 +43,6 @@ enum amdgpu_ras_eeprom_err_type { AMDGPU_RAS_EEPROM_ERR_COUNT, }; -/* - * one UMC MCA address could map to multiply physical address (PA), - * such as 1:16, we use eeprom_table_record.address to store MCA - * address and use eeprom_table_record.retired_page to save PA. - * - * AMDGPU_RAS_EEPROM_REC_PA: one record store one PA - * AMDGPU_RAS_EEPROM_REC_MCA: one record store one MCA address - */ -enum amdgpu_ras_eeprom_rec_type { - AMDGPU_RAS_EEPROM_REC_PA, - AMDGPU_RAS_EEPROM_REC_MCA, -}; - struct amdgpu_ras_eeprom_table_header { uint32_t header; uint32_t version; @@ -100,6 +87,12 @@ struct amdgpu_ras_eeprom_control { */ u32 ras_num_bad_pages; + /* Number of records store mca address */ + u32 ras_num_mca_recs; + + /* Number of records store physical address */ + u32 ras_num_pa_recs; + /* First record index to read, 0-based. * Range is [0, num_recs-1]. This is * an absolute index, starting right after @@ -120,7 +113,6 @@ struct amdgpu_ras_eeprom_control { /* Record channel info which occurred bad pages */ u32 bad_channel_bitmap; - enum amdgpu_ras_eeprom_rec_type rec_type; }; /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h index a4a7e61817aa..857693bcd8d4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h @@ -71,6 +71,13 @@ */ #define UMC_CHANNEL_IDX_V2 BIT_ULL(47) +/* + * save nps value to eeprom_table_record.retired_page[47:40], + * the channel index flag above will be retired. + */ +#define UMC_NPS_SHIFT 40 +#define UMC_NPS_MASK 0xffULL + typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst, uint32_t umc_inst, uint32_t ch_inst, void *data); -- 2.34.1