When the EEPROM RAS table version does not match the version expected
for the UMC IP version, update the table version and restore the
existing bad page records to the EEPROM RAS table. Otherwise (there is
nothing to restore, or the records cannot be read back or re-appended),
force a reset of the table.

Signed-off-by: Candice Li <candice.li@xxxxxxx>
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 88 ++++++++++++++++---
 1 file changed, 78 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 06a62a8a992e9b..42d0ef2f512474 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1319,6 +1319,37 @@ static int __read_table_ras_info(struct amdgpu_ras_eeprom_control *control)
 	return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
 }
 
+static bool amdgpu_ras_eeprom_table_version_validate(struct amdgpu_ras_eeprom_control *control)
+{
+	struct amdgpu_device *adev = to_amdgpu_device(control);
+	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+
+	switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
+	case IP_VERSION(8, 10, 0):
+	case IP_VERSION(12, 0, 0):
+		return hdr->version == RAS_TABLE_VER_V2_1;
+	default:
+		return hdr->version == RAS_TABLE_VER_V1;
+	}
+}
+
+static void amdgpu_ras_update_eeprom_control(struct amdgpu_ras_eeprom_table_header *hdr)
+{
+	struct amdgpu_ras_eeprom_control *control =
+		container_of(hdr, struct amdgpu_ras_eeprom_control, tbl_hdr);
+
+	if (hdr->version == RAS_TABLE_VER_V2_1) {
+		control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
+		control->ras_record_offset = RAS_RECORD_START_V2_1;
+		control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
+	} else {
+		control->ras_num_recs = RAS_NUM_RECS(hdr);
+		control->ras_record_offset = RAS_RECORD_START;
+		control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+	}
+	control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
+}
+
 int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 			   bool *exceed_err_limit)
 {
@@ -1326,7 +1357,9 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 	unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
 	struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-	int res;
+	int res, res1;
+	struct eeprom_table_record *bps;
+	u32 num_recs;
 
 	*exceed_err_limit = false;
 
@@ -1355,16 +1388,51 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
 
 	__decode_table_header_from_buf(hdr, buf);
 
-	if (hdr->version == RAS_TABLE_VER_V2_1) {
-		control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
-		control->ras_record_offset = RAS_RECORD_START_V2_1;
-		control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
-	} else {
-		control->ras_num_recs = RAS_NUM_RECS(hdr);
-		control->ras_record_offset = RAS_RECORD_START;
-		control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+	amdgpu_ras_update_eeprom_control(hdr);
+
+	if (!amdgpu_ras_eeprom_table_version_validate(control)) {
+		num_recs = control->ras_num_recs;
+		if (num_recs && amdgpu_bad_page_threshold) {
+			/* Save bad page records existed in EEPROM */
+			bps = kcalloc(num_recs, sizeof(*bps), GFP_KERNEL);
+			if (!bps)
+				return -ENOMEM;
+
+			res1 = amdgpu_ras_eeprom_read(control, bps, num_recs);
+			if (res1)
+				dev_warn(adev->dev, "Fail to load EEPROM table, force to reset it.");
+
+			res = amdgpu_ras_eeprom_reset_table(control);
+			if (res) {
+				dev_err(adev->dev, "Failed to create a new EEPROM table.");
+				kfree(bps);
+				return res < 0 ? res : 0;
+			}
+
+			if (!res1) {
+				/* Update the EEPROM table with correct table version and
+				 * original bad page records
+				 */
+				amdgpu_ras_update_eeprom_control(hdr);
+				res = amdgpu_ras_eeprom_append(control, bps, num_recs);
+
+				if (res) {
+					dev_warn(adev->dev, "Fail to update EEPROM table, force to reset it.");
+					res = amdgpu_ras_eeprom_reset_table(control);
+				}
+			}
+
+			kfree(bps);
+		} else
+			res = amdgpu_ras_eeprom_reset_table(control);
+
+		if (res) {
+			dev_err(adev->dev, "Failed to reset EEPROM table.");
+			return res < 0 ? res : 0;
+		}
+
+		amdgpu_ras_update_eeprom_control(hdr);
 	}
-	control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
 
 	if (hdr->header == RAS_TABLE_HDR_VAL) {
 		DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
-- 
2.25.1