[AMD Official Use Only - General] > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Candice Li > Sent: Wednesday, March 27, 2024 2:16 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Li, Candice <Candice.Li@xxxxxxx> > Subject: [PATCH] drm/amdgpu: Update EEPROM RAS table for mismatched table > version > > Update table version and restore bad page records to EEPROM RAS table for > mismatched table version case. Otherwise force to reset the table. > > Signed-off-by: Candice Li <candice.li@xxxxxxx> > --- > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 88 ++++++++++++++++--- > 1 file changed, 78 insertions(+), 10 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > index 06a62a8a992e9b..42d0ef2f512474 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > @@ -1319,6 +1319,37 @@ static int __read_table_ras_info(struct > amdgpu_ras_eeprom_control *control) > return res == RAS_TABLE_V2_1_INFO_SIZE ? 
0 : res; } > > +static bool amdgpu_ras_eeprom_table_version_validate(struct > +amdgpu_ras_eeprom_control *control) { > + struct amdgpu_device *adev = to_amdgpu_device(control); > + struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; > + > + switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { > + case IP_VERSION(8, 10, 0): > + case IP_VERSION(12, 0, 0): > + return hdr->version == RAS_TABLE_VER_V2_1; > + default: > + return hdr->version == RAS_TABLE_VER_V1; > + } > +} > + > +static void amdgpu_ras_update_eeprom_control(struct > +amdgpu_ras_eeprom_table_header *hdr) { > + struct amdgpu_ras_eeprom_control *control = > + container_of(hdr, struct amdgpu_ras_eeprom_control, tbl_hdr); > + > + if (hdr->version == RAS_TABLE_VER_V2_1) { > + control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr); > + control->ras_record_offset = RAS_RECORD_START_V2_1; > + control->ras_max_record_count = > RAS_MAX_RECORD_COUNT_V2_1; > + } else { > + control->ras_num_recs = RAS_NUM_RECS(hdr); > + control->ras_record_offset = RAS_RECORD_START; > + control->ras_max_record_count = RAS_MAX_RECORD_COUNT; > + } > + control->ras_fri = RAS_OFFSET_TO_INDEX(control, > +hdr->first_rec_offset); } > + > int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, > bool *exceed_err_limit) > { > @@ -1326,7 +1357,9 @@ int amdgpu_ras_eeprom_init(struct > amdgpu_ras_eeprom_control *control, > unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 }; > struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > - int res; > + int res, res1; > + struct eeprom_table_record *bps; > + u32 num_recs; > > *exceed_err_limit = false; > > @@ -1355,16 +1388,51 @@ int amdgpu_ras_eeprom_init(struct > amdgpu_ras_eeprom_control *control, > > __decode_table_header_from_buf(hdr, buf); > > - if (hdr->version == RAS_TABLE_VER_V2_1) { > - control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr); > - control->ras_record_offset = RAS_RECORD_START_V2_1; > - 
control->ras_max_record_count = > RAS_MAX_RECORD_COUNT_V2_1; > - } else { > - control->ras_num_recs = RAS_NUM_RECS(hdr); > - control->ras_record_offset = RAS_RECORD_START; > - control->ras_max_record_count = RAS_MAX_RECORD_COUNT; > + amdgpu_ras_update_eeprom_control(hdr); > + > + if (!amdgpu_ras_eeprom_table_version_validate(control)) { > + num_recs = control->ras_num_recs; > + if (num_recs && amdgpu_bad_page_threshold) { > + /* Save bad page records existed in EEPROM */ > + bps = kcalloc(num_recs, sizeof(*bps), GFP_KERNEL); > + if (!bps) > + return -ENOMEM; > + > + res1 = amdgpu_ras_eeprom_read(control, bps, > num_recs); > + if (res1) > + dev_warn(adev->dev, "Fail to load EEPROM > table, force to reset > +it."); > + > + res = amdgpu_ras_eeprom_reset_table(control); > + if (res) { > + dev_err(adev->dev, "Failed to create a new > EEPROM table."); > + kfree(bps); > + return res < 0 ? res : 0; > + } > + > + if (!res1) { > + /* Update the EEPROM table with correct table > version and > + * original bad page records > + */ > + amdgpu_ras_update_eeprom_control(hdr); > + res = amdgpu_ras_eeprom_append(control, bps, > num_recs); > + > + if (res) { > + dev_warn(adev->dev, "Fail to update > EEPROM table, force to reset it."); > + res = > amdgpu_ras_eeprom_reset_table(control); [Tao] I think the reset here can be dropped. Apart from this, the patch is: Reviewed-by: Tao Zhou <tao.zhou1@xxxxxxx> It would be better to also get Stanley's Reviewed-by. > + } > + } > + > + kfree(bps); > + } else > + res = amdgpu_ras_eeprom_reset_table(control); > + > + if (res) { > + dev_err(adev->dev, "Failed to reset EEPROM table."); > + return res < 0 ? res : 0; > + } > + > + amdgpu_ras_update_eeprom_control(hdr); > } > - control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset); > > if (hdr->header == RAS_TABLE_HDR_VAL) { > DRM_DEBUG_DRIVER("Found existing EEPROM table with %d > records", > -- > 2.25.1