When GPU executes recovery and retriving bad GPU tag from external eerpom device, the recovery will be broken and error message is printed as well for user's awareness. v2: Refine warning message in threshold reaching case, and fix spelling typo. v3: Fix explicit calling of bad gpu. v4: Rename function names. Signed-off-by: Guchun Chen <guchun.chen@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 20 ++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 16 ++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 + .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 40 +++++++++++++++++++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 4 ++ 5 files changed, 79 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 30af0dfee1a1..c893d9adbab7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4139,8 +4139,23 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, amdgpu_fbdev_set_suspend(tmp_adev, 0); - /* must succeed. */ - amdgpu_ras_resume(tmp_adev); + /* + * The GPU enters bad state once faulty pages + * by ECC has reached the threshold, and ras + * recovery is scheduled next. So add one check + * here to break recovery if it indeed exceeds + * bad page threshold, and remind user to + * retire this GPU or setting one bigger + * bad_page_threshold value to fix this once + * probing driver again. + */ + if (!amdgpu_ras_check_err_threshold(tmp_adev)) { + /* must succeed. */ + amdgpu_ras_resume(tmp_adev); + } else { + r = -EINVAL; + goto out; + } /* Update PSP FW topology after reset */ if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) @@ -4148,7 +4163,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, } } - out: if (!r) { amdgpu_irq_gpu_reset_resume_helper(tmp_adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 64ae0742f385..fab6f8d6bee6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -2206,3 +2206,19 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev) return false; } + +bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + bool exc_err_limit = false; + + if (con && (amdgpu_bad_page_threshold != 0)) + amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control, + &exc_err_limit); + + /* + * We are only interested in variable exc_err_limit, + * as it says if GPU is in bad state or not. + */ + return exc_err_limit; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index cf9f60202334..70a6fca73617 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -497,6 +497,8 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev); unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, bool is_ce); +bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev); + /* error handling functions */ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, struct eeprom_table_record *bps, int pages); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index d24bf65f6dd7..be895dc2d739 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -386,6 +386,46 @@ static uint32_t __correct_eeprom_dest_address(uint32_t curr_address) return curr_address; } +int amdgpu_ras_eeprom_check_err_threshold( + struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit) +{ + struct amdgpu_device *adev = to_amdgpu_device(control); + unsigned char buff[EEPROM_ADDRESS_SIZE + + EEPROM_TABLE_HEADER_SIZE] = { 0 }; + struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; + struct i2c_msg msg = { + .addr = control->i2c_address, + .flags = I2C_M_RD, + .len = EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE, + .buf = buff, + }; + int ret; + + *exceed_err_limit = false; + + /* read EEPROM table header */ + mutex_lock(&control->tbl_mutex); + ret = i2c_transfer(&adev->pm.smu_i2c, &msg, 1); + if (ret < 1) { + dev_err(adev->dev, "Failed to read EEPROM table header.\n"); + goto err; + } + + __decode_table_header_from_buff(hdr, &buff[2]); + + if (hdr->header == EEPROM_TABLE_HDR_BAD) { + dev_warn(adev->dev, "This GPU is in BAD status."); + dev_warn(adev->dev, "Please retire it or setting one bigger " + "threshold value when reloading driver.\n"); + *exceed_err_limit = true; + } + +err: + mutex_unlock(&control->tbl_mutex); + return 0; +} + int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *records, bool write, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index f245b96d9599..f29fafea5392 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -81,6 +81,10 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, bool *exceed_err_limit); int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); +int amdgpu_ras_eeprom_check_err_threshold( + struct amdgpu_ras_eeprom_control *control, + bool *exceed_err_limit); + int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *records, bool write, -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx