When retrieving bad gpu tag from eeprom, GPU init should fail as the GPU needs to be retired for further check. Signed-off-by: Guchun Chen <guchun.chen@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 12 +++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 15 +++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 11 ++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 3 ++- 4 files changed, 34 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 2662cd7c8685..882f8a0964a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2059,13 +2059,19 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) * it should be called after amdgpu_device_ip_hw_init_phase2 since * for some ASICs the RAS EEPROM code relies on SMU fully functioning * for I2C communication which only true at this point. - * recovery_init may fail, but it can free all resources allocated by - * itself and its failure should not stop amdgpu init process. + * + * amdgpu_ras_recovery_init may fail, but the upper only cares the + * failure from bad gpu situation and stop amdgpu init process + * arrordingly. For other failed cases, it will still release all + * the resource and print error message, rather than returning one + * negative value to upper level. * * Note: theoretically, this should be called before all vram allocations * to protect retired page from abusing */ - amdgpu_ras_recovery_init(adev); + r = amdgpu_ras_recovery_init(adev); + if (r) + goto init_failed; if (adev->gmc.xgmi.num_physical_nodes > 1) amdgpu_xgmi_add_device(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 8daeb54917ed..06db2f0b78d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1819,6 +1819,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; uint32_t max_eeprom_records_len = 0; + bool bad_gpu = false; int ret; if (con) @@ -1840,9 +1841,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); - ret = amdgpu_ras_eeprom_init(&con->eeprom_control); - if (ret) + ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &bad_gpu); + /* We only fail this calling and halt booting when bad_gpu is true. */ + if (bad_gpu) { + ret = -EINVAL; goto free; + } if (con->eeprom_control.num_recs) { ret = amdgpu_ras_load_bad_pages(adev); @@ -1865,6 +1869,13 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) out: dev_warn(adev->dev, "Failed to initialize ras recovery!\n"); + /* + * Except bad_gpu case, other failure cases in this function + * would not fail amdgpu driver init. + */ + if (!bad_gpu) + ret = 0; + return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 35c0c849d49b..3f1b167afe6b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -241,12 +241,14 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) } -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, + bool *bad_gpu) { int ret = 0; struct amdgpu_device *adev = to_amdgpu_device(control); unsigned char buff[EEPROM_ADDRESS_SIZE + EEPROM_TABLE_HEADER_SIZE] = { 0 }; struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr; + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); struct i2c_msg msg = { .addr = 0, .flags = I2C_M_RD, @@ -254,6 +256,8 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) .buf = buff, }; + *bad_gpu = false; + /* Verify i2c adapter is initialized */ if (!adev->pm.smu_i2c.algo) return -ENOENT; @@ -282,6 +286,11 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control) DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records", control->num_recs); + } else if ((ras->bad_page_cnt_threshold != 0xFFFFFFFF) && ( + hdr->header == EEPROM_TABLE_HDR_BAD)) { + *bad_gpu = true; + DRM_ERROR("Detect BAD GPU TAG in eeprom table and " + "GPU shall be retired.\n"); } else { DRM_INFO("Creating new EEPROM table"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index b272840cb069..a2de243da31d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -77,7 +77,8 @@ struct eeprom_table_record { unsigned char mcumc_id; }__attribute__((__packed__)); -int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control); +int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control, + bool *bad_gpu); int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control); int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx