-----Original Message----- From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao Zhou Sent: Friday, August 30, 2019 8:25 PM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>; Chen, Guchun <Guchun.Chen@xxxxxxx>; Li, Dennis <Dennis.Li@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx> Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS support eeprom records load and save for ras, move EEPROM records storing to bad page reserving Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++------ 1 file changed, 83 insertions(+), 28 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 24663ec41248..02120aa3cb5d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, return ret; } +/* + * write error record array to eeprom, the function should be + * protected by recovery_lock + */ +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) { + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data; + struct amdgpu_ras_eeprom_control *control = + &adev->psp.ras.ras->eeprom_control; + int save_count; + + if (!con || !con->eh_data) + return 0; + + data = con->eh_data; + if (!data) + return 0; [Guchun]Such check (!data) is redundant and not needed. As we have checked !con->eh_data earlier, and the whole function is protected by recovery_lock. + save_count = data->count - control->num_recs; + /* only new entries are saved */ + if (save_count > 0) + if (amdgpu_ras_eeprom_process_recods(&con->eeprom_control, + &data->bps[control->num_recs], + true, + save_count)) { + DRM_ERROR("Failed to save EEPROM table data!"); + return -EIO; + } + + return 0; +} + +/* + * read error record array in eeprom and reserve enough space for + * storing new bad pages + */ +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) { + struct amdgpu_ras_eeprom_control *control = + &adev->psp.ras.ras->eeprom_control; + struct eeprom_table_record *bps = NULL; + int ret = 0; + + /* no bad page record, skip eeprom access */ + if (!control->num_recs) + return ret; + + bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); + if (!bps) + return -ENOMEM; + + if (amdgpu_ras_eeprom_process_recods(control, bps, false, + control->num_recs)) { + DRM_ERROR("Failed to load EEPROM table records!"); + ret = -EIO; + goto out; + } + + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); + +out: + kfree(bps); + return ret; +} + /* called in gpu recovery/init */ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ -1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) struct ras_err_handler_data *data; uint64_t bp; struct amdgpu_bo *bo; - int i; + int i, ret = 0; if (!con || !con->eh_data) return 0; @@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) data->bps_bo[i] = bo; data->last_reserved = i + 1; } + + ret = amdgpu_ras_save_bad_pages(adev); out: mutex_unlock(&con->recovery_lock); - return 0; + return ret; } /* called when driver unload */ @@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) return 0; } -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{ - /* TODO - * write the array to eeprom when SMU disabled. - */ - return 0; -} - -/* - * read error record array in eeprom and reserve enough space for - * storing new bad pages - */ -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{ - struct eeprom_table_record *bps = NULL; - int ret; - - ret = amdgpu_ras_add_bad_pages(adev, bps, - adev->umc.max_ras_err_cnt_per_query); - - return ret; -} - static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data = &con->eh_data; + int ret; *data = kmalloc(sizeof(**data), GFP_KERNEL|__GFP_ZERO); @@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->adev = adev; - amdgpu_ras_load_bad_pages(adev); - amdgpu_ras_reserve_bad_pages(adev); + ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control); + if (ret) + return ret; + + if (adev->psp.ras.ras->eeprom_control.num_recs) { + ret = amdgpu_ras_load_bad_pages(adev); + if (ret) + return ret; + ret = amdgpu_ras_reserve_bad_pages(adev); + if (ret) + return ret; + } return 0; } @@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) struct ras_err_handler_data *data = con->eh_data; cancel_work_sync(&con->recovery_work); - amdgpu_ras_save_bad_pages(adev); amdgpu_ras_release_bad_pages(adev); mutex_lock(&con->recovery_lock); -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx