> -----Original Message----- > From: Chen, Guchun <Guchun.Chen@xxxxxxx> > Sent: 2019年9月2日 10:11 > To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; > Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>; Li, Dennis > <Dennis.Li@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: RE: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS > > > > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Tao > Zhou > Sent: Friday, August 30, 2019 8:25 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Grodzovsky, Andrey > <Andrey.Grodzovsky@xxxxxxx>; Chen, Guchun <Guchun.Chen@xxxxxxx>; > Li, Dennis <Dennis.Li@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx> > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: [PATCH 2/4] drm/amdgpu: Hook EEPROM table to RAS > > support eeprom records load and save for ras, move EEPROM records > storing to bad page reserving > > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 111 ++++++++++++++++++-- > ---- > 1 file changed, 83 insertions(+), 28 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 24663ec41248..02120aa3cb5d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1348,6 +1348,72 @@ int amdgpu_ras_add_bad_pages(struct > amdgpu_device *adev, > return ret; > } > > +/* > + * write error record array to eeprom, the function should be > + * protected by recovery_lock > + */ > +static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) { > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + struct ras_err_handler_data *data; > + struct amdgpu_ras_eeprom_control *control = > + &adev->psp.ras.ras->eeprom_control; > + int save_count; > + > + if (!con || !con->eh_data) > + return 0; > + > + data = con->eh_data; > + 
if (!data) > + return 0; > [Guchun] This check (!data) is redundant and not needed, as we have > already checked !con->eh_data earlier and the whole function is protected by > recovery_lock. [Tao] OK, I'll remove it. > > + save_count = data->count - control->num_recs; > + /* only new entries are saved */ > + if (save_count > 0) > + if (amdgpu_ras_eeprom_process_recods(&con- > >eeprom_control, > + &data->bps[control- > >num_recs], > + true, > + save_count)) { > + DRM_ERROR("Failed to save EEPROM table data!"); > + return -EIO; > + } > + > + return 0; > +} > + > +/* > + * read error record array in eeprom and reserve enough space for > + * storing new bad pages > + */ > +static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) { > + struct amdgpu_ras_eeprom_control *control = > + &adev->psp.ras.ras->eeprom_control; > + struct eeprom_table_record *bps = NULL; > + int ret = 0; > + > + /* no bad page record, skip eeprom access */ > + if (!control->num_recs) > + return ret; > + > + bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); > + if (!bps) > + return -ENOMEM; > + > + if (amdgpu_ras_eeprom_process_recods(control, bps, false, > + control->num_recs)) { > + DRM_ERROR("Failed to load EEPROM table records!"); > + ret = -EIO; > + goto out; > + } > + > + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); > + > +out: > + kfree(bps); > + return ret; > +} > + > /* called in gpu recovery/init */ > int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ - > 1355,7 +1421,7 @@ int amdgpu_ras_reserve_bad_pages(struct > amdgpu_device *adev) > struct ras_err_handler_data *data; > uint64_t bp; > struct amdgpu_bo *bo; > - int i; > + int i, ret = 0; > > if (!con || !con->eh_data) > return 0; > @@ -1375,9 +1441,11 @@ int amdgpu_ras_reserve_bad_pages(struct > amdgpu_device *adev) > data->bps_bo[i] = bo; > data->last_reserved = i + 1; > } > + > + ret = amdgpu_ras_save_bad_pages(adev); > out: > mutex_unlock(&con->recovery_lock); > - return 0; > + return ret; > 
} > > /* called when driver unload */ > @@ -1409,33 +1477,11 @@ static int amdgpu_ras_release_bad_pages(struct > amdgpu_device *adev) > return 0; > } > > -static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) -{ > - /* TODO > - * write the array to eeprom when SMU disabled. > - */ > - return 0; > -} > - > -/* > - * read error record array in eeprom and reserve enough space for > - * storing new bad pages > - */ > -static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) -{ > - struct eeprom_table_record *bps = NULL; > - int ret; > - > - ret = amdgpu_ras_add_bad_pages(adev, bps, > - adev->umc.max_ras_err_cnt_per_query); > - > - return ret; > -} > - > static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_err_handler_data **data = &con->eh_data; > + int ret; > > *data = kmalloc(sizeof(**data), > GFP_KERNEL|__GFP_ZERO); > @@ -1447,8 +1493,18 @@ static int amdgpu_ras_recovery_init(struct > amdgpu_device *adev) > atomic_set(&con->in_recovery, 0); > con->adev = adev; > > - amdgpu_ras_load_bad_pages(adev); > - amdgpu_ras_reserve_bad_pages(adev); > + ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras- > >eeprom_control); > + if (ret) > + return ret; > + > + if (adev->psp.ras.ras->eeprom_control.num_recs) { > + ret = amdgpu_ras_load_bad_pages(adev); > + if (ret) > + return ret; > + ret = amdgpu_ras_reserve_bad_pages(adev); > + if (ret) > + return ret; > + } > > return 0; > } > @@ -1459,7 +1515,6 @@ static int amdgpu_ras_recovery_fini(struct > amdgpu_device *adev) > struct ras_err_handler_data *data = con->eh_data; > > cancel_work_sync(&con->recovery_work); > - amdgpu_ras_save_bad_pages(adev); > amdgpu_ras_release_bad_pages(adev); > > mutex_lock(&con->recovery_lock); > -- > 2.17.1 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx 
_______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx