> -----Original Message----- > From: Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx> > Sent: 2019年8月30日 22:03 > To: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx; > Chen, Guchun <Guchun.Chen@xxxxxxx>; Li, Dennis <Dennis.Li@xxxxxxx>; > Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Subject: Re: [PATCH 4/4] drm/amdgpu: move the call of ras recovery_init and > bad page reserve to proper place > > > On 8/30/19 8:24 AM, Tao Zhou wrote: > > ras recovery_init should be called after ttm and smu init, bad page > > reserve should be put in front of gpu reset since i2c may be unstable > > during gpu reset add cleanup for recovery_init and recovery_fini > > > > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++---- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 +++++++++++++------- > -- > > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++++ > > 3 files changed, 33 insertions(+), 16 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > index 67b09cb2a9e2..4e4895ac926d 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device > *adev, > > goto failed; > > } > > > > + /* retired pages will be loaded from eeprom and reserved here, > > + * new bo may be created, it should be called after ttm init, > > + * accessing eeprom also relies on smu (unlock i2c bus) and it > > + * should be called after smu init > > + * > > + * Note: theoretically, this should be called before all vram allocations > > + * to protect retired page from abusing, but there are some > allocations > > + * in front of smu fw loading > > + */ > > + amdgpu_ras_recovery_init(adev); > > > You probably should check for return value here. [Tao] No check here is intentional, the failure of recovery_init should not stop amdgpu init process and recovery_init could free resources allocated by itself if it fails. I'll add comment here and add a DRM_WARN for recovery_init. > > > > + > > /* must succeed. */ > > amdgpu_ras_resume(adev); > > > > @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct > amdgpu_hive_info *hive, > > break; > > } > > } > > - > > - list_for_each_entry(tmp_adev, device_list_handle, > > - gmc.xgmi.head) { > > - amdgpu_ras_reserve_bad_pages(tmp_adev); > > - } > > } > > } > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > index 02120aa3cb5d..ad67a109122f 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > > @@ -1477,14 +1477,13 @@ static int > amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) > > return 0; > > } > > > > -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > > { > > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > > struct ras_err_handler_data **data = &con->eh_data; > > int ret; > > > > - *data = kmalloc(sizeof(**data), > > - GFP_KERNEL|__GFP_ZERO); > > + *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); > > if (!*data) > > return -ENOMEM; > > > > @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct > > amdgpu_device *adev) > > > > ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras- > >eeprom_control); > > if (ret) > > - return ret; > > + goto cancel_work; > > > Why you need do go to cancel_work_sync(&con->recovery_work) at such > early stage of device init, is RAS active already by this time and RAS interrupt > might fire triggering reset ? > > Andrey > [Tao] Good point, I'll remove cancel_work_sync. > > > > > if (adev->psp.ras.ras->eeprom_control.num_recs) { > > ret = amdgpu_ras_load_bad_pages(adev); > > if (ret) > > - return ret; > > + goto cancel_work; > > ret = amdgpu_ras_reserve_bad_pages(adev); > > if (ret) > > - return ret; > > + goto release; > > } > > > > return 0; > > + > > +release: > > + amdgpu_ras_release_bad_pages(adev); > > +cancel_work: > > + cancel_work_sync(&con->recovery_work); > > + con->eh_data = NULL; > > + kfree((*data)->bps); > > + kfree((*data)->bps_bo); > > + kfree(*data); > > + > > + return ret; > > } > > > > static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) @@ > > -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct > amdgpu_device *adev) > > mutex_lock(&con->recovery_lock); > > con->eh_data = NULL; > > kfree(data->bps); > > + kfree(data->bps_bo); > > kfree(data); > > mutex_unlock(&con->recovery_lock); > > > > @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device > *adev) > > return r; > > } > > > > - if (amdgpu_ras_recovery_init(adev)) > > - goto recovery_out; > > - > > amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; > > > > if (amdgpu_ras_fs_init(adev)) > > @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device > *adev) > > con->hw_supported, con->supported); > > return 0; > > fs_out: > > - amdgpu_ras_recovery_fini(adev); > > -recovery_out: > > amdgpu_ras_set_context(adev, NULL); > > kfree(con); > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > index 42e1d379e21c..6d00f79b612b 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > > @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct > amdgpu_device *adev, > > return ras && (ras->supported & (1 << block)); > > } > > > > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev); > > int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, > > unsigned int block); > > > > @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct > amdgpu_device *adev, > > { > > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > > > + /* save bad page to eeprom before gpu reset, > > + * i2c may be unstable in gpu reset > > + */ > > + amdgpu_ras_reserve_bad_pages(adev); > > if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) > > schedule_work(&ras->recovery_work); > > return 0; _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx