On 8/30/19 8:24 AM, Tao Zhou wrote: > ras recovery_init should be called after ttm and smu init, > bad page reserve should be put in front of gpu reset since i2c > may be unstable during gpu reset > add cleanup for recovery_init and recovery_fini > > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++---- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 28 +++++++++++++--------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++++ > 3 files changed, 33 insertions(+), 16 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 67b09cb2a9e2..4e4895ac926d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -2894,6 +2894,17 @@ int amdgpu_device_init(struct amdgpu_device *adev, > goto failed; > } > > + /* retired pages will be loaded from eeprom and reserved here, > + * new bo may be created, it should be called after ttm init, > + * accessing eeprom also relies on smu (unlock i2c bus) and it > + * should be called after smu init > + * > + * Note: theoretically, this should be called before all vram allocations > + * to protect retired page from abusing, but there are some allocations > + * in front of smu fw loading > + */ > + amdgpu_ras_recovery_init(adev); You probably should check for return value here. > + > /* must succeed. */ > amdgpu_ras_resume(adev); > > @@ -3627,11 +3638,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, > break; > } > } > - > - list_for_each_entry(tmp_adev, device_list_handle, > - gmc.xgmi.head) { > - amdgpu_ras_reserve_bad_pages(tmp_adev); > - } > } > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 02120aa3cb5d..ad67a109122f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -1477,14 +1477,13 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) > return 0; > } > > -static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_err_handler_data **data = &con->eh_data; > int ret; > > - *data = kmalloc(sizeof(**data), > - GFP_KERNEL|__GFP_ZERO); > + *data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO); > if (!*data) > return -ENOMEM; > > @@ -1495,18 +1494,29 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > > ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control); > if (ret) > - return ret; > + goto cancel_work; Why you need do go to cancel_work_sync(&con->recovery_work) at such early stage of device init, is RAS active already by this time and RAS interrupt might fire triggering reset ? Andrey > > if (adev->psp.ras.ras->eeprom_control.num_recs) { > ret = amdgpu_ras_load_bad_pages(adev); > if (ret) > - return ret; > + goto cancel_work; > ret = amdgpu_ras_reserve_bad_pages(adev); > if (ret) > - return ret; > + goto release; > } > > return 0; > + > +release: > + amdgpu_ras_release_bad_pages(adev); > +cancel_work: > + cancel_work_sync(&con->recovery_work); > + con->eh_data = NULL; > + kfree((*data)->bps); > + kfree((*data)->bps_bo); > + kfree(*data); > + > + return ret; > } > > static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) > @@ -1520,6 +1530,7 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) > mutex_lock(&con->recovery_lock); > con->eh_data = NULL; > kfree(data->bps); > + kfree(data->bps_bo); > kfree(data); > mutex_unlock(&con->recovery_lock); > > @@ -1611,9 +1622,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > return r; > } > > - if (amdgpu_ras_recovery_init(adev)) > - goto recovery_out; > - > amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK; > > if (amdgpu_ras_fs_init(adev)) > @@ -1628,8 +1636,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev) > con->hw_supported, con->supported); > return 0; > fs_out: > - amdgpu_ras_recovery_fini(adev); > -recovery_out: > amdgpu_ras_set_context(adev, NULL); > kfree(con); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 42e1d379e21c..6d00f79b612b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, > return ras && (ras->supported & (1 << block)); > } > > +int amdgpu_ras_recovery_init(struct amdgpu_device *adev); > int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, > unsigned int block); > > @@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev, > { > struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); > > + /* save bad page to eeprom before gpu reset, > + * i2c may be unstable in gpu reset > + */ > + amdgpu_ras_reserve_bad_pages(adev); > if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0) > schedule_work(&ras->recovery_work); > return 0; _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx