Am 13.12.2017 um 20:01 schrieb Andrey Grodzovsky: > Add new parameter to control GPU recovery procedure. > Retire old way of disabling GPU recovery by setting lockup_timeout == 0 and > set default for lockup_timeout to 10s. > > v2: > Add auto logic where reset is disabled for bare metal and enabled > for SR-IOV. > Allow forced reset from debugfs. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com> Reviewed-by: Christian König <christian.koenig at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 ++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 +- > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- > drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 2 +- > 8 files changed, 19 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 3735500..d7f0263 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -126,6 +126,7 @@ extern int amdgpu_param_buf_per_se; > extern int amdgpu_job_hang_limit; > extern int amdgpu_lbpw; > extern int amdgpu_compute_multipipe; > +extern int amdgpu_gpu_recovery; > > #ifdef CONFIG_DRM_AMDGPU_SI > extern int amdgpu_si_support; > @@ -1879,7 +1880,7 @@ amdgpu_get_sdma_instance(struct amdgpu_ring *ring) > #define amdgpu_psp_check_fw_loading_status(adev, i) (adev)->firmware.funcs->check_fw_loading_status((adev), (i)) > > /* Common functions */ > -int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job); > +int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job* job, bool force); > bool amdgpu_need_backup(struct amdgpu_device *adev); > void amdgpu_pci_config_reset(struct amdgpu_device *adev); > bool amdgpu_need_post(struct amdgpu_device *adev); > diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 8d03baa..a074502 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3015,11 +3015,12 @@ static int amdgpu_reset_sriov(struct amdgpu_device *adev, uint64_t *reset_flags, > * > * @adev: amdgpu device pointer > * @job: which job trigger hang > + * @force: forces reset regardless of amdgpu_gpu_recovery > * > * Attempt to reset the GPU if it has hung (all asics). > * Returns 0 for success or an error on failure. > */ > -int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job) > +int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job, bool force) > { > struct drm_atomic_state *state = NULL; > uint64_t reset_flags = 0; > @@ -3030,6 +3031,12 @@ int amdgpu_gpu_recover(struct amdgpu_device *adev, struct amdgpu_job *job) > return 0; > } > > + if (!force && (amdgpu_gpu_recovery == 0 || > + (amdgpu_gpu_recovery == -1 && !amdgpu_sriov_vf(adev)))) { > + DRM_INFO("GPU recovery disabled.\n"); > + return 0; > + } > + > dev_info(adev->dev, "GPU reset begin!\n"); > > mutex_lock(&adev->lock_reset); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 0b039bd..b734cd6 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -128,6 +128,7 @@ int amdgpu_param_buf_per_se = 0; > int amdgpu_job_hang_limit = 0; > int amdgpu_lbpw = -1; > int amdgpu_compute_multipipe = -1; > +int amdgpu_gpu_recovery = -1; /* auto */ > > MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes"); > module_param_named(vramlimit, amdgpu_vram_limit, int, 0600); > @@ -280,6 +281,9 @@ module_param_named(lbpw, amdgpu_lbpw, int, 0444); > MODULE_PARM_DESC(compute_multipipe, "Force compute queues to be spread across pipes (1 = enable, 0 = disable, -1 = auto)"); > module_param_named(compute_multipipe, 
amdgpu_compute_multipipe, int, 0444); > > +MODULE_PARM_DESC(gpu_recovery, "Enable GPU recovery mechanism, (1 = enable, 0 = disable, -1 = auto)"); > +module_param_named(gpu_recovery, amdgpu_gpu_recovery, int, 0444); > + > #ifdef CONFIG_DRM_AMDGPU_SI > > #if defined(CONFIG_DRM_RADEON) || defined(CONFIG_DRM_RADEON_MODULE) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > index 1469963..854baf0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > @@ -705,7 +705,7 @@ static int amdgpu_debugfs_gpu_recover(struct seq_file *m, void *data) > struct amdgpu_device *adev = dev->dev_private; > > seq_printf(m, "gpu recover\n"); > - amdgpu_gpu_recover(adev, NULL); > + amdgpu_gpu_recover(adev, NULL, true); > > return 0; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > index c340774..c43643e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > @@ -88,7 +88,7 @@ static void amdgpu_irq_reset_work_func(struct work_struct *work) > reset_work); > > if (!amdgpu_sriov_vf(adev)) > - amdgpu_gpu_recover(adev, NULL); > + amdgpu_gpu_recover(adev, NULL, false); > } > > /* Disable *all* interrupts */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 013c0a8..be8a437 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -37,7 +37,7 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job) > atomic_read(&job->ring->fence_drv.last_seq), > job->ring->fence_drv.sync_seq); > > - amdgpu_gpu_recover(job->adev, job); > + amdgpu_gpu_recover(job->adev, job, false); > } > > int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs, > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index 71f5690..7ade56d 100644 > --- 
a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -253,7 +253,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > } > > /* Trigger recovery due to world switch failure */ > - amdgpu_gpu_recover(adev, NULL); > + amdgpu_gpu_recover(adev, NULL, false); > } > > static int xgpu_ai_set_mailbox_rcv_irq(struct amdgpu_device *adev, > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > index df52824..e05823d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c > @@ -521,7 +521,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) > } > > /* Trigger recovery due to world switch failure */ > - amdgpu_gpu_recover(adev, NULL); > + amdgpu_gpu_recover(adev, NULL, false); > } > > static int xgpu_vi_set_mailbox_rcv_irq(struct amdgpu_device *adev,