Applied. Thanks! On Wed, Feb 26, 2025 at 8:11 AM André Almeida <andrealmeid@xxxxxxxxxx> wrote: > > Prior to the addition of ring reset, the debug option > `debug_disable_soft_recovery` could be used to force a full device > reset. Now that we have ring reset, create a debug option to disable > it in amdgpu, forcing the driver to go with the full device > reset path again when both options are combined. > > This option is useful for testing and debugging purposes when one wants > to test the full reset from userspace. > > Signed-off-by: André Almeida <andrealmeid@xxxxxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 +++-- > 3 files changed, 10 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 69895fccb474..75dc4b962d64 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -1186,6 +1186,7 @@ struct amdgpu_device { > bool debug_use_vram_fw_buf; > bool debug_enable_ras_aca; > bool debug_exp_resets; > + bool debug_disable_gpu_ring_reset; > > bool enforce_isolation[MAX_XCP]; > /* Added this mutex for cleaner shader isolation between GFX and compute processes */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 95a05b03f799..edeb12c816e8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -136,6 +136,7 @@ enum AMDGPU_DEBUG_MASK { > AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3), > AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4), > AMDGPU_DEBUG_ENABLE_EXP_RESETS = BIT(5), > + AMDGPU_DEBUG_DISABLE_GPU_RING_RESET = BIT(6), > }; > > unsigned int amdgpu_vram_limit = UINT_MAX; > @@ -2221,6 +2222,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev) > pr_info("debug: enable experimental reset features\n"); > adev->debug_exp_resets = true; > } 
> + > + if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_GPU_RING_RESET) { > + pr_info("debug: ring reset disabled\n"); > + adev->debug_disable_gpu_ring_reset = true; > + } > } > > static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index 7b79b0f39ba1..8ab23182127e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -137,8 +137,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) > dma_fence_set_error(&s_job->s_fence->finished, -ETIME); > > /* attempt a per ring reset */ > - if (amdgpu_gpu_recovery && > - ring->funcs->reset) { > + if (unlikely(adev->debug_disable_gpu_ring_reset)) { > + dev_err(adev->dev, "Ring reset disabled by debug mask\n"); > + } else if (amdgpu_gpu_recovery && ring->funcs->reset) { > dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name); > /* stop the scheduler, but don't mess with the > * bad job yet because if ring reset fails > -- > 2.48.1 >