On Thu, Jan 4, 2024 at 12:55 AM Ma Jun <Jun.Ma2@xxxxxxx> wrote: > > Fix the warning info below during mode1 reset. > [ +0.000004] Call Trace: > [ +0.000004] <TASK> > [ +0.000006] ? show_regs+0x6e/0x80 > [ +0.000011] ? __flush_work.isra.0+0x2e8/0x390 > [ +0.000005] ? __warn+0x91/0x150 > [ +0.000009] ? __flush_work.isra.0+0x2e8/0x390 > [ +0.000006] ? report_bug+0x19d/0x1b0 > [ +0.000013] ? handle_bug+0x46/0x80 > [ +0.000012] ? exc_invalid_op+0x1d/0x80 > [ +0.000011] ? asm_exc_invalid_op+0x1f/0x30 > [ +0.000014] ? __flush_work.isra.0+0x2e8/0x390 > [ +0.000007] ? __flush_work.isra.0+0x208/0x390 > [ +0.000007] ? _prb_read_valid+0x216/0x290 > [ +0.000008] __cancel_work_timer+0x11d/0x1a0 > [ +0.000007] ? try_to_grab_pending+0xe8/0x190 > [ +0.000012] cancel_work_sync+0x14/0x20 > [ +0.000008] amddrm_sched_stop+0x3c/0x1d0 [amd_sched] > [ +0.000032] amdgpu_device_gpu_recover+0x29a/0xe90 [amdgpu] > > This warning info was printed after applying the patch > "drm/sched: Convert drm scheduler to use a work queue rather than kthread". > The root cause is that amdgpu driver tries to use the uninitialized > work_struct in the struct drm_gpu_scheduler > > Signed-off-by: Ma Jun <Jun.Ma2@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 16 +++++++++++++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 3 ++- > 2 files changed, 15 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 4b1d5f42249f..7bac3019aa0a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5575,6 +5575,16 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) > > } > > +static bool amdgpu_is_ring_sched_ready(struct amdgpu_ring *ring) I would either rename this to amdgpu_device_is_ring_sched_ready() for consistency with the amdgpu_device_* prefix used by the other functions in amdgpu_device.c, or move it to amdgpu_ring.c and rename it to amdgpu_ring_sched_ready(). 
Alex > +{ > + if (!ring) > + return false; > + > + if (ring->no_scheduler || !drm_sched_wqueue_ready(&ring->sched)) > + return false; > + > + return true; > +} > /** > * amdgpu_device_gpu_recover - reset the asic and recover scheduler > * > @@ -5700,7 +5710,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > struct amdgpu_ring *ring = tmp_adev->rings[i]; > > - if (!ring || !drm_sched_wqueue_ready(&ring->sched)) > + if (!amdgpu_is_ring_sched_ready(ring)) > continue; > > drm_sched_stop(&ring->sched, job ? &job->base : NULL); > @@ -5776,7 +5786,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > struct amdgpu_ring *ring = tmp_adev->rings[i]; > > - if (!ring || !drm_sched_wqueue_ready(&ring->sched)) > + if (!amdgpu_is_ring_sched_ready(ring)) > continue; > > drm_sched_start(&ring->sched, true); > @@ -6265,7 +6275,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) > for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { > struct amdgpu_ring *ring = adev->rings[i]; > > - if (!ring || !drm_sched_wqueue_ready(&ring->sched)) > + if (!amdgpu_is_ring_sched_ready(ring)) > continue; > > drm_sched_start(&ring->sched, true); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > index 41266bc99345..3650e4d06e53 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > @@ -636,7 +636,8 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring) > DRM_DEV_DEBUG(adev->dev, "ring test on %s succeeded\n", > ring->name); > > - ring->sched.ready = !r; > + if (!ring->no_scheduler) > + ring->sched.ready = !r; > return r; > } > > -- > 2.34.1 >