[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Lijo Lazar > Sent: Tuesday, November 5, 2024 1:39 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Deucher, Alexander > <Alexander.Deucher@xxxxxxx>; Koenig, Christian <Christian.Koenig@xxxxxxx> > Subject: [PATCH 1/2] drm/amdgpu: Fix unmap queue logic > > In current logic, it calls ring_alloc followed by a ring_test. ring_test in turn will call > another ring_alloc. This is illegal usage as a ring_alloc is expected to be closed > properly with a ring_commit. Change to commit the unmap queue packet first > followed by a ring_test. Add a comment about the usage of ring_test. Regarding the description " This is illegal usage as a ring_alloc is expected to be closed properly with a ring_commit ", does this only apply to unmap queue logic ? The current logic to do map queue is also to commit once in ring_test but ring_alloc twice. > > Also, reorder the current pre-condition checks of job hang or kiq ring scheduler not > ready. Without them being met, it is not useful to attempt ring or memory allocations. > > Fixes tag refers to the original patch which introduced this issue which then got > carried over into newer code. > > Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx> > > Fixes: 6c10b5cc4eaa ("drm/amdgpu: Remove duplicate code in gfx_v8_0.c") > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 13 +++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 47 ++++++++++++++-------- > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 7 ++++ > 3 files changed, 49 insertions(+), 18 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > index c218e8f117e4..2a40150ae10f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c > @@ -844,6 +844,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device > *adev, u32 doorbell_off, > if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) > return -EINVAL; > > + if (!kiq_ring->sched.ready || adev->job_hang) > + return 0; > + > ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL); > if (!ring_funcs) > return -ENOMEM; > @@ -868,8 +871,14 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device > *adev, u32 doorbell_off, > > kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0); > > - if (kiq_ring->sched.ready && !adev->job_hang) > - r = amdgpu_ring_test_helper(kiq_ring); > + /* Submit unmap queue packet */ > + amdgpu_ring_commit(kiq_ring); > + /* > + * Ring test will do a basic scratch register change check. Just run > + * this to ensure that unmap queues that is submitted before got > + * processed successfully before returning. > + */ > + r = amdgpu_ring_test_helper(kiq_ring); > > spin_unlock(&kiq->ring_lock); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index dabc01cf1fbb..6733ff5d9628 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -515,6 +515,17 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, > int xcc_id) > if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) > return -EINVAL; > > + if (!kiq_ring->sched.ready || adev->job_hang) > + return 0; > + /** > + * This is workaround: only skip kiq_ring test > + * during ras recovery in suspend stage for gfx9.4.3 > + */ > + if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || > + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && > + amdgpu_ras_in_recovery(adev)) > + return 0; > + > spin_lock(&kiq->ring_lock); > if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * > adev->gfx.num_compute_rings)) { > @@ -528,20 +539,15 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device > *adev, int xcc_id) > &adev->gfx.compute_ring[j], > RESET_QUEUES, 0, 0); > } > - > - /** > - * This is workaround: only skip kiq_ring test > - * during ras recovery in suspend stage for gfx9.4.3 > + /* Submit unmap queue packet */ > + amdgpu_ring_commit(kiq_ring); > + /* > + * Ring test will do a basic scratch register change check. Just run > + * this to ensure that unmap queues that is submitted before got > + * processed successfully before returning. > */ > - if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || > - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && > - amdgpu_ras_in_recovery(adev)) { > - spin_unlock(&kiq->ring_lock); > - return 0; > - } > + r = amdgpu_ring_test_helper(kiq_ring); > > - if (kiq_ring->sched.ready && !adev->job_hang) > - r = amdgpu_ring_test_helper(kiq_ring); > spin_unlock(&kiq->ring_lock); > > return r; > @@ -569,8 +575,11 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, > int xcc_id) > if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) > return -EINVAL; > > - spin_lock(&kiq->ring_lock); > + if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang) > + return 0; > + > if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) { > + spin_lock(&kiq->ring_lock); > if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * > adev->gfx.num_gfx_rings)) { > spin_unlock(&kiq->ring_lock); > @@ -583,11 +592,17 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device > *adev, int xcc_id) > &adev->gfx.gfx_ring[j], > PREEMPT_QUEUES, 0, 0); > } > - } > + /* Submit unmap queue packet */ > + amdgpu_ring_commit(kiq_ring); > > - if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang) > + /* > + * Ring test will do a basic scratch register change check. > + * Just run this to ensure that unmap queues that is submitted > + * before got processed successfully before returning. > + */ > r = amdgpu_ring_test_helper(kiq_ring); > - spin_unlock(&kiq->ring_lock); > + spin_unlock(&kiq->ring_lock); > + } > > return r; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index f85e545653c7..553a6113fa67 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -4823,6 +4823,13 @@ static int gfx_v8_0_kcq_disable(struct amdgpu_device > *adev) > amdgpu_ring_write(kiq_ring, 0); > amdgpu_ring_write(kiq_ring, 0); > } > + /* Submit unmap queue packet */ > + amdgpu_ring_commit(kiq_ring); > + /* > + * Ring test will do a basic scratch register change check. Just run > + * this to ensure that unmap queues that is submitted before got > + * processed successfully before returning. > + */ > r = amdgpu_ring_test_helper(kiq_ring); > if (r) > DRM_ERROR("KCQ disable failed\n"); > -- > 2.25.1