RE: [PATCH 5/5] drm/amdgpu: rework gfx10 queue reset

"Zhu, Jiadong" <Jiadong.Zhu@xxxxxxx> · Wed, 5 Feb 2025 11:28:47 +0000



[AMD Official Use Only - AMD Internal Distribution Only]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Christian
> König
> Sent: Tuesday, February 4, 2025 10:31 PM
> To: timur.kristof@xxxxxxxxx; Deucher, Alexander <Alexander.Deucher@xxxxxxx>;
> amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Subject: [PATCH 5/5] drm/amdgpu: rework gfx10 queue reset
>
> Apply the same changes to gfx10 as done to gfx9.
>
> The general idea of resetting the whole kernel queue and then asking the kiq to map it
> again didn't work at all. The background is that we don't use per-application kernel
> queues for gfx10 on Linux for performance reasons.
>
> So instead use the gfx9 approach here as well and only reset all submissions from
> a specific VMID instead of the whole queue.
>
> This also avoids reserving and kmapping the MQD, which are operations generally not
> allowed in the reset handler.
>
> This approach seems to work for at least some time, but it is not as reliable as it is on
> gfx9. It will probably need some more work until it survives a whole night of reset
> stress testing.
>
> Signed-off-by: Christian König <christian.koenig@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 65 +++++++-------------------
>  1 file changed, 16 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index 5ba263fe5512..7ffdba974f87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -8684,7 +8684,17 @@ static void gfx_v10_0_ring_emit_pipeline_sync(struct
> amdgpu_ring *ring)
>       int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
>       uint32_t seq = ring->fence_drv.sync_seq;
>       uint64_t addr = ring->fence_drv.gpu_addr;
> +     struct amdgpu_device *adev = ring->adev;
>
> +     amdgpu_ring_emit_reg_wait(ring,
> +                               SOC15_REG_OFFSET(GC, 0,
> mmCP_VMID_RESET),
> +                               0, 0xffff);
> +     amdgpu_ring_emit_wreg(ring,
> +                           SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> +                           0);
> +     amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> +                            ring->fence_drv.sync_seq,
> +                            AMDGPU_FENCE_FLAG_EXEC);
>       gfx_v10_0_wait_reg_mem(ring, usepfp, 1, 0, lower_32_bits(addr),
>                              upper_32_bits(addr), seq, 0xffffffff, 4);
>  }
> @@ -8984,21 +8994,6 @@ static void gfx_v10_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
>                                                          ref, mask);
>  }
>
> -static void gfx_v10_0_ring_soft_recovery(struct amdgpu_ring *ring,
> -                                      unsigned int vmid)
> -{
> -     struct amdgpu_device *adev = ring->adev;
> -     uint32_t value = 0;
> -
> -     value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
> -     value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
> -     value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
> -     value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
> -     amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
> -     WREG32_SOC15(GC, 0, mmSQ_CMD, value);
> -     amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> -}
> -
>  static void
>  gfx_v10_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>                                     uint32_t me, uint32_t pipe,
> @@ -9467,7 +9462,6 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring
> *ring, unsigned int vmid)
>       struct amdgpu_ring *kiq_ring = &kiq->ring;
>       unsigned long flags;
>       u32 tmp;
> -     u64 addr;
>       int r;
>
>       if (amdgpu_sriov_vf(adev))
> @@ -9478,27 +9472,14 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring
> *ring, unsigned int vmid)
>
>       spin_lock_irqsave(&kiq->ring_lock, flags);
>
> -     if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7 + kiq->pmf->map_queues_size)) {
> +     if (amdgpu_ring_alloc(kiq_ring, 5)) {
>               spin_unlock_irqrestore(&kiq->ring_lock, flags);
>               return -ENOMEM;
>       }
>
> -     addr = amdgpu_bo_gpu_offset(ring->mqd_obj) +
> -             offsetof(struct v10_gfx_mqd, cp_gfx_hqd_active);
>       tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 <<
> vmid);
> -     if (ring->pipe == 0)
> -             tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE0_QUEUES,
> 1 << ring->queue);
> -     else
> -             tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE1_QUEUES,
> 1 << ring->queue);
> -
>       gfx_v10_0_ring_emit_wreg(kiq_ring,
>                                SOC15_REG_OFFSET(GC, 0,
> mmCP_VMID_RESET), tmp);
> -     gfx_v10_0_wait_reg_mem(kiq_ring, 0, 1, 0,
> -                            lower_32_bits(addr), upper_32_bits(addr),
> -                            0, 1, 0x20);
> -     gfx_v10_0_ring_emit_reg_wait(kiq_ring,
> -                                  SOC15_REG_OFFSET(GC, 0,
> mmCP_VMID_RESET), 0, 0xffffffff);
> -     kiq->pmf->kiq_map_queues(kiq_ring, ring);
>       amdgpu_ring_commit(kiq_ring);
>
>       spin_unlock_irqrestore(&kiq->ring_lock, flags);
> @@ -9507,24 +9488,12 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
>       if (r)
>               return r;
>
> -     r = amdgpu_bo_reserve(ring->mqd_obj, false);
> -     if (unlikely(r != 0)) {
> -             DRM_ERROR("fail to resv mqd_obj\n");
> -             return r;
> -     }
> -     r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
> -     if (!r) {
> -             r = gfx_v10_0_kgq_init_queue(ring, true);
> -             amdgpu_bo_kunmap(ring->mqd_obj);
> -             ring->mqd_ptr = NULL;
> -     }
> -     amdgpu_bo_unreserve(ring->mqd_obj);
> -     if (r) {
> -             DRM_ERROR("fail to unresv mqd_obj\n");
> -             return r;
> -     }
> +     if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7))
> +             return -ENOMEM;
> +     gfx_v10_0_ring_emit_pipeline_sync(ring);
Why do we need to set CP_VMID_RESET 0xffff here? Does it mean that we have to reset (terminate) all the waves from the other VMIDs?

Thanks,
Jiadong

> +     amdgpu_ring_commit(ring);
>
> -     return amdgpu_ring_test_ring(ring);
> +     return gfx_v10_0_ring_test_ib(ring,
> AMDGPU_QUEUE_RESET_TIMEOUT);
>  }
>
>  static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring,
> @@ -9819,7 +9788,6 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>       .emit_wreg = gfx_v10_0_ring_emit_wreg,
>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>       .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
> -     .soft_recovery = gfx_v10_0_ring_soft_recovery,
>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>       .reset = gfx_v10_0_reset_kgq,
>       .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader,
> @@ -9860,7 +9828,6 @@ static const struct amdgpu_ring_funcs
> gfx_v10_0_ring_funcs_compute = {
>       .emit_wreg = gfx_v10_0_ring_emit_wreg,
>       .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
>       .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait,
> -     .soft_recovery = gfx_v10_0_ring_soft_recovery,
>       .emit_mem_sync = gfx_v10_0_emit_mem_sync,
>       .reset = gfx_v10_0_reset_kcq,
>       .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader,
> --
> 2.34.1