On Tue, Feb 4, 2025 at 9:48 AM Christian König
<ckoenig.leichtzumerken@xxxxxxxxx> wrote:
>
> Testing this feature showed that it was a bit unstable. The
> CP_VMID_RESET register takes the VMID whose submissions should all be
> canceled.
>
> Unlike Windows, Linux uses per-process VMIDs instead of per-engine
> VMIDs, for the simple reason that we don't have enough. So resetting
> one VMID only killed the submissions of one specific process.
>
> Fortunately that turned out to be exactly what we want to have.
>
> So clear the CP_VMID_RESET register on every context switch between
> applications, when we do the pipeline sync, to avoid trouble if
> multiple VMIDs are used on the ring right behind each other.
>
> Use the same pipeline sync function in the reset handler and issue an
> IB test instead of a ring test after the queue reset, to provide a
> longer timeout and an additional fence value should there be further
> work on the ring after the aborted submission.
>
> Also drop the soft recovery, since that does pretty much the same
> thing as CP_VMID_RESET, just on a lower level and with less chance of
> succeeding.
>
> This now survives a stress test running overnight, sending a broken
> submission every 45 seconds and recovering fine from each of them.
>
> Signed-off-by: Christian König <christian.koenig@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 47 ++++++++++-----------------
>  2 files changed, 19 insertions(+), 29 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 8902fafbcf8d..1eee2a1bca5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -275,6 +275,7 @@ extern int amdgpu_wbrf;
>  #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS          3000
>  #define AMDGPU_MAX_USEC_TIMEOUT                 100000  /* 100 ms */
>  #define AMDGPU_FENCE_JIFFIES_TIMEOUT            (HZ / 2)
> +#define AMDGPU_QUEUE_RESET_TIMEOUT              (HZ / 10)
>  #define AMDGPU_DEBUGFS_MAX_COMPONENTS           32
>  #define AMDGPUFB_CONN_LIMIT                     4
>  #define AMDGPU_BIOS_NUM_SCRATCH                 16
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index fa572b40989e..705f5a9c11a0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -5607,7 +5607,17 @@ static void gfx_v9_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
>         int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX);
>         uint32_t seq = ring->fence_drv.sync_seq;
>         uint64_t addr = ring->fence_drv.gpu_addr;
> +       struct amdgpu_device *adev = ring->adev;
>
> +       amdgpu_ring_emit_reg_wait(ring,
> +                                 SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> +                                 0, 0xffff);
> +       amdgpu_ring_emit_wreg(ring,
> +                             SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET),
> +                             0);
> +       amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> +                              ring->fence_drv.sync_seq,
> +                              AMDGPU_FENCE_FLAG_EXEC);
>         gfx_v9_0_wait_reg_mem(ring, usepfp, 1, 0,
>                               lower_32_bits(addr), upper_32_bits(addr),
>                               seq, 0xffffffff, 4);
> @@ -5963,20 +5973,6 @@ static void gfx_v9_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring,
>                                       ref, mask);
>  }
>
> -static void gfx_v9_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
> -{
> -       struct amdgpu_device *adev = ring->adev;
> -       uint32_t value = 0;
> -
> -       value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
> -       value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
> -       value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
> -       value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
> -       amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
> -       WREG32_SOC15(GC, 0, mmSQ_CMD, value);
> -       amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
> -}
> -
>  static void gfx_v9_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev,
>                                                   enum amdgpu_interrupt_state state)
>  {
> @@ -7252,16 +7248,12 @@ static int gfx_v9_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
>         if (r)
>                 return r;
>
> -       if (amdgpu_ring_alloc(ring, 7 + 7 + 5))
> +       if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7))
>                 return -ENOMEM;
> -       gfx_v9_0_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
> -                                ring->fence_drv.sync_seq, AMDGPU_FENCE_FLAG_EXEC);
> -       gfx_v9_0_ring_emit_reg_wait(ring,
> -                                   SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0, 0xffff);
> -       gfx_v9_0_ring_emit_wreg(ring,
> -                               SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), 0);
> +       gfx_v9_0_ring_emit_pipeline_sync(ring);
> +       amdgpu_ring_commit(ring);
>
> -       return amdgpu_ring_test_ring(ring);
> +       return gfx_v9_0_ring_test_ib(ring, AMDGPU_QUEUE_RESET_TIMEOUT);
>  }
>
>  static int gfx_v9_0_reset_kcq(struct amdgpu_ring *ring,
> @@ -7468,7 +7460,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>         .set_wptr = gfx_v9_0_ring_set_wptr_gfx,
>         .emit_frame_size = /* totally 242 maximum if 16 IBs */
>                 5 +  /* COND_EXEC */
> -               7 +  /* PIPELINE_SYNC */
> +               7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 2 + /* VM_FLUSH */
> @@ -7506,7 +7498,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> -       .soft_recovery = gfx_v9_0_ring_soft_recovery,
>         .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>         .reset = gfx_v9_0_reset_kgq,
>         .emit_cleaner_shader = gfx_v9_0_ring_emit_cleaner_shader,
> @@ -7525,7 +7516,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
>         .set_wptr = amdgpu_sw_ring_set_wptr_gfx,
>         .emit_frame_size = /* totally 242 maximum if 16 IBs */
>                 5 +  /* COND_EXEC */
> -               7 +  /* PIPELINE_SYNC */
> +               7 + 7 + 5 + 7 +  /* PIPELINE_SYNC */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 2 + /* VM_FLUSH */
> @@ -7564,7 +7555,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> -       .soft_recovery = gfx_v9_0_ring_soft_recovery,
>         .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>         .patch_cntl = gfx_v9_0_ring_patch_cntl,
>         .patch_de = gfx_v9_0_ring_patch_de_meta,
> @@ -7586,7 +7576,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>                 20 + /* gfx_v9_0_ring_emit_gds_switch */
>                 7 + /* gfx_v9_0_ring_emit_hdp_flush */
>                 5 + /* hdp invalidate */
> -               7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> +               7 + 7 + 5 + 7 + /* PIPELINE_SYNC */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 8 + 8 + 8 + /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
> @@ -7608,7 +7598,6 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>         .emit_wreg = gfx_v9_0_ring_emit_wreg,
>         .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
> -       .soft_recovery = gfx_v9_0_ring_soft_recovery,

Probably want to keep soft_recovery for compute as compute queues don't
support vmid reset.
Alex

>         .emit_mem_sync = gfx_v9_0_emit_mem_sync,
>         .emit_wave_limit = gfx_v9_0_emit_wave_limit,
>         .reset = gfx_v9_0_reset_kcq,
> @@ -7629,7 +7618,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_kiq = {
>                 20 + /* gfx_v9_0_ring_emit_gds_switch */
>                 7 + /* gfx_v9_0_ring_emit_hdp_flush */
>                 5 + /* hdp invalidate */
> -               7 + /* gfx_v9_0_ring_emit_pipeline_sync */
> +               7 + 7 + 5 + 7 + /* PIPELINE_SYNC */
>                 SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>                 SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>                 8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for user fence, vm fence */
> --
> 2.34.1
>
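For illustration, a minimal sketch (untested, hypothetical follow-up,
not part of the patch above) of what keeping soft recovery for compute
only could look like: reinstate the removed function unchanged, and
keep the .soft_recovery entry in gfx_v9_0_ring_funcs_compute while the
gfx and sw-gfx tables still drop theirs:

/* Untested sketch: keep the SQ_CMD-based soft recovery, but wire it up
 * only for compute rings, since CP_VMID_RESET only cancels gfx
 * submissions. The body is verbatim the function removed above; the
 * SQ_CMD field values queue a wave kill limited to the given VMID.
 */
static void gfx_v9_0_ring_soft_recovery(struct amdgpu_ring *ring, unsigned vmid)
{
        struct amdgpu_device *adev = ring->adev;
        uint32_t value = 0;

        value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03);
        value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01);
        value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1);
        value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid);
        amdgpu_gfx_rlc_enter_safe_mode(adev, 0);
        WREG32_SOC15(GC, 0, mmSQ_CMD, value);
        amdgpu_gfx_rlc_exit_safe_mode(adev, 0);
}

static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
        /* ... all other callbacks unchanged from the patch ... */
        .soft_recovery = gfx_v9_0_ring_soft_recovery, /* kept for compute */
        .emit_mem_sync = gfx_v9_0_emit_mem_sync,
        .emit_wave_limit = gfx_v9_0_emit_wave_limit,
        .reset = gfx_v9_0_reset_kcq,
};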