Re: [PATCH 4/4 V6] drm/amdgpu: Improve SDMA reset logic with guilty queue tracking

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, Feb 11, 2025 at 9:42 AM Jesse.zhang@xxxxxxx <jesse.zhang@xxxxxxx> wrote:
>
> From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx>
>
> This commit introduces several improvements to the SDMA reset logic:
>
> 1. Added `cached_rptr` to the `amdgpu_ring` structure to store the read pointer
>    before a reset, ensuring proper state restoration after reset.
>
> 2. Introduced `gfx_guilty` and `page_guilty` flags in the `amdgpu_sdma` structure
>    to track which queue (GFX or PAGE) caused a timeout or error.
>
> 3. Replaced the `caller` parameter with a `guilty` boolean in the reset and resume
>    functions to simplify the logic and handle resets based on the guilty state.
>
> 4. Added a helper function `sdma_v4_4_2_is_queue_selected` to check the
>    `SDMA*_*_CONTEXT_STATUS.SELECTED` register and determine if a queue is guilty.
>
> v2:
>    1.replace the caller with a guilty bool.
>    If the queue is the guilty one, set the rptr and wptr to the saved wptr value,
>    else, set the rptr and wptr to the saved rptr value. (Alex)
>    2. cache the rptr before the reset. (Alex)
>
> v3: add a new ring callback, is_guilty(), which will get called to check if
>     the ring in amdgpu_job_timedout() is actually the guilty ring. If it's not,
>     we can skip error handling and goto exit. (Alex)
>
> v4: cache the rptr for page ring
>
> v5: update the register addresses to correctly use the page ring registers
>       (regSDMA_PAGE_RB_RPTR) in page resume.
>
> v6: Keeping an intermediate variable like u64 rwptr simplifies restoring rptr/wptr. (Lijo)
>
> Suggested-by: Alex Deucher <alexander.deucher@xxxxxxx>
> Suggested-by: Jiadong Zhu <Jiadong.Zhu@xxxxxxx>
> Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c  | 10 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  3 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c |  6 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 96 ++++++++++++++++++++----
>  6 files changed, 106 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 100f04475943..ce3e7a9d6688 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -102,6 +102,16 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
>                 return DRM_GPU_SCHED_STAT_ENODEV;
>         }
>
> +       /* Check if the ring is actually guilty of causing the timeout.
> +        * If not, skip error handling and fence completion.
> +        */
> +       if (amdgpu_gpu_recovery && ring->funcs->is_guilty) {
> +               if (!ring->funcs->is_guilty(ring)) {
> +                       dev_err(adev->dev, "ring %s timeout, but not guilty\n",
> +                               s_job->sched->name);
> +                       goto exit;
> +               }
> +       }

I'd suggest breaking this change and the amdgpu_ring.h change out into
its own patch.

>         /*
>          * Do the coredump immediately after a job timeout to get a very
>          * close dump/snapshot/representation of GPU's current error status
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index a6e28fe3f8d6..20cd21df38ba 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -342,6 +342,8 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
>         ring->buf_mask = (ring->ring_size / 4) - 1;
>         ring->ptr_mask = ring->funcs->support_64bit_ptrs ?
>                 0xffffffffffffffff : ring->buf_mask;
> +       /*  Initialize cached_rptr to 0 */
> +       ring->cached_rptr = 0;
>
>         /* Allocate ring buffer */
>         if (ring->is_mes_queue) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 04af26536f97..182aa535d395 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> @@ -237,6 +237,7 @@ struct amdgpu_ring_funcs {
>         void (*patch_de)(struct amdgpu_ring *ring, unsigned offset);
>         int (*reset)(struct amdgpu_ring *ring, unsigned int vmid);
>         void (*emit_cleaner_shader)(struct amdgpu_ring *ring);
> +       bool (*is_guilty)(struct amdgpu_ring *ring);
>  };
>
>  struct amdgpu_ring {
> @@ -306,6 +307,8 @@ struct amdgpu_ring {
>
>         bool            is_sw_ring;
>         unsigned int    entry_index;
> +       /* store the cached rptr to restore after reset */
> +       uint64_t cached_rptr;
>
>  };
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 32eebf9d4408..6ba785798a4a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -474,6 +474,10 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
>         if (!funcs)
>                 return;
>
> +       /* Ensure the reset_callback_list is initialized */
> +       if (!adev->sdma.reset_callback_list.next) {
> +               INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
> +       }

I'd squash this change into the patch which adds the callback list.

>         /* Initialize the list node in the callback structure */
>         INIT_LIST_HEAD(&funcs->list);
>
> @@ -520,7 +524,7 @@ int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t instance_id,
>         */
>         if (!amdgpu_ring_sched_ready(gfx_ring)) {
>                 drm_sched_wqueue_stop(&gfx_ring->sched);
> -               gfx_sched_stopped = true;;
> +               gfx_sched_stopped = true;
>         }
>
>         if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index df5c9f7a4374..d84c3eccc510 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -126,6 +126,9 @@ struct amdgpu_sdma {
>         uint32_t                *ip_dump;
>         uint32_t                supported_reset;
>         struct list_head        reset_callback_list;
> +       /* track guilty state of GFX and PAGE queues */
> +       bool gfx_guilty;
> +       bool page_guilty;
>  };
>
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index c297b4a13680..2c66382011d3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -671,11 +671,12 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
>   * @adev: amdgpu_device pointer
>   * @i: instance to resume
>   * @restore: used to restore wptr when restart
> + * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
>   *
>   * Set up the gfx DMA ring buffers and enable them.
>   * Returns 0 for success, error for failure.
>   */
> -static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
> +static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
>  {
>         struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
>         u32 rb_cntl, ib_cntl, wptr_poll_cntl;
> @@ -683,6 +684,7 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
>         u32 doorbell;
>         u32 doorbell_offset;
>         u64 wptr_gpu_addr;
> +       u64 rwptr;
>
>         wb_offset = (ring->rptr_offs * 4);
>
> @@ -708,12 +710,20 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
>         /* before programing wptr to a less value, need set minor_ptr_update first */
>         WREG32_SDMA(i, regSDMA_GFX_MINOR_PTR_UPDATE, 1);
>
> +       /* For the guilty queue, set RPTR to the current wptr to skip bad commands.
> +        * If it is not the guilty queue, restore cached_rptr and continue execution.
> +        */
> +       if (guilty)
> +               rwptr = ring->wptr;
> +       else
> +               rwptr = ring->cached_rptr;
> +
>         /* Initialize the ring buffer's read and write pointers */
>         if (restore) {
> -               WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 2));
> -               WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
> -               WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 2));
> -               WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
> +               WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(rwptr << 2));
> +               WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(rwptr << 2));
> +               WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(rwptr << 2));
> +               WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(rwptr << 2));
>         } else {
>                 WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, 0);
>                 WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, 0);
> @@ -768,11 +778,12 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
>   * @adev: amdgpu_device pointer
>   * @i: instance to resume
>   * @restore: boolean to say restore needed or not
> + * @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
>   *
>   * Set up the page DMA ring buffers and enable them.
>   * Returns 0 for success, error for failure.
>   */
> -static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
> +static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
>  {
>         struct amdgpu_ring *ring = &adev->sdma.instance[i].page;
>         u32 rb_cntl, ib_cntl, wptr_poll_cntl;
> @@ -780,6 +791,7 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
>         u32 doorbell;
>         u32 doorbell_offset;
>         u64 wptr_gpu_addr;
> +       u64 rwptr;
>
>         wb_offset = (ring->rptr_offs * 4);
>
> @@ -787,12 +799,20 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
>         rb_cntl = sdma_v4_4_2_rb_cntl(ring, rb_cntl);
>         WREG32_SDMA(i, regSDMA_PAGE_RB_CNTL, rb_cntl);
>
> +       /* For the guilty queue, set RPTR to the current wptr to skip bad commands.
> +        * If it is not the guilty queue, restore cached_rptr and continue execution.
> +        */
> +       if (guilty)
> +               rwptr = ring->wptr;
> +       else
> +               rwptr = ring->cached_rptr;
> +
>         /* Initialize the ring buffer's read and write pointers */
>         if (restore) {
> -               WREG32_SDMA(i, regSDMA_GFX_RB_RPTR, lower_32_bits(ring->wptr << 2));
> -               WREG32_SDMA(i, regSDMA_GFX_RB_RPTR_HI, upper_32_bits(ring->wptr << 2));
> -               WREG32_SDMA(i, regSDMA_GFX_RB_WPTR, lower_32_bits(ring->wptr << 2));
> -               WREG32_SDMA(i, regSDMA_GFX_RB_WPTR_HI, upper_32_bits(ring->wptr << 2));
> +               WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR, lower_32_bits(rwptr << 2));
> +               WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR_HI, upper_32_bits(rwptr << 2));
> +               WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR, lower_32_bits(rwptr << 2));
> +               WREG32_SDMA(i, regSDMA_PAGE_RB_WPTR_HI, upper_32_bits(rwptr << 2));
>         } else {
>                 WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR, 0);
>                 WREG32_SDMA(i, regSDMA_PAGE_RB_RPTR_HI, 0);
> @@ -968,9 +988,9 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev,
>                 uint32_t temp;
>
>                 WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
> -               sdma_v4_4_2_gfx_resume(adev, i, restore);
> +               sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
>                 if (adev->sdma.has_page_queue)
> -                       sdma_v4_4_2_page_resume(adev, i, restore);
> +                       sdma_v4_4_2_page_resume(adev, i, restore, adev->sdma.page_guilty);
>
>                 /* set utc l1 enable flag always to 1 */
>                 temp = RREG32_SDMA(i, regSDMA_CNTL);
> @@ -1480,7 +1500,9 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>         r = amdgpu_sdma_sysfs_reset_mask_init(adev);
>         if (r)
>                 return r;
> -       INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
> +       /* Initialize guilty flags for GFX and PAGE queues */
> +       adev->sdma.gfx_guilty = false;
> +       adev->sdma.page_guilty = false;
>
>         return r;
>  }
> @@ -1606,6 +1628,34 @@ static int sdma_v4_4_2_soft_reset(struct amdgpu_ip_block *ip_block)
>         return 0;
>  }
>
> +static bool sdma_v4_4_2_is_queue_selected(struct amdgpu_device *adev, uint32_t instance_id, bool is_page_queue)
> +{
> +       uint32_t reg_offset = is_page_queue ? regSDMA_PAGE_CONTEXT_STATUS : regSDMA_GFX_CONTEXT_STATUS;
> +       uint32_t context_status = RREG32(sdma_v4_4_2_get_reg_offset(adev, instance_id, reg_offset));
> +
> +       /* Check if the SELECTED bit is set */
> +       return (context_status & SDMA_GFX_CONTEXT_STATUS__SELECTED_MASK) != 0;
> +}
> +
> +static bool sdma_v4_4_2_gfx_ring_is_guilty(struct amdgpu_ring *ring)
> +{
> +       struct amdgpu_device *adev = ring->adev;
> +       uint32_t instance_id = ring->me;
> +
> +       return sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
> +}
> +
> +static bool sdma_v4_4_2_page_ring_is_guilty(struct amdgpu_ring *ring)
> +{
> +       struct amdgpu_device *adev = ring->adev;
> +       uint32_t instance_id = ring->me;
> +
> +       if (adev->sdma.has_page_queue)
> +               return false;
> +
> +       return sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
> +}

Could also split this and the is_guilty callbacks out into a separate patch.

Alex


> +
>  static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
>  {
>         struct amdgpu_device *adev = ring->adev;
> @@ -1616,11 +1666,29 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
>  static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id, int src)
>  {
>         u32 inst_mask;
> +       uint64_t rptr;
>         struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring;
>
>         if (amdgpu_sriov_vf(adev))
>                 return -EINVAL;
>
> +       /* Check if this queue is the guilty one */
> +       adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
> +       if (adev->sdma.has_page_queue)
> +               adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
> +
> +       /* Cache the rptr before reset, after the reset,
> +       * all of the registers will be reset to 0
> +       */
> +       rptr = amdgpu_ring_get_rptr(ring);
> +       ring->cached_rptr = rptr;
> +       /* Cache the rptr for the page queue if it exists */
> +       if (adev->sdma.has_page_queue) {
> +               struct amdgpu_ring *page_ring = &adev->sdma.instance[instance_id].page;
> +               rptr = amdgpu_ring_get_rptr(page_ring);
> +               page_ring->cached_rptr = rptr;
> +       }
> +
>         /* stop queue */
>         inst_mask = 1 << ring->me;
>         sdma_v4_4_2_inst_gfx_stop(adev, inst_mask);
> @@ -2055,6 +2123,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_ring_funcs = {
>         .emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
>         .reset = sdma_v4_4_2_reset_queue,
> +       .is_guilty = sdma_v4_4_2_gfx_ring_is_guilty,
>  };
>
>  static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
> @@ -2086,6 +2155,7 @@ static const struct amdgpu_ring_funcs sdma_v4_4_2_page_ring_funcs = {
>         .emit_wreg = sdma_v4_4_2_ring_emit_wreg,
>         .emit_reg_wait = sdma_v4_4_2_ring_emit_reg_wait,
>         .emit_reg_write_reg_wait = amdgpu_ring_emit_reg_write_reg_wait_helper,
> +       .is_guilty = sdma_v4_4_2_page_ring_is_guilty,
>  };
>
>  static void sdma_v4_4_2_set_ring_funcs(struct amdgpu_device *adev)
> --
> 2.25.1
>




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux