Re: [PATCH] drm/amdgpu/sdma: fix engine reset handling

Alex Deucher <alexdeucher@xxxxxxxxx> · Mon, 17 Mar 2025 08:58:08 -0400

On Sun, Mar 16, 2025 at 10:01 PM Zhang, Jesse(Jie) <Jesse.Zhang@xxxxxxx> wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Alex Deucher
> Sent: Sunday, March 16, 2025 1:22 AM
> To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>
> Subject: [PATCH] drm/amdgpu/sdma: fix engine reset handling
>
> Move the kfd suspend/resume code into the caller.  That is where the KFD is likely to detect a reset so on the KFD side there is no need to call them.  Also add a mutex to lock the actual reset sequence.
>
> Fixes: bac38ca8c475 ("drm/amdkfd: implement per queue sdma reset for gfx 9.4+")
> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c        |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c          | 15 +++------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h          |  3 ++-
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c          |  8 +++++++-
>  .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c |  2 +-
>  5 files changed, 14 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 1d8cfdc51bdcb..611b27c91a18a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4281,6 +4281,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>         mutex_init(&adev->gfx.kfd_sch_mutex);
>         mutex_init(&adev->gfx.workload_profile_mutex);
>         mutex_init(&adev->vcn.workload_profile_mutex);
> +       mutex_init(&adev->sdma.engine_reset_mutex);
>
>         amdgpu_device_init_apu_flags(adev);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 03c4c012a1508..0b2529ef5a963 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -524,7 +524,6 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
>   * amdgpu_sdma_reset_engine - Reset a specific SDMA engine
>   * @adev: Pointer to the AMDGPU device
>   * @instance_id: ID of the SDMA engine instance to reset
> - * @suspend_user_queues: check if suspend user queue.
>   *
>   * This function performs the following steps:
>   * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their state.
> @@ -533,7 +532,7 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
>   *
>   * Returns: 0 on success, or a negative error code on failure.
>   */
> -int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, bool suspend_user_queues)
> +int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t
> +instance_id)
>  {
>         struct sdma_on_reset_funcs *funcs;
>         int ret = 0;
> @@ -542,13 +541,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b
>         struct amdgpu_ring *page_ring = &sdma_instance->page;
>         bool gfx_sched_stopped = false, page_sched_stopped = false;
>
> -       /* Suspend KFD if suspend_user_queues is true.
> -        * prevent the destruction of in-flight healthy user queue packets and
> -        * avoid race conditions between KFD and KGD during the reset process.
> -        */
> -       if (suspend_user_queues)
> -               amdgpu_amdkfd_suspend(adev, false);
> -
> +       mutex_lock(&adev->sdma.engine_reset_mutex);
>         /* Stop the scheduler's work queue for the GFX and page rings if they are running.
>         * This ensures that no new tasks are submitted to the queues while
>         * the reset is in progress.
> @@ -609,9 +602,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b
>                         drm_sched_wqueue_start(&page_ring->sched);
>                 }
>         }
> -
> -       if (suspend_user_queues)
> -               amdgpu_amdkfd_resume(adev, false);
> +       mutex_unlock(&adev->sdma.engine_reset_mutex);
>
>         return ret;
>  }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 9651693200655..d3a44a8cf6104 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -129,6 +129,7 @@ struct amdgpu_sdma {
>         /* track guilty state of GFX and PAGE queues */
>         bool gfx_guilty;
>         bool page_guilty;
> +       struct mutex            engine_reset_mutex;
>  };
>
> Should we move the definition of engine_reset_mutex to struct amdgpu_sdma_instance
> to avoid races on different smda instances?

Yes, we can make the lock per instance since amdgpu_dpm_reset_sdma()
has its own locking on the smu side.

Alex

>
> Except for that question, the  patch is Reviewed-by: Jesse.Zhang <Jesse.zhang@xxxxxxx>
>
>  /*
> @@ -169,7 +170,7 @@ struct amdgpu_buffer_funcs {  };
>
>  void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct sdma_on_reset_funcs *funcs); -int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, bool suspend_user_queues);
> +int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t
> +instance_id);
>
>  #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs->emit_copy_buffer((ib),  (s), (d), (b), (t))  #define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b)) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> index fd34dc1380811..14e4f7358cc4c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> @@ -1666,7 +1666,13 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)  {
>         struct amdgpu_device *adev = ring->adev;
>         u32 id = GET_INST(SDMA0, ring->me);
> -       return amdgpu_sdma_reset_engine(adev, id, true);
> +       int r;
> +
> +       amdgpu_amdkfd_suspend(adev, false);
> +       r = amdgpu_sdma_reset_engine(adev, id);
> +       amdgpu_amdkfd_resume(adev, false);
> +
> +       return r;
>  }
>
>  static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 2ed003d3ff0e0..c610e172a2b83 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -2310,7 +2310,7 @@ static int reset_hung_queues_sdma(struct device_queue_manager *dqm)
>                                 continue;
>
>                         /* Reset engine and check. */
> -                       if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
> +                       if (amdgpu_sdma_reset_engine(dqm->dev->adev, i) ||
>                             dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
>                             !set_sdma_queue_as_reset(dqm, doorbell_off)) {
>                                 r = -ENOTRECOVERABLE;
> --
> 2.48.1
>