RE: [PATCH V7 3/9] drm/amdgpu: Add common lock and reset caller parameter for SDMA reset synchronization

"Deucher, Alexander" <Alexander.Deucher@xxxxxxx> · Wed, 19 Feb 2025 17:39:24 +0000

[Public]

> -----Original Message-----
> From: Kim, Jonathan <Jonathan.Kim@xxxxxxx>
> Sent: Tuesday, February 18, 2025 12:42 PM
> To: Lazar, Lijo <Lijo.Lazar@xxxxxxx>; Zhang, Jesse(Jie)
> <Jesse.Zhang@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Kuehling, Felix
> <Felix.Kuehling@xxxxxxx>; Zhu, Jiadong <Jiadong.Zhu@xxxxxxx>
> Subject: RE: [PATCH V7 3/9] drm/amdgpu: Add common lock and reset caller
> parameter for SDMA reset synchronization
>
> [Public]
>
> > -----Original Message-----
> > From: Lazar, Lijo <Lijo.Lazar@xxxxxxx>
> > Sent: Monday, February 17, 2025 10:36 PM
> > To: Zhang, Jesse(Jie) <Jesse.Zhang@xxxxxxx>;
> > amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Kuehling, Felix
> > <Felix.Kuehling@xxxxxxx>; Kim, Jonathan <Jonathan.Kim@xxxxxxx>; Zhu,
> > Jiadong <Jiadong.Zhu@xxxxxxx>
> > Subject: Re: [PATCH V7 3/9] drm/amdgpu: Add common lock and reset
> > caller parameter for SDMA reset synchronization
> >
> >
> >
> > On 2/13/2025 11:17 AM, jesse.zhang@xxxxxxx wrote:
> > > From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx>
> > >
> > > This commit introduces a caller parameter to the
> > > amdgpu_sdma_reset_instance
> > function to differentiate
> > > between reset requests originating from the KGD and KFD.
> > > This change ensures proper synchronization between KGD and KFD
> > > during SDMA
> > resets.
> > >
> > > If the caller is KFD, the function now acquires and releases the
> > > scheduler lock
> > (ring->sched.job_list_lock)
> > > to protect the SDMA queue during the reset.
> > >
> > > These changes prevent race conditions and ensure safe SDMA reset
> > > operations when initiated by KFD, improving system stability and reliability.
> > >
> > > V2: replace the ring_lock with the existed the scheduler
> > >     locks for the queues (ring->sched) on the sdma engine.(Alex)
> > >
> > > v3: call drm_sched_wqueue_stop() rather than job_list_lock.
> > >     If a GPU ring reset was already initiated for one ring at amdgpu_job_timedout,
> > >     skip resetting that ring and call drm_sched_wqueue_stop()
> > >     for the other rings (Alex)
> > >
> > >    replace  the common lock (sdma_reset_lock) with DQM lock to
> > >    to resolve reset races between the two driver sections during KFD
> > > eviction.(Jon)
> > >
> > >    Rename the caller to Reset_src and
> > >    Change AMDGPU_RESET_SRC_SDMA_KGD/KFD to
> > AMDGPU_RESET_SRC_SDMA_HWS/RING (Jon)
> > > v4: restart the wqueue if the reset was successful,
> > >     or fall back to a full adapter reset. (Alex)
> > >
> > >    move definition of reset source to enumeration AMDGPU_RESET_SRCS,
> and
> > >    check reset src in amdgpu_sdma_reset_instance (Jon)
> > >
> > > v5: Call amdgpu_amdkfd_suspend/resume at the start/end of reset
> > > function
> > respectively under !SRC_HWS
> > >     conditions only (Jon)
> > >
> > > Suggested-by: Alex Deucher <alexander.deucher@xxxxxxx>
> > > Suggested-by: Jiadong Zhu <Jiadong.Zhu@xxxxxxx>
> > > Suggested-by: Jonathan Kim <Jonathan.Kim@xxxxxxx>
> > > Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx>
> > > Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx>
> > > ---
> > >  drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h |  2 +
> > > drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c  | 65
> > > ++++++++++++++++++++---  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h  |
> > > 6 +--  drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c  |  8 +--
> > >  4 files changed, 67 insertions(+), 14 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> > > index 4d9b9701139b..5b86e12ff9fe 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> > > @@ -43,6 +43,8 @@ enum AMDGPU_RESET_SRCS {
> > >     AMDGPU_RESET_SRC_MES,
> > >     AMDGPU_RESET_SRC_HWS,
> > >     AMDGPU_RESET_SRC_USER,
> > > +   AMDGPU_RESET_SRC_SDMA_RING,
> > > +   AMDGPU_RESET_SRC_SDMA_HWS,
> > >  };
> > >
> > >  struct amdgpu_reset_context {
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > > index fe39198307ec..808c7112ef10 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > > @@ -25,6 +25,7 @@
> > >  #include "amdgpu.h"
> > >  #include "amdgpu_sdma.h"
> > >  #include "amdgpu_ras.h"
> > > +#include "amdgpu_reset.h"
> > >
> > >  #define AMDGPU_CSA_SDMA_SIZE 64
> > >  /* SDMA CSA reside in the 3rd page of CSA */ @@ -485,6 +486,7 @@
> > > void amdgpu_sdma_register_on_reset_callbacks(struct
> > amdgpu_device *adev, struct
> > >   * amdgpu_sdma_reset_engine - Reset a specific SDMA engine
> > >   * @adev: Pointer to the AMDGPU device
> > >   * @instance_id: ID of the SDMA engine instance to reset
> > > + * @src: The source of reset function (KGD or KFD)
> > >   *
> > >   * This function performs the following steps:
> > >   * 1. Calls all registered pre_reset callbacks to allow KFD and
> > > AMDGPU to save
> > their state.
> > > @@ -493,20 +495,49 @@ void
> > > amdgpu_sdma_register_on_reset_callbacks(struct
> > amdgpu_device *adev, struct
> > >   *
> > >   * Returns: 0 on success, or a negative error code on failure.
> > >   */
> > > -int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t
> > instance_id)
> > > +int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t
> > instance_id, int src)
> > >  {
> > >     struct sdma_on_reset_funcs *funcs;
> > > -   int ret;
> > > +   int ret = 0;
> > > +   struct amdgpu_sdma_instance *sdma_instance = &adev-
> > >sdma.instance[instance_id];;
> > > +   struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
> > > +   struct amdgpu_ring *page_ring = &sdma_instance->page;
> > > +   bool gfx_sched_stopped = false, page_sched_stopped = false;
> > > +
> > > +   /* Check if the reset source is valid for SDMA ring reset */
> > > +   if (src != AMDGPU_RESET_SRC_SDMA_RING && src !=
> > AMDGPU_RESET_SRC_HWS)
> > > +           return -EINVAL;
> > > +
> > > +   /* Suspend KFD if the reset source is not SDMA_HWS.
> > > +    * prevent the destruction of in-flight healthy user queue packets and
> > > +    * avoid race conditions between KFD and KGD during the reset process.
> > > +    */
> > > +   if (src != AMDGPU_RESET_SRC_SDMA_HWS)
> > > +           amdgpu_amdkfd_suspend(adev, false);
> >
> > It this has to be done here, what's the idea behind registering a
> > separate pre/post callback for KFD initiated resets?
>
> The problem is that for SDMA v5.x and below, a single soft reset call will reset all
> queues on the target SDMA engine.
> If the KGD calls the reset, a transient KFD suspend/resume around the reset will
> guarantee that healthy user queues survive the reset.
> If the KFD calls the reset, it will only do so during preemption failure, and we don't
> want to suspend and resume the KFD again when the KFD calls this function.
> The KFD needs to call this common function to not disturb GFX and paging queues
> for the same reason KGD resets need to be wary of KFD queues.
> In the case of preemption failure and a KFD initiated reset, the KFD is holding the
> device wide preemption lock, so if the KGD is trying to reset as well, it will have to
> wait on its own KFD suspend call until the KFD is done its own reset.
>
> I think SDMA6+ can target specific queues via MMIO reg writes, but I see in the
> KGD code that soft_reset for SDMA 6+ does a sweep of all engines so we would
> still need to keep KFD queues safe (I'm not familiar with this chunk of the code or
> the need for sweeping all engines on the KGD side).
> So I'm not sure if Jesse had concerns about potential multiple calls being made in
> other parts of the code and maybe this was some way to simplify things.
> Either way, not sure if it's simpler to drop the reset source param and have the KGD
> remember to unconditionally KFD suspend/resume around the soft reset call for
> each IP version, or keep the reset source param here so that we only have to
> remember to do this once.

Could KFD keep track of whether the hang was detected by KFD or KGD?  E.g., if KFD detects the hang when suspending queues, it could set a flag and then KFD could include a call to amdgpu_amdkfd_suspend() in its pre_reset callback that would depend on the value of that flag.  At that point I don't think we'd need the src parameter any more.

Alex

>
> Jon
>
> >
> > Thanks,
> > Lijo
> >
> > > +
> > > +   /* Stop the scheduler's work queue for the GFX and page rings if
> > > + they are
> > running.
> > > +   * This ensures that no new tasks are submitted to the queues while
> > > +   * the reset is in progress.
> > > +   */
> > > +   if (!amdgpu_ring_sched_ready(gfx_ring)) {
> > > +           drm_sched_wqueue_stop(&gfx_ring->sched);
> > > +           gfx_sched_stopped = true;;
> > > +   }
> > > +
> > > +   if (adev->sdma.has_page_queue
> > && !amdgpu_ring_sched_ready(page_ring)) {
> > > +           drm_sched_wqueue_stop(&page_ring->sched);
> > > +           page_sched_stopped = true;
> > > +   }
> > >
> > >     /* Invoke all registered pre_reset callbacks */
> > >     list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> > >             if (funcs->pre_reset) {
> > > -                   ret = funcs->pre_reset(adev, instance_id);
> > > +                   ret = funcs->pre_reset(adev, instance_id, src);
> > >                     if (ret) {
> > >                             dev_err(adev->dev,
> > >                             "beforeReset callback failed for instance %u: %d\n",
> > >                                     instance_id, ret);
> > > -                           return ret;
> > > +                           goto exit;
> > >                     }
> > >             }
> > >     }
> > > @@ -515,21 +546,39 @@ int amdgpu_sdma_reset_engine(struct
> > > amdgpu_device
> > *adev, uint32_t instance_id)
> > >     ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
> > >     if (ret) {
> > >             dev_err(adev->dev, "Failed to reset SDMA instance %u\n",
> > instance_id);
> > > -           return ret;
> > > +           goto exit;
> > >     }
> > >
> > >     /* Invoke all registered post_reset callbacks */
> > >     list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) {
> > >             if (funcs->post_reset) {
> > > -                   ret = funcs->post_reset(adev, instance_id);
> > > +                   ret = funcs->post_reset(adev, instance_id, src);
> > >                     if (ret) {
> > >                             dev_err(adev->dev,
> > >                             "afterReset callback failed for instance %u: %d\n",
> > >                                     instance_id, ret);
> > > -                           return ret;
> > > +                           goto exit;
> > >                     }
> > >             }
> > >     }
> > >
> > > -   return 0;
> > > +exit:
> > > +   /* Restart the scheduler's work queue for the GFX and page rings
> > > +    * if they were stopped by this function. This allows new tasks
> > > +    * to be submitted to the queues after the reset is complete.
> > > +    */
> > > +   if (ret) {
> > > +           if (gfx_sched_stopped && amdgpu_ring_sched_ready(gfx_ring)) {
> > > +                   drm_sched_wqueue_start(&gfx_ring->sched);
> > > +           }
> > > +           if (page_sched_stopped && amdgpu_ring_sched_ready(page_ring)) {
> > > +                   drm_sched_wqueue_start(&page_ring->sched);
> > > +           }
> > > +   }
> > > +
> > > +   /* Resume KFD if the reset source is not SDMA_HWS */
> > > +   if (src != AMDGPU_RESET_SRC_SDMA_HWS)
> > > +           amdgpu_amdkfd_resume(adev, false);
> > > +
> > > +       return ret;
> > >  }
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> > > index f91d75848557..2ef2da772254 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> > > @@ -99,8 +99,8 @@ struct amdgpu_sdma_ras {  };
> > >
> > >  struct sdma_on_reset_funcs {
> > > -   int (*pre_reset)(struct amdgpu_device *adev, uint32_t instance_id);
> > > -   int (*post_reset)(struct amdgpu_device *adev, uint32_t instance_id);
> > > +   int (*pre_reset)(struct amdgpu_device *adev, uint32_t instance_id, int src);
> > > +   int (*post_reset)(struct amdgpu_device *adev, uint32_t
> > > + instance_id, int src);
> > >     /* Linked list node to store this structure in a list; */
> > >     struct list_head list;
> > >  };
> > > @@ -166,7 +166,7 @@ struct amdgpu_buffer_funcs {  };
> > >
> > >  void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device
> > > *adev,
> > struct sdma_on_reset_funcs *funcs);
> > > -int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t
> > instance_id);
> > > +int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t
> > instance_id, int src);
> > >
> > >  #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)-
> > >mman.buffer_funcs->emit_copy_buffer((ib),  (s), (d), (b), (t))
> > >  #define amdgpu_emit_fill_buffer(adev, ib, s, d, b)
> > >(adev)->mman.buffer_funcs- emit_fill_buffer((ib), (s), (d), (b))
> > >diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> > b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> > > index 29a123be90b7..50a086264792 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
> > > @@ -30,6 +30,7 @@
> > >  #include "amdgpu_xcp.h"
> > >  #include "amdgpu_ucode.h"
> > >  #include "amdgpu_trace.h"
> > > +#include "amdgpu_reset.h"
> > >
> > >  #include "sdma/sdma_4_4_2_offset.h"
> > >  #include "sdma/sdma_4_4_2_sh_mask.h"
> > > @@ -1480,6 +1481,7 @@ static int sdma_v4_4_2_sw_init(struct
> > amdgpu_ip_block *ip_block)
> > >     if (r)
> > >             return r;
> > >     INIT_LIST_HEAD(&adev->sdma.reset_callback_list);
> > > +
> > >     return r;
> > >  }
> > >
> > > @@ -1608,10 +1610,10 @@ static int sdma_v4_4_2_reset_queue(struct
> > amdgpu_ring *ring, unsigned int vmid)
> > >  {
> > >     struct amdgpu_device *adev = ring->adev;
> > >     u32 id = GET_INST(SDMA0, ring->me);
> > > -   return amdgpu_sdma_reset_engine(adev, id);
> > > +   return amdgpu_sdma_reset_engine(adev, id,
> > AMDGPU_RESET_SRC_SDMA_RING);
> > >  }
> > >
> > > -static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev,
> > > uint32_t
> > instance_id)
> > > +static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev,
> > > +uint32_t
> > instance_id, int src)
> > >  {
> > >     u32 inst_mask;
> > >     struct amdgpu_ring *ring =
> > > &adev->sdma.instance[instance_id].ring;
> > > @@ -1628,7 +1630,7 @@ static int sdma_v4_4_2_stop_queue(struct
> > amdgpu_device *adev, uint32_t instance_
> > >     return 0;
> > >  }
> > >
> > > -static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev,
> > > uint32_t
> > instance_id)
> > > +static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev,
> > > +uint32_t
> > instance_id, int src)
> > >  {
> > >     int i;
> > >     u32 inst_mask;
>