[Public] > -----Original Message----- > From: Jesse.zhang@xxxxxxx <jesse.zhang@xxxxxxx> > Sent: Monday, February 10, 2025 2:32 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Kuehling, Felix > <Felix.Kuehling@xxxxxxx>; Kim, Jonathan <Jonathan.Kim@xxxxxxx>; Zhu, > Jiadong <Jiadong.Zhu@xxxxxxx>; Zhang, Jesse(Jie) <Jesse.Zhang@xxxxxxx>; > Deucher, Alexander <Alexander.Deucher@xxxxxxx>; Zhang, Jesse(Jie) > <Jesse.Zhang@xxxxxxx> > Subject: [PATCH 3/4] drm/amdgpu: Add common lock and reset caller parameter for > SDMA reset synchronization > > From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx> > > This commit introduces a caller parameter to the amdgpu_sdma_reset_instance > function to differentiate > between reset requests originating from the KGD and KFD. > This change ensures proper synchronization between KGD and KFD during SDMA > resets. > > If the caller is KFD, the function now acquires and releases the scheduler lock (ring- > >sched.job_list_lock) > to protect the SDMA queue during the reset. > > These changes prevent race conditions and ensure safe SDMA reset operations > when initiated by KFD, improving system stability and reliability. > > V2: replace the ring_lock with the existed the scheduler > locks for the queues (ring->sched) on the sdma engine.(Alex) > > v3: call drm_sched_wqueue_stop() rather than job_list_lock. > If a GPU ring reset was already initiated for one ring at amdgpu_job_timedout, > skip resetting that ring and call drm_sched_wqueue_stop() > for the other rings (Alex) > > replace the common lock (sdma_reset_lock) with DQM lock to > to resolve reset races between the two driver sections during KFD eviction.(Jon) > > Rename the caller to Reset_src and > Change AMDGPU_RESET_SRC_SDMA_KGD/KFD to > AMDGPU_RESET_SRC_SDMA_HWS/RING (Jon) > v4: restart the wqueue if the reset was successful, > or fall back to a full adapter reset. (Alex) > > move definition of reset source to enumeration AMDGPU_RESET_SRCS, and > check reset src in amdgpu_sdma_reset_instance (Jon) > > Suggested-by: Alex Deucher <alexander.deucher@xxxxxxx> > Suggested-by: Jiadong Zhu <Jiadong.Zhu@xxxxxxx> > Suggested-by: Jonathan Kim <Jonathan.Kim@xxxxxxx> > Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 2 + > drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 54 +++++++++++++++++++---- > drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 6 +-- > drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 8 ++-- > 4 files changed, 56 insertions(+), 14 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > index 4d9b9701139b..5b86e12ff9fe 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h > @@ -43,6 +43,8 @@ enum AMDGPU_RESET_SRCS { > AMDGPU_RESET_SRC_MES, > AMDGPU_RESET_SRC_HWS, > AMDGPU_RESET_SRC_USER, > + AMDGPU_RESET_SRC_SDMA_RING, > + AMDGPU_RESET_SRC_SDMA_HWS, > }; > > struct amdgpu_reset_context { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > index 19c8be7d72e2..8864a9d7455b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > @@ -25,6 +25,7 @@ > #include "amdgpu.h" > #include "amdgpu_sdma.h" > #include "amdgpu_ras.h" > +#include "amdgpu_reset.h" > > #define AMDGPU_CSA_SDMA_SIZE 64 > /* SDMA CSA reside in the 3rd page of CSA */ > @@ -484,6 +485,7 @@ void amdgpu_sdma_register_on_reset_callbacks(struct > amdgpu_device *adev, struct > * amdgpu_sdma_reset_instance - Reset a specific SDMA instance > * @adev: Pointer to the AMDGPU device > * @instance_id: ID of the SDMA engine instance to reset > + * @src: The source of reset function (KGD or KFD) > * > * This function performs the following steps: > * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their > state. > @@ -492,20 +494,42 @@ void amdgpu_sdma_register_on_reset_callbacks(struct > amdgpu_device *adev, struct > * > * Returns: 0 on success, or a negative error code on failure. > */ > -int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t > instance_id) > +int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t > instance_id, int src) > { > struct sdma_on_reset_funcs *funcs; > - int ret; > + int ret = 0; > + struct amdgpu_sdma_instance *sdma_instance = &adev- > >sdma.instance[instance_id];; > + struct amdgpu_ring *gfx_ring = &sdma_instance->ring; > + struct amdgpu_ring *page_ring = &sdma_instance->page; > + bool gfx_sched_stopped = false, page_sched_stopped = false; > + > + /* Check if the reset source is valid for SDMA ring reset */ > + if (src != AMDGPU_RESET_SRC_SDMA_RING && src != > AMDGPU_RESET_SRC_HWS) > + return -EINVAL; You still need quiesce the KFD transiently around the reset if the reset trigger is !AMDGPU_RESET_SRC_HWS for this call to work. Otherwise the engine based soft reset could destroy in-flight healthy user queue packets. Call amdgpu_amdkfd_suspend/resume at the start/end of this function respectively under !SRC_HWS (or inversely SRC_RING) conditions only. This will also set up the prevention of the reset call race between the KFD and KGD whenever the KFD invokes this call via SRC_HWS. Jon > + > + /* Stop the scheduler's work queue for the GFX and page rings if they are > running. > + * This ensures that no new tasks are submitted to the queues while > + * the reset is in progress. > + */ > + if (!amdgpu_ring_sched_ready(gfx_ring)) { > + drm_sched_wqueue_stop(&gfx_ring->sched); > + gfx_sched_stopped = true;; > + } > + > + if (adev->sdma.has_page_queue > && !amdgpu_ring_sched_ready(page_ring)) { > + drm_sched_wqueue_stop(&page_ring->sched); > + page_sched_stopped = true; > + } > > /* Invoke all registered pre_reset callbacks */ > list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) { > if (funcs->pre_reset) { > - ret = funcs->pre_reset(adev, instance_id); > + ret = funcs->pre_reset(adev, instance_id, src); > if (ret) { > dev_err(adev->dev, > "beforeReset callback failed for instance %u: %d\n", > instance_id, ret); > - return ret; > + goto exit; > } > } > } > @@ -514,21 +538,35 @@ int amdgpu_sdma_reset_instance(struct amdgpu_device > *adev, uint32_t instance_id) > ret = amdgpu_dpm_reset_sdma(adev, 1 << instance_id); > if (ret) { > dev_err(adev->dev, "Failed to reset SDMA instance %u\n", > instance_id); > - return ret; > + goto exit; > } > > /* Invoke all registered post_reset callbacks */ > list_for_each_entry(funcs, &adev->sdma.reset_callback_list, list) { > if (funcs->post_reset) { > - ret = funcs->post_reset(adev, instance_id); > + ret = funcs->post_reset(adev, instance_id, src); > if (ret) { > dev_err(adev->dev, > "afterReset callback failed for instance %u: %d\n", > instance_id, ret); > - return ret; > + goto exit; > } > } > } > > - return 0; > +exit: > + /* Restart the scheduler's work queue for the GFX and page rings > + * if they were stopped by this function. This allows new tasks > + * to be submitted to the queues after the reset is complete. > + */ > + if (ret) { > + if (gfx_sched_stopped && amdgpu_ring_sched_ready(gfx_ring)) { > + drm_sched_wqueue_start(&gfx_ring->sched); > + } > + if (page_sched_stopped && amdgpu_ring_sched_ready(page_ring)) { > + drm_sched_wqueue_start(&page_ring->sched); > + } > + } > + > + return ret; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h > index fbb8b04ef2cb..df5c9f7a4374 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h > @@ -99,8 +99,8 @@ struct amdgpu_sdma_ras { > }; > > struct sdma_on_reset_funcs { > - int (*pre_reset)(struct amdgpu_device *adev, uint32_t instance_id); > - int (*post_reset)(struct amdgpu_device *adev, uint32_t instance_id); > + int (*pre_reset)(struct amdgpu_device *adev, uint32_t instance_id, int src); > + int (*post_reset)(struct amdgpu_device *adev, uint32_t instance_id, int src); > /* Linked list node to store this structure in a list; */ > struct list_head list; > }; > @@ -166,7 +166,7 @@ struct amdgpu_buffer_funcs { > }; > > void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, > struct sdma_on_reset_funcs *funcs); > -int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t > instance_id); > +int amdgpu_sdma_reset_instance(struct amdgpu_device *adev, uint32_t > instance_id, int src); > > #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs- > >emit_copy_buffer((ib), (s), (d), (b), (t)) > #define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs- > >emit_fill_buffer((ib), (s), (d), (b)) > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > index 3e60456b0db0..c297b4a13680 100644 > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c > @@ -30,6 +30,7 @@ > #include "amdgpu_xcp.h" > #include "amdgpu_ucode.h" > #include "amdgpu_trace.h" > +#include "amdgpu_reset.h" > > #include "sdma/sdma_4_4_2_offset.h" > #include "sdma/sdma_4_4_2_sh_mask.h" > @@ -1480,6 +1481,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block > *ip_block) > if (r) > return r; > INIT_LIST_HEAD(&adev->sdma.reset_callback_list); > + > return r; > } > > @@ -1608,10 +1610,10 @@ static int sdma_v4_4_2_reset_queue(struct > amdgpu_ring *ring, unsigned int vmid) > { > struct amdgpu_device *adev = ring->adev; > u32 id = GET_INST(SDMA0, ring->me); > - return amdgpu_sdma_reset_instance(adev, id); > + return amdgpu_sdma_reset_instance(adev, id, > AMDGPU_RESET_SRC_SDMA_RING); > } > > -static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t > instance_id) > +static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t > instance_id, int src) > { > u32 inst_mask; > struct amdgpu_ring *ring = &adev->sdma.instance[instance_id].ring; > @@ -1628,7 +1630,7 @@ static int sdma_v4_4_2_stop_queue(struct > amdgpu_device *adev, uint32_t instance_ > return 0; > } > > -static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t > instance_id) > +static int sdma_v4_4_2_restore_queue(struct amdgpu_device *adev, uint32_t > instance_id, int src) > { > int i; > u32 inst_mask; > -- > 2.25.1