Am 11.03.25 um 09:33 schrieb Jesse.zhang@xxxxxxx: > From: "Jesse.zhang@xxxxxxx" <Jesse.zhang@xxxxxxx> > > This patch introduces two new functions, `amdgpu_sdma_stop_queue` and > `amdgpu_sdma_start_queue`, to handle the stopping and starting of SDMA queues > during engine reset operations. The changes include: > > 1. **New Functions**: > - `amdgpu_sdma_stop_queue`: Stops the SDMA queues and the scheduler's work queue > for the GFX and page rings. > - `amdgpu_sdma_start_queue`: Starts the SDMA queues and restarts the scheduler's > work queue for the GFX and page rings. > > 2. **Integration with Ring Functions**: > - The `stop_queue` and `start_queue` callbacks are added to the `amdgpu_ring_funcs` > structure and implemented for SDMA v4.4.2. > > Suggested-by: Jonathan Kim <jonathan.kim@xxxxxxx> > Signed-off-by: Jesse Zhang <Jesse.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 2 + > drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 92 ++++++++++++++++++++++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 2 + > 4 files changed, 97 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > index d55c8b7fdb59..ff9aacbdf046 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c > @@ -351,6 +351,7 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, > 0xffffffffffffffff : ring->buf_mask; > /* Initialize cached_rptr to 0 */ > ring->cached_rptr = 0; > + atomic_set(&ring->stop_refcount, 0); > > /* Allocate ring buffer */ > if (ring->is_mes_queue) { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > index 1c52ff92ea26..7a984dbb48c7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > @@ -312,6 +312,8 @@ struct amdgpu_ring { > unsigned int entry_index; > /* store the cached rptr to 
restore after reset */ > uint64_t cached_rptr; > + /* Reference counter for stop requests */ > + atomic_t stop_refcount; > > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > index 39669f8788a7..7cd6dcd6e7f0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c > @@ -30,6 +30,7 @@ > #define AMDGPU_CSA_SDMA_SIZE 64 > /* SDMA CSA reside in the 3rd page of CSA */ > #define AMDGPU_CSA_SDMA_OFFSET (4096 * 2) > +DEFINE_MUTEX(sdma_queue_mutex); Absolutely clear NAK to using a global mutex for this. Regards, Christian. > > /* > * GPU SDMA IP block helpers function. > @@ -504,6 +505,97 @@ void amdgpu_sdma_sysfs_reset_mask_fini(struct amdgpu_device *adev) > } > } > > +int amdgpu_sdma_stop_queue(struct amdgpu_device *adev, uint32_t instance_id) > +{ > + struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id]; > + struct amdgpu_ring *gfx_ring = &sdma_instance->ring; > + struct amdgpu_ring *page_ring = &sdma_instance->page; > + int r; > + > + mutex_lock(&sdma_queue_mutex); > + > + /* Avoid accidentally unparking the sched thread during GPU reset */ > + r = down_read_killable(&adev->reset_domain->sem); > + if (r) > + goto exit; > + > + /* Increment the reference counter */ > + atomic_inc(&gfx_ring->stop_refcount); > + if (adev->sdma.has_page_queue) > + atomic_inc(&page_ring->stop_refcount); > + > + if (atomic_read(&gfx_ring->stop_refcount) != 1 || > + (adev->sdma.has_page_queue && atomic_read(&page_ring->stop_refcount) != 1)) { > + up_read(&adev->reset_domain->sem); > + r = -EBUSY; > + goto exit; > + } > + > + if (!amdgpu_ring_sched_ready(gfx_ring)) > + drm_sched_wqueue_stop(&gfx_ring->sched); > + > + if (adev->sdma.has_page_queue && !amdgpu_ring_sched_ready(page_ring)) > + drm_sched_wqueue_stop(&page_ring->sched); > + > + if (gfx_ring->funcs && gfx_ring->funcs->stop_queue) > + gfx_ring->funcs->stop_queue(adev, instance_id); > + > + if 
(adev->sdma.has_page_queue && page_ring->funcs && page_ring->funcs->stop_queue) > + page_ring->funcs->stop_queue(adev, instance_id); > + > + up_read(&adev->reset_domain->sem); > + > +exit: > + mutex_unlock(&sdma_queue_mutex); > + return r; > +} > + > +int amdgpu_sdma_start_queue(struct amdgpu_device *adev, uint32_t instance_id) > +{ > + struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id]; > + struct amdgpu_ring *gfx_ring = &sdma_instance->ring; > + struct amdgpu_ring *page_ring = &sdma_instance->page; > + int r; > + > + mutex_lock(&sdma_queue_mutex); > + > + /* Avoid accidentally unparking the sched thread during GPU reset */ > + r = down_read_killable(&adev->reset_domain->sem); > + if (r) > + goto exit; > + > + /* Decrement the reference counter */ > + atomic_dec(&gfx_ring->stop_refcount); > + if (adev->sdma.has_page_queue) > + atomic_dec(&page_ring->stop_refcount); > + > + if (atomic_read(&gfx_ring->stop_refcount) != 0 || > + (adev->sdma.has_page_queue && atomic_read(&page_ring->stop_refcount) != 0)) { > + up_read(&adev->reset_domain->sem); > + r = -EBUSY; > + goto exit; > + } > + > + if (gfx_ring->funcs && gfx_ring->funcs->start_queue) > + gfx_ring->funcs->start_queue(adev, instance_id); > + > + if (adev->sdma.has_page_queue && page_ring->funcs && page_ring->funcs->start_queue) > + page_ring->funcs->start_queue(adev, instance_id); > + > + /* Restart the scheduler's work queue for the GFX and page rings */ > + if (amdgpu_ring_sched_ready(gfx_ring)) > + drm_sched_wqueue_start(&gfx_ring->sched); > + > + if (amdgpu_ring_sched_ready(page_ring)) > + drm_sched_wqueue_start(&page_ring->sched); > + > + up_read(&adev->reset_domain->sem); > + > +exit: > + mutex_unlock(&sdma_queue_mutex); > + return r; > +} > + > /** > * amdgpu_sdma_register_on_reset_callbacks - Register SDMA reset callbacks > * @funcs: Pointer to the callback structure containing pre_reset and post_reset functions > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h > index 965169320065..a91791fa3ecf 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h > @@ -170,6 +170,8 @@ struct amdgpu_buffer_funcs { > > void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct sdma_on_reset_funcs *funcs); > int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, bool suspend_user_queues); > +int amdgpu_sdma_stop_queue(struct amdgpu_device *adev, uint32_t instance_id); > +int amdgpu_sdma_start_queue(struct amdgpu_device *adev, uint32_t instance_id); > > #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs->emit_copy_buffer((ib), (s), (d), (b), (t)) > #define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b))