Move the kfd suspend/resume code into the caller. That is where the KFD is likely to detect a reset so on the KFD side there is no need to call them. Also add a mutex to lock the actual reset sequence. Fixes: bac38ca8c475 ("drm/amdkfd: implement per queue sdma reset for gfx 9.4+") Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 15 +++------------ drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h | 3 ++- drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 8 +++++++- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 +- 5 files changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1d8cfdc51bdcb..611b27c91a18a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4281,6 +4281,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, mutex_init(&adev->gfx.kfd_sch_mutex); mutex_init(&adev->gfx.workload_profile_mutex); mutex_init(&adev->vcn.workload_profile_mutex); + mutex_init(&adev->sdma.engine_reset_mutex); amdgpu_device_init_apu_flags(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 03c4c012a1508..0b2529ef5a963 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -524,7 +524,6 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct * amdgpu_sdma_reset_engine - Reset a specific SDMA engine * @adev: Pointer to the AMDGPU device * @instance_id: ID of the SDMA engine instance to reset - * @suspend_user_queues: check if suspend user queue. * * This function performs the following steps: * 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their state. @@ -533,7 +532,7 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct * * Returns: 0 on success, or a negative error code on failure. */ -int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, bool suspend_user_queues) +int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id) { struct sdma_on_reset_funcs *funcs; int ret = 0; @@ -542,13 +541,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b struct amdgpu_ring *page_ring = &sdma_instance->page; bool gfx_sched_stopped = false, page_sched_stopped = false; - /* Suspend KFD if suspend_user_queues is true. - * prevent the destruction of in-flight healthy user queue packets and - * avoid race conditions between KFD and KGD during the reset process. - */ - if (suspend_user_queues) - amdgpu_amdkfd_suspend(adev, false); - + mutex_lock(&adev->sdma.engine_reset_mutex); /* Stop the scheduler's work queue for the GFX and page rings if they are running. * This ensures that no new tasks are submitted to the queues while * the reset is in progress. @@ -609,9 +602,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b drm_sched_wqueue_start(&page_ring->sched); } } - - if (suspend_user_queues) - amdgpu_amdkfd_resume(adev, false); + mutex_unlock(&adev->sdma.engine_reset_mutex); return ret; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h index 9651693200655..d3a44a8cf6104 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h @@ -129,6 +129,7 @@ struct amdgpu_sdma { /* track guilty state of GFX and PAGE queues */ bool gfx_guilty; bool page_guilty; + struct mutex engine_reset_mutex; }; /* @@ -169,7 +170,7 @@ struct amdgpu_buffer_funcs { }; void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct sdma_on_reset_funcs *funcs); -int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, bool suspend_user_queues); +int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id); #define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs->emit_copy_buffer((ib), (s), (d), (b), (t)) #define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b)) diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c index fd34dc1380811..14e4f7358cc4c 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c @@ -1666,7 +1666,13 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid) { struct amdgpu_device *adev = ring->adev; u32 id = GET_INST(SDMA0, ring->me); - return amdgpu_sdma_reset_engine(adev, id, true); + int r; + + amdgpu_amdkfd_suspend(adev, false); + r = amdgpu_sdma_reset_engine(adev, id); + amdgpu_amdkfd_resume(adev, false); + + return r; } static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 2ed003d3ff0e0..c610e172a2b83 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -2310,7 +2310,7 @@ static int reset_hung_queues_sdma(struct device_queue_manager *dqm) continue; /* Reset engine and check. */ - if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) || + if (amdgpu_sdma_reset_engine(dqm->dev->adev, i) || dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) || !set_sdma_queue_as_reset(dqm, doorbell_off)) { r = -ENOTRECOVERABLE; -- 2.48.1