If a fatal error is detected, packet submission won't go through. Return
an error in such cases. Also, avoid waiting for the fence when a fatal
error is detected.

Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c            |  5 +++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h            |  1 +
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c |  4 ++++
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c         |  8 +++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h         |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c       | 10 +++++-----
 6 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 190039f14c30..f5f2945711be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -742,6 +742,11 @@ void amdgpu_amdkfd_debug_mem_fence(struct amdgpu_device *adev)
 	amdgpu_device_flush_hdp(adev, NULL);
 }
 
+bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
+{
+	return amdgpu_ras_get_fed_status(adev);
+}
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 	enum amdgpu_ras_block block, bool reset)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index e60f63ccf79a..4fb32d86cd0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -337,6 +337,7 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
 		struct tile_config *config);
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
 		enum amdgpu_ras_block block, bool reset);
+bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);
 bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
 void amdgpu_amdkfd_block_mmu_notifications(void *p);
 int amdgpu_amdkfd_criu_resume(void *p);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c0e71543389a..f4d395e38683 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1903,6 +1903,10 @@ int amdkfd_fence_wait_timeout(struct device_queue_manager *dqm,
 	uint64_t *fence_addr = dqm->fence_addr;
 
 	while (*fence_addr != fence_value) {
+		/* Fatal err detected, this response won't come */
+		if (amdgpu_amdkfd_is_fed(dqm->dev->adev))
+			return -EIO;
+
 		if (time_after(jiffies, end_jiffies)) {
 			dev_err(dev, "qcm fence wait loop timeout expired\n");
 			/* In HWS case, this is used to halt the driver thread
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
index 1bea629c49ca..32c926986dbb 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c
@@ -286,7 +286,7 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq,
 	return -ENOMEM;
 }
 
-void kq_submit_packet(struct kernel_queue *kq)
+int kq_submit_packet(struct kernel_queue *kq)
 {
 #ifdef DEBUG
 	int i;
@@ -298,6 +298,10 @@ void kq_submit_packet(struct kernel_queue *kq)
 	}
 	pr_debug("\n");
 #endif
+	/* Fatal err detected, packet submission won't go through */
+	if (amdgpu_amdkfd_is_fed(kq->dev->adev))
+		return -EIO;
+
 	if (kq->dev->kfd->device_info.doorbell_size == 8) {
 		*kq->wptr64_kernel = kq->pending_wptr64;
 		write_kernel_doorbell64(kq->queue->properties.doorbell_ptr,
@@ -307,6 +311,8 @@ void kq_submit_packet(struct kernel_queue *kq)
 		*kq->wptr_kernel = kq->pending_wptr;
 		write_kernel_doorbell(kq->queue->properties.doorbell_ptr,
 					kq->pending_wptr);
 	}
+
+	return 0;
 }
 
 void kq_rollback_packet(struct kernel_queue *kq)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
index 9a6244430845..e24ee50acdf0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h
@@ -47,7 +47,7 @@ int kq_acquire_packet_buffer(struct kernel_queue *kq,
 			size_t packet_size_in_dwords,
 			unsigned int **buffer_ptr);
 
-void kq_submit_packet(struct kernel_queue *kq);
+int kq_submit_packet(struct kernel_queue *kq);
 
 void kq_rollback_packet(struct kernel_queue *kq);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
index 401096c103b2..d6f65f39072b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c
@@ -288,7 +288,7 @@ int pm_send_set_resources(struct packet_manager *pm,
 
 	retval = pm->pmf->set_resources(pm, buffer, res);
 	if (!retval)
-		kq_submit_packet(pm->priv_queue);
+		retval = kq_submit_packet(pm->priv_queue);
 	else
 		kq_rollback_packet(pm->priv_queue);
 
@@ -325,7 +325,7 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues)
 	if (retval)
 		goto fail_create_runlist;
 
-	kq_submit_packet(pm->priv_queue);
+	retval = kq_submit_packet(pm->priv_queue);
 
 	mutex_unlock(&pm->lock);
 
@@ -361,7 +361,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address,
 
 	retval = pm->pmf->query_status(pm, buffer, fence_address, fence_value);
 	if (!retval)
-		kq_submit_packet(pm->priv_queue);
+		retval = kq_submit_packet(pm->priv_queue);
 	else
 		kq_rollback_packet(pm->priv_queue);
 
@@ -392,7 +392,7 @@ int pm_update_grace_period(struct packet_manager *pm, uint32_t grace_period)
 
 		retval = pm->pmf->set_grace_period(pm, buffer, grace_period);
 		if (!retval)
-			kq_submit_packet(pm->priv_queue);
+			retval = kq_submit_packet(pm->priv_queue);
 		else
 			kq_rollback_packet(pm->priv_queue);
 	}
@@ -421,7 +421,7 @@ int pm_send_unmap_queue(struct packet_manager *pm,
 
 	retval = pm->pmf->unmap_queues(pm, buffer, filter, filter_param, reset);
 	if (!retval)
-		kq_submit_packet(pm->priv_queue);
+		retval = kq_submit_packet(pm->priv_queue);
 	else
 		kq_rollback_packet(pm->priv_queue);
 
-- 
2.25.1
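
For reference, a minimal standalone sketch of the error-propagation pattern this patch introduces: check the fatal-error state before ringing the doorbell, return -EIO, and let the packet_manager caller capture the result instead of ignoring it. The fed_detected flag and the submit_packet()/send_runlist() helpers below are hypothetical stand-ins for amdgpu_amdkfd_is_fed(), kq_submit_packet() and pm_send_runlist(); this is not driver code.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for amdgpu_ras_get_fed_status()/amdgpu_amdkfd_is_fed(): in the
 * driver this reflects the RAS "fatal error detected" state of the device. */
static bool fed_detected;

/* Sketch of kq_submit_packet(): bail out with -EIO before touching the
 * write pointer or doorbell when a fatal error has been detected. */
static int submit_packet(void)
{
	if (fed_detected)
		return -EIO;

	/* ... update wptr and ring the doorbell here ... */
	return 0;
}

/* Sketch of a packet_manager caller (e.g. pm_send_runlist): the submit
 * result is now stored in retval and propagated to the caller. */
static int send_runlist(void)
{
	int retval = 0;

	/* ... build and validate the runlist packet ... */
	retval = submit_packet();
	return retval;
}

int main(void)
{
	fed_detected = true;
	printf("send_runlist() -> %d (expect %d)\n", send_runlist(), -EIO);
	return 0;
}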