On every test attempt the crash has always happened at the
same location, while removing the 2nd queue of 3 with doorbell id 0x1002.
Below is the trace captured by adding more printouts to the problem
location, so that a message is also printed when a queue is evicted or
restored successfully.
[ 948.324174] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1202, queue: 2, caller: restore_process_queues_cpsch
[ 948.334344] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1002, queue: 1, caller: restore_process_queues_cpsch
[ 948.344499] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1000, queue: 0, caller: restore_process_queues_cpsch
[ 952.380614] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1202, queue: 2, caller: evict_process_queues_cpsch
[ 952.391330] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1002, queue: 1, caller: evict_process_queues_cpsch
[ 952.401634] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1000, queue: 0, caller: evict_process_queues_cpsch
[ 952.414507] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1202, queue: 2, caller: restore_process_queues_cpsch
[ 952.424618] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1002, queue: 1, caller: restore_process_queues_cpsch
[ 952.434922] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1000, queue: 0, caller: restore_process_queues_cpsch
[ 952.446272] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1202, queue: 2, caller: evict_process_queues_cpsch
[ 954.460341] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[ 954.460356] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes failed to remove hardware queue from MES, doorbell=0x1002, queue: 1, caller: evict_process_queues_cpsch
[ 954.460360] amdgpu 0000:c4:00.0: amdgpu: MES might be in unrecoverable state, issue a GPU reset
[ 954.460366] amdgpu 0000:c4:00.0: amdgpu: Failed to evict queue 1
[ 954.460368] amdgpu 0000:c4:00.0: amdgpu: Failed to evict process queues
[ 954.460439] amdgpu 0000:c4:00.0: amdgpu: GPU reset begin!
[ 954.460464] amdgpu 0000:c4:00.0: amdgpu: remove_all_queues_mes: Failed to remove queue 0 for dev 5257
[ 954.460515] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State
[ 954.462637] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State Completed
[ 955.865591] amdgpu: process_termination_cpsch started
[ 955.866432] amdgpu: process_termination_cpsch started
[ 955.866445] amdgpu 0000:c4:00.0: amdgpu: Failed to remove queue 0
[ 956.503043] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[ 956.503059] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
[ 958.507491] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[ 958.507507] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
[ 960.512077] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[ 960.512093] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
[ 960.785816] [drm:gfx_v11_0_hw_fini [amdgpu]] *ERROR* failed to halt cp gfx
Signed-off-by: Mika Laitio <lamikr@xxxxxxxxx>
---
.../drm/amd/amdkfd/kfd_device_queue_manager.c | 24 ++++++++++++-------
1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c79fe9069e22..96088d480e09 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1187,9 +1187,12 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
struct kfd_process_device *pdd;
int retval = 0;
+ // gfx1103 APU can fail to remove queue on evict/restore cycle
+ if (dqm->dev->adev->flags & AMD_IS_APU)
+ goto out;
dqm_lock(dqm);
if (qpd->evicted++ > 0) /* already evicted, do nothing */
- goto out;
+ goto out_unlock;
pdd = qpd_to_pdd(qpd);
@@ -1198,7 +1201,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
* Skip queue eviction on process eviction.
*/
if (!pdd->drm_priv)
- goto out;
+ goto out_unlock;
pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
pdd->process->pasid);
@@ -1219,7 +1222,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
if (retval) {
dev_err(dev, "Failed to evict queue %d\n",
q->properties.queue_id);
- goto out;
+ goto out_unlock;
}
}
}
@@ -1231,8 +1234,9 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
USE_DEFAULT_GRACE_PERIOD);
-out:
+out_unlock:
dqm_unlock(dqm);
+out:
return retval;
}
@@ -1326,14 +1330,17 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
uint64_t eviction_duration;
int retval = 0;
+ // gfx1103 APU can fail to remove queue on evict/restore cycle
+ if (dqm->dev->adev->flags & AMD_IS_APU)
+ goto out;
pdd = qpd_to_pdd(qpd);
dqm_lock(dqm);
if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
- goto out;
+ goto out_unlock;
if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
qpd->evicted--;
- goto out;
+ goto out_unlock;
}
/* The debugger creates processes that temporarily have not acquired
@@ -1364,7 +1371,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
if (retval) {
dev_err(dev, "Failed to restore queue %d\n",
q->properties.queue_id);
- goto out;
+ goto out_unlock;
}
}
}
@@ -1375,8 +1382,9 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
atomic64_add(eviction_duration, &pdd->evict_duration_counter);
vm_not_acquired:
qpd->evicted = 0;
-out:
+out_unlock:
dqm_unlock(dqm);
+out:
return retval;
}