On 2023-10-30 4:23, Christian König wrote:
Am 28.10.23 um 00:39 schrieb Felix Kuehling:
Make restore workers freezable so we don't have to explicitly flush them
in suspend and GPU reset code paths, and we don't accidentally try to
restore BOs while the GPU is suspended. Not having to flush restore_work
also helps avoid lock/fence dependencies in the GPU reset case where we're
not allowed to wait for fences.
This is an RFC and request for testing. I have not tested this myself yet.
My notes below:
Restore work won't be frozen during GPU reset. Does it matter? Queues will
stay evicted until resume in any case. But restore_work may be in trouble
if it gets run in the middle of a GPU reset. In that case, if anything
fails, it will just reschedule itself, so should be fine as long as it
doesn't interfere with the reset itself (respects any mechanisms in place
to prevent HW access during the reset).
What HW access does restore_work perform:
- migrating buffers: uses GPU scheduler, should be suspended during reset
- TLB flushes in userptr restore worker: not called directly, relies on
HWS to flush TLBs on VMID attachment
- TLB flushes in SVM restore worker: Does TLB flush in the mapping code
- Resuming user mode queues: should not happen while GPU reset keeps queue
eviction counter elevated
Ergo: Except for the SVM case, it's OK to not flush restore work before
GPU resets. I'll need to rethink the interaction of SVM restore_work with
GPU resets.
It also sounds like the restore work is some high level work and shouldn't interact with the lower level GPU reset.
That was my hope as well. Just trying to think through to make sure I'm not making any incorrect assumptions.
Do you know if there is anything preventing a TLB flush using MMIO from messing up a GPU reset in progress? That's the only thing in the SVM restore work that tries to touch HW directly.
The TLB invalidations should be protected from GPU reset influence after I reworked TLB flush.
So I think we are save here.
What about cancelling p->eviction_work? Eviction work would normally be
needed to signal eviction fences, but we're doing that explicitly in
suspend_all_processes. Does eviction_work wait for fences anywhere? Yes,
indirectly by flushing restore_work. So we should not try to cancel it
during a GPU reset.
Problem: accessing p->ef concurrently in evict_process_worker and
suspend_all_processes. Need a spinlock to use and update it safely.
Problem: What if evict_process_worker gets stuck in flushing restore_work?
We can skip all of that if p->ef is NULL (which it is during the reset).
Even if it gets stuck, it's no problem if the reset doesn't depend on it.
It should get unstuck after the reset.
Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
---
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 9 ++--
drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 +
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 49 +++++++++++++------
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +-
4 files changed, 44 insertions(+), 19 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 54f31a420229..89e632257663 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1644,7 +1644,8 @@ int amdgpu_amdkfd_criu_resume(void *p)
goto out_unlock;
}
WRITE_ONCE(pinfo->block_mmu_notifications, false);
- schedule_delayed_work(&pinfo->restore_userptr_work, 0);
+ queue_delayed_work(system_freezable_wq,
+ &pinfo->restore_userptr_work, 0);
out_unlock:
mutex_unlock(&pinfo->lock);
@@ -2458,7 +2459,8 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
if (r)
pr_err("Failed to quiesce KFD\n");
- schedule_delayed_work(&process_info->restore_userptr_work,
+ queue_delayed_work(system_freezable_wq,
+ &process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
}
mutex_unlock(&process_info->notifier_lock);
@@ -2793,7 +2795,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
/* If validation failed, reschedule another attempt */
if (evicted_bos) {
- schedule_delayed_work(&process_info->restore_userptr_work,
+ queue_delayed_work(system_freezable_wq,
+ &process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
kfd_smi_event_queue_restore_rescheduled(mm);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 9cc32f577e38..cf017d027fee 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -919,6 +919,7 @@ struct kfd_process {
* during restore
*/
struct dma_fence *ef;
+ spinlock_t ef_lock;
/* Work items for evicting and restoring BOs */
struct delayed_work eviction_work;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index fbf053001af9..a07cba58ec5e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -664,7 +664,8 @@ int kfd_process_create_wq(void)
if (!kfd_process_wq)
kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
if (!kfd_restore_wq)
- kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
+ kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
+ WQ_FREEZABLE);
if (!kfd_process_wq || !kfd_restore_wq) {
kfd_process_destroy_wq();
@@ -1460,6 +1461,7 @@ static struct kfd_process *create_process(const struct task_struct *thread)
kref_init(&process->ref);
mutex_init(&process->mutex);
+ spin_lock_init(&process->ef_lock);
process->mm = thread->mm;
process->lead_thread = thread->group_leader;
process->n_pdds = 0;
@@ -1904,6 +1906,19 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
return -EINVAL;
}
+static void signal_eviction_fence(struct kfd_process *p)
+{
+ spin_lock(&p->ef_lock);
+ if (!p->ef)
+ goto unlock_out;
+ dma_fence_signal(p->ef);
This needs to grab the internal lock of the eviction fence, I'm not sure that has correct ordering with the newly added ef_lock.
Hmm, the only thing we could get a circular lock dependency would be, if we took the p->ef_lock in a fence callback. I don't think that ever happens, because even the eviction work runs on a worker thread (exactly to avoid such lock dependency issues).
Anyway, I could try to move the fence_signal out of the critical section. The lock is only there to ensure that exactly one process signals and frees the fence.
So basically either the eviction worker or the GPU reset could signal this fence.
In that case I think it would be simpler to grab the reset lock in the eviction worker to protect against a concurrent reset.
Regards,
Christian.
spin_lock(&p->ef_lock); ef = p->ef; WRITE_ONCE(p->ef, NULL); spin_unlock(&p->ef_lock); if (ef) { dma_fence_signal(ef); dma_fence_put(ef); }
+ dma_fence_put(p->ef);
+ WRITE_ONCE(p->ef, NULL);
+
+unlock_out:
+ spin_unlock(&p->ef_lock);
+}
+
static void evict_process_worker(struct work_struct *work)
{
int ret;
@@ -1916,8 +1931,11 @@ static void evict_process_worker(struct work_struct *work)
* lifetime of this thread, kfd_process p will be valid
*/
p = container_of(dwork, struct kfd_process, eviction_work);
- WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
- "Eviction fence mismatch\n");
+ /* If the eviction fence is not valid, we're probably in a suspend
+ * or GPU reset cycle. There is nothing to do in this case.
+ */
+ if (!READ_ONCE(p->ef))
+ return;
This evict process work is high level I don't think it should have any dependency on the GPU reset.
Right. This is not here to avoid issues, just a short-cut to avoid unnecessary work.
Regards,
Felix
Regards,
Christian.
/* Narrow window of overlap between restore and evict work
* item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos
@@ -1930,9 +1948,7 @@ static void evict_process_worker(struct work_struct *work)
pr_debug("Started evicting pasid 0x%x\n", p->pasid);
ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
if (!ret) {
- dma_fence_signal(p->ef);
- dma_fence_put(p->ef);
- p->ef = NULL;
+ signal_eviction_fence(p);
queue_delayed_work(kfd_restore_wq, &p->restore_work,
msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
@@ -1967,9 +1983,19 @@ static void restore_process_worker(struct work_struct *work)
p->last_restore_timestamp = get_jiffies_64();
/* VMs may not have been acquired yet during debugging. */
- if (p->kgd_process_info)
+ if (p->kgd_process_info) {
+ struct dma_fence *ef = NULL;
+
ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
- &p->ef);
+ &ef);
+ if (!ret) {
+ spin_lock(&p->ef_lock);
+ WARN_ONCE(p->ef, "Eviction fence is not NULL");
+ WRITE_ONCE(p->ef, ef);
+ spin_unlock(&p->ef_lock);
+ }
+ }
+
if (ret) {
pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
p->pasid, PROCESS_BACK_OFF_TIME_MS);
@@ -1994,14 +2020,9 @@ void kfd_suspend_all_processes(void)
WARN(debug_evictions, "Evicting all processes");
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
- cancel_delayed_work_sync(&p->eviction_work);
- flush_delayed_work(&p->restore_work);
-
if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
pr_err("Failed to suspend process 0x%x\n", p->pasid);
- dma_fence_signal(p->ef);
- dma_fence_put(p->ef);
- p->ef = NULL;
+ signal_eviction_fence(p);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index fe937670c927..aa6c34127be9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1869,7 +1869,7 @@ static void svm_range_restore_work(struct work_struct *work)
/* If validation failed, reschedule another attempt */
if (evicted_ranges) {
pr_debug("reschedule to restore svm range\n");
- schedule_delayed_work(&svms->restore_work,
+ queue_delayed_work(system_freezable_wq, &svms->restore_work,
msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
kfd_smi_event_queue_restore_rescheduled(mm);
@@ -1945,7 +1945,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
pr_debug("failed to quiesce KFD\n");
pr_debug("schedule to restore svm %p ranges\n", svms);
- schedule_delayed_work(&svms->restore_work,
+ queue_delayed_work(system_freezable_wq, &svms->restore_work,
msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
} else {
unsigned long s, l;