This is an RFC and request for testing.

v2:
- Reworked eviction fence signaling
- Introduced restore_process_helper

v3:
- Handle unsignaled eviction fences in restore_process_bos

Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
Acked-by: Christian König <christian.koenig@xxxxxxx>
---
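The core of the change is moving the restore workers onto freezable
workqueues, so they cannot run while the system is suspended and no
longer need to be flushed explicitly in the suspend and GPU reset
paths. A minimal sketch of that pattern, with illustrative names
(my_restore_worker and my_schedule_restore are not from this patch):

    #include <linux/workqueue.h>
    #include <linux/jiffies.h>

    static void my_restore_worker(struct work_struct *work)
    {
            /* revalidate BOs and resume user queues here */
    }

    static DECLARE_DELAYED_WORK(my_restore_work, my_restore_worker);

    static void my_schedule_restore(unsigned int delay_ms)
    {
            /* system_freezable_wq is frozen during suspend, so this
             * work is guaranteed not to run until after resume.
             */
            queue_delayed_work(system_freezable_wq, &my_restore_work,
                               msecs_to_jiffies(delay_ms));
    }
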
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 68 +++++++++++----
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 87 +++++++++++--------
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +-
3 files changed, 104 insertions(+), 55 deletions(-)
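
Because evict and restore work can now run concurrently with suspend
and GPU reset, the per-process eviction fence pointer becomes
RCU-protected and signaling must tolerate races. A minimal sketch of
the lookup-and-signal pattern used here, with illustrative names
(my_process and my_signal_eviction_fence are not from this patch):

    #include <linux/dma-fence.h>
    #include <linux/rcupdate.h>

    struct my_process {
            struct dma_fence __rcu *ef;
    };

    static int my_signal_eviction_fence(struct my_process *p)
    {
            struct dma_fence *ef;
            int ret;

            rcu_read_lock();
            /* Safely takes a reference even if the pointer is being
             * replaced concurrently.
             */
            ef = dma_fence_get_rcu_safe(&p->ef);
            rcu_read_unlock();

            if (!ef)
                    return -EINVAL;

            /* Returns -EINVAL if the fence was already signaled by
             * another thread; the caller can use that to decide who
             * schedules the restore work.
             */
            ret = dma_fence_signal(ef);
            dma_fence_put(ef);

            return ret;
    }
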
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 2e302956a279..bdec88713a09 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1431,7 +1431,6 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
amdgpu_amdkfd_restore_userptr_worker);
*process_info = info;
- *ef = dma_fence_get(&info->eviction_fence->base);
}
vm->process_info = *process_info;
@@ -1462,6 +1461,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
list_add_tail(&vm->vm_list_node,
&(vm->process_info->vm_list_head));
vm->process_info->n_vms++;
+
+ *ef = dma_fence_get(&vm->process_info->eviction_fence->base);
mutex_unlock(&vm->process_info->lock);
return 0;
@@ -1473,10 +1474,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
reserve_pd_fail:
vm->process_info = NULL;
if (info) {
- /* Two fence references: one in info and one in *ef */
dma_fence_put(&info->eviction_fence->base);
- dma_fence_put(*ef);
- *ef = NULL;
*process_info = NULL;
put_pid(info->pid);
create_evict_fence_fail:
@@ -1670,7 +1668,8 @@ int amdgpu_amdkfd_criu_resume(void *p)
goto out_unlock;
}
WRITE_ONCE(pinfo->block_mmu_notifications, false);
- schedule_delayed_work(&pinfo->restore_userptr_work, 0);
+ queue_delayed_work(system_freezable_wq,
+ &pinfo->restore_userptr_work, 0);
out_unlock:
mutex_unlock(&pinfo->lock);
@@ -2475,7 +2474,8 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
if (r)
pr_err("Failed to quiesce KFD\n");
- schedule_delayed_work(&process_info->restore_userptr_work,
+ queue_delayed_work(system_freezable_wq,
+ &process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
}
mutex_unlock(&process_info->notifier_lock);
@@ -2810,7 +2810,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
/* If validation failed, reschedule another attempt */
if (evicted_bos) {
- schedule_delayed_work(&process_info->restore_userptr_work,
+ queue_delayed_work(system_freezable_wq,
+ &process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
kfd_smi_event_queue_restore_rescheduled(mm);
@@ -2819,6 +2820,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
put_task_struct(usertask);
}
+static void replace_eviction_fence(struct dma_fence **ef,
+ struct dma_fence *new_ef)
+{
+ struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
+ /* protected by process_info->lock */);
+
+ /* If we're replacing an unsignaled eviction fence, that fence will
+ * never be signaled, and if anyone is still waiting on that fence,
+ * they will hang forever. This should never happen. We should only
+ * replace the fence in restore_work, which only gets scheduled after
+ * the eviction work has signaled the fence.
+ */
+ WARN_ONCE(!dma_fence_is_signaled(old_ef),
+ "Replacing unsignaled eviction fence");
+ dma_fence_put(old_ef);
+}
+
/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
 * KFD process identified by process_info
*
@@ -2844,7 +2862,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
struct amdgpu_vm *peer_vm;
struct kgd_mem *mem;
struct bo_vm_reservation_context ctx;
- struct amdgpu_amdkfd_fence *new_fence;
int ret = 0, i;
struct list_head duplicate_save;
struct amdgpu_sync sync_obj;
@@ -2974,22 +2991,35 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
/* Wait for validate and PT updates to finish */
amdgpu_sync_wait(&sync_obj, false);
- /* Release old eviction fence and create new one, because fence only
- * goes from unsignaled to signaled, fence cannot be reused.
- * Use context and mm from the old fence.
+ /* The old eviction fence may be unsignaled if restore happens
+ * after a GPU reset or suspend/resume. Keep the old fence in that
+ * case. Otherwise release the old eviction fence and create a new
+ * one, because a fence only goes from unsignaled to signaled once
+ * and cannot be reused. Use context and mm from the old fence.
+ *
+ * If an old eviction fence signals after this check, that's OK.
+ * Anyone signaling an eviction fence must stop the queues first
+ * and schedule another restore worker.
*/
- new_fence = amdgpu_amdkfd_fence_create(
+ if (dma_fence_is_signaled(&process_info->eviction_fence->base)) {
+ struct amdgpu_amdkfd_fence *new_fence =
+ amdgpu_amdkfd_fence_create(
process_info->eviction_fence->base.context,
process_info->eviction_fence->mm,
NULL);
- if (!new_fence) {
- pr_err("Failed to create eviction fence\n");
- ret = -ENOMEM;
- goto validate_map_fail;
+
+ if (!new_fence) {
+ pr_err("Failed to create eviction fence\n");
+ ret = -ENOMEM;
+ goto validate_map_fail;
+ }
+ dma_fence_put(&process_info->eviction_fence->base);
+ process_info->eviction_fence = new_fence;
+ replace_eviction_fence(ef, dma_fence_get(&new_fence->base));
+ } else {
+ WARN_ONCE(*ef != &process_info->eviction_fence->base,
+ "KFD eviction fence doesn't match KGD process_info");
}
- dma_fence_put(&process_info->eviction_fence->base);
- process_info->eviction_fence = new_fence;
- *ef = dma_fence_get(&new_fence->base);
/* Attach new eviction fence to all BOs except pinned ones */
list_for_each_entry(mem, &process_info->kfd_bo_list,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index c10d050e1a61..71df51fcc1b0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -664,7 +664,8 @@ int kfd_process_create_wq(void)
if (!kfd_process_wq)
kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
if (!kfd_restore_wq)
- kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
+ kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
+ WQ_FREEZABLE);
if (!kfd_process_wq || !kfd_restore_wq) {
kfd_process_destroy_wq();
@@ -1642,6 +1643,7 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
struct amdgpu_fpriv *drv_priv;
struct amdgpu_vm *avm;
struct kfd_process *p;
+ struct dma_fence *ef;
struct kfd_node *dev;
int ret;
@@ -1661,11 +1663,12 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, avm,
&p->kgd_process_info,
- &p->ef);
+ &ef);
if (ret) {
pr_err("Failed to create process VM object\n");
return ret;
}
+ RCU_INIT_POINTER(p->ef, ef);
pdd->drm_priv = drm_file->private_data;
ret = kfd_process_device_reserve_ib_mem(pdd);
@@ -1908,6 +1911,21 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
return -EINVAL;
}
+static int signal_eviction_fence(struct kfd_process *p)
+{
+ struct dma_fence *ef;
+ int ret;
+
+ rcu_read_lock();
+ ef = dma_fence_get_rcu_safe(&p->ef);
+ rcu_read_unlock();
+
+ ret = dma_fence_signal(ef);
+ dma_fence_put(ef);
+
+ return ret;
+}
+
static void evict_process_worker(struct work_struct *work)
{
int ret;
@@ -1920,31 +1938,46 @@ static void evict_process_worker(struct work_struct *work)
* lifetime of this thread, kfd_process p will be valid
*/
p = container_of(dwork, struct kfd_process, eviction_work);
- WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
- "Eviction fence mismatch\n");
-
- /* Narrow window of overlap between restore and evict work
- * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos
- * unreserves KFD BOs, it is possible to evicted again. But
- * restore has few more steps of finish. So lets wait for any
- * previous restore work to complete
- */
- flush_delayed_work(&p->restore_work);
pr_debug("Started evicting pasid 0x%x\n", p->pasid);
ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
if (!ret) {
- dma_fence_signal(p->ef);
- dma_fence_put(p->ef);
- p->ef = NULL;
- queue_delayed_work(kfd_restore_wq, &p->restore_work,
+ /* If another thread already signaled the eviction fence,
+ * they are responsible for stopping the queues and scheduling
+ * the restore work.
+ */
+ if (!signal_eviction_fence(p))
+ queue_delayed_work(kfd_restore_wq, &p->restore_work,
msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
+ else
+ kfd_process_restore_queues(p);
pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
} else
pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
}
+static int restore_process_helper(struct kfd_process *p)
+{
+ int ret = 0;
+
+ /* VMs may not have been acquired yet during debugging. */
+ if (p->kgd_process_info) {
+ ret = amdgpu_amdkfd_gpuvm_restore_process_bos(
+ p->kgd_process_info, &p->ef);
+ if (ret)
+ return ret;
+ }
+
+ ret = kfd_process_restore_queues(p);
+ if (!ret)
+ pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
+ else
+ pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
+
+ return ret;
+}
+
static void restore_process_worker(struct work_struct *work)
{
struct delayed_work *dwork;
@@ -1970,24 +2003,15 @@ static void restore_process_worker(struct work_struct *work)
*/
p->last_restore_timestamp = get_jiffies_64();
- /* VMs may not have been acquired yet during debugging. */
- if (p->kgd_process_info)
- ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
- &p->ef);
+
+ ret = restore_process_helper(p);
if (ret) {
pr_debug("Failed to restore BOs of pasid 0x%x, retry after
%d ms\n",
p->pasid, PROCESS_BACK_OFF_TIME_MS);
ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
WARN(!ret, "reschedule restore work failed\n");
- return;
}
-
- ret = kfd_process_restore_queues(p);
- if (!ret)
- pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
- else
- pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
}
void kfd_suspend_all_processes(void)
@@ -1998,14 +2022,9 @@ void kfd_suspend_all_processes(void)
WARN(debug_evictions, "Evicting all processes");
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
- cancel_delayed_work_sync(&p->eviction_work);
- flush_delayed_work(&p->restore_work);
-
if (kfd_process_evict_queues(p,
KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
pr_err("Failed to suspend process 0x%x\n", p->pasid);
- dma_fence_signal(p->ef);
- dma_fence_put(p->ef);
- p->ef = NULL;
+ signal_eviction_fence(p);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
}
@@ -2017,7 +2036,7 @@ int kfd_resume_all_processes(void)
int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
- if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
+ if (restore_process_helper(p)) {
pr_err("Restore process %d failed during resume\n",
p->pasid);
ret = -EFAULT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index b23ba92a794c..42b5279c7010 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1871,7 +1871,7 @@ static void svm_range_restore_work(struct work_struct *work)
/* If validation failed, reschedule another attempt */
if (evicted_ranges) {
pr_debug("reschedule to restore svm range\n");
- schedule_delayed_work(&svms->restore_work,
+ queue_delayed_work(system_freezable_wq, &svms->restore_work,
msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
kfd_smi_event_queue_restore_rescheduled(mm);
@@ -1947,7 +1947,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
pr_debug("failed to quiesce KFD\n");
pr_debug("schedule to restore svm %p ranges\n", svms);
- schedule_delayed_work(&svms->restore_work,
+ queue_delayed_work(system_freezable_wq, &svms->restore_work,
msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
} else {
unsigned long s, l;