This is an RFC and request for testing.

v2:
- Reworked eviction fence signaling
- Introduced restore_process_helper

v3:
- Handle unsignaled eviction fences in restore_process_bos

Signed-off-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
Acked-by: Christian König <christian.koenig@xxxxxxx>
---
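The core of the change is moving the restore workers onto freezable
workqueues, so they cannot run while the system is suspended and no
longer need to be flushed explicitly in the suspend and GPU reset
paths. A minimal sketch of that pattern, with illustrative names
(my_restore_worker and my_schedule_restore are not from this patch):

    #include <linux/workqueue.h>
    #include <linux/jiffies.h>

    static void my_restore_worker(struct work_struct *work)
    {
            /* revalidate BOs and resume user queues here */
    }

    static DECLARE_DELAYED_WORK(my_restore_work, my_restore_worker);

    static void my_schedule_restore(unsigned int delay_ms)
    {
            /* system_freezable_wq is frozen during suspend, so this
             * work is guaranteed not to run until after resume.
             */
            queue_delayed_work(system_freezable_wq, &my_restore_work,
                               msecs_to_jiffies(delay_ms));
    }
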
.../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 68 +++++++++++----
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 87 +++++++++++--------
drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 4 +-
3 files changed, 104 insertions(+), 55 deletions(-)
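
Because evict and restore work can now run concurrently with suspend
and GPU reset, the per-process eviction fence pointer becomes
RCU-protected and signaling must tolerate races. A minimal sketch of
the lookup-and-signal pattern used here, with illustrative names
(my_process and my_signal_eviction_fence are not from this patch):

    #include <linux/dma-fence.h>
    #include <linux/rcupdate.h>

    struct my_process {
            struct dma_fence __rcu *ef;
    };

    static int my_signal_eviction_fence(struct my_process *p)
    {
            struct dma_fence *ef;
            int ret;

            rcu_read_lock();
            /* Safely takes a reference even if the pointer is being
             * replaced concurrently.
             */
            ef = dma_fence_get_rcu_safe(&p->ef);
            rcu_read_unlock();

            if (!ef)
                    return -EINVAL;

            /* Returns -EINVAL if the fence was already signaled by
             * another thread; the caller can use that to decide who
             * schedules the restore work.
             */
            ret = dma_fence_signal(ef);
            dma_fence_put(ef);

            return ret;
    }
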
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 2e302956a279..bdec88713a09 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1431,7 +1431,6 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
amdgpu_amdkfd_restore_userptr_worker);
*process_info = info;
- *ef = dma_fence_get(&info->eviction_fence->base);
}
vm->process_info = *process_info;
@@ -1462,6 +1461,8 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
list_add_tail(&vm->vm_list_node,
&(vm->process_info->vm_list_head));
vm->process_info->n_vms++;
+
+ *ef = dma_fence_get(&vm->process_info->eviction_fence->base);
mutex_unlock(&vm->process_info->lock);
return 0;
@@ -1473,10 +1474,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
reserve_pd_fail:
vm->process_info = NULL;
if (info) {
- /* Two fence references: one in info and one in *ef */
dma_fence_put(&info->eviction_fence->base);
- dma_fence_put(*ef);
- *ef = NULL;
*process_info = NULL;
put_pid(info->pid);
create_evict_fence_fail:
@@ -1670,7 +1668,8 @@ int amdgpu_amdkfd_criu_resume(void *p)
goto out_unlock;
}
WRITE_ONCE(pinfo->block_mmu_notifications, false);
- schedule_delayed_work(&pinfo->restore_userptr_work, 0);
+ queue_delayed_work(system_freezable_wq,
+ &pinfo->restore_userptr_work, 0);
out_unlock:
mutex_unlock(&pinfo->lock);
@@ -2475,7 +2474,8 @@ int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
KFD_QUEUE_EVICTION_TRIGGER_USERPTR);
if (r)
pr_err("Failed to quiesce KFD\n");
- schedule_delayed_work(&process_info->restore_userptr_work,
+ queue_delayed_work(system_freezable_wq,
+ &process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
}
mutex_unlock(&process_info->notifier_lock);
@@ -2810,7 +2810,8 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
/* If validation failed, reschedule another attempt */
if (evicted_bos) {
- schedule_delayed_work(&process_info->restore_userptr_work,
+ queue_delayed_work(system_freezable_wq,
+ &process_info->restore_userptr_work,
msecs_to_jiffies(AMDGPU_USERPTR_RESTORE_DELAY_MS));
kfd_smi_event_queue_restore_rescheduled(mm);
@@ -2819,6 +2820,23 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
put_task_struct(usertask);
}
+static void replace_eviction_fence(struct dma_fence **ef,
+ struct dma_fence *new_ef)
+{
+ struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
+ /* protected by process_info->lock */);
+
+ /* If we're replacing an unsignaled eviction fence, that fence will
+ * never be signaled, and if anyone is still waiting on that fence,
+ * they will hang forever. This should never happen. We should only
+ * replace the fence in restore_work, which only gets scheduled after
+ * the eviction work has signaled the fence.
+ */
+ WARN_ONCE(!dma_fence_is_signaled(old_ef),
+ "Replacing unsignaled eviction fence");
+ dma_fence_put(old_ef);
+}
+
/** amdgpu_amdkfd_gpuvm_restore_process_bos - Restore all BOs for the given
 * KFD process identified by process_info
*
@@ -2844,7 +2862,6 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
struct amdgpu_vm *peer_vm;
struct kgd_mem *mem;
struct bo_vm_reservation_context ctx;
- struct amdgpu_amdkfd_fence *new_fence;
int ret = 0, i;
struct list_head duplicate_save;
struct amdgpu_sync sync_obj;
@@ -2974,22 +2991,35 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
/* Wait for validate and PT updates to finish */
amdgpu_sync_wait(&sync_obj, false);
- /* Release old eviction fence and create new one, because fence only
- * goes from unsignaled to signaled, fence cannot be reused.
- * Use context and mm from the old fence.
+ /* The old eviction fence may be unsignaled if restore happens
+ * after a GPU reset or suspend/resume. Keep the old fence in that
+ * case. Otherwise release the old eviction fence and create a new
+ * one, because a fence only goes from unsignaled to signaled once
+ * and cannot be reused. Use context and mm from the old fence.
+ *
+ * If an old eviction fence signals after this check, that's OK.
+ * Anyone signaling an eviction fence must stop the queues first
+ * and schedule another restore worker.
*/
- new_fence = amdgpu_amdkfd_fence_create(
+ if (dma_fence_is_signaled(&process_info->eviction_fence->base)) {
+ struct amdgpu_amdkfd_fence *new_fence =
+ amdgpu_amdkfd_fence_create(
process_info->eviction_fence->base.context,
process_info->eviction_fence->mm,
NULL);
- if (!new_fence) {
- pr_err("Failed to create eviction fence\n");
- ret = -ENOMEM;
- goto validate_map_fail;
+
+ if (!new_fence) {
+ pr_err("Failed to create eviction fence\n");
+ ret = -ENOMEM;
+ goto validate_map_fail;
+ }
+ dma_fence_put(&process_info->eviction_fence->base);
+ process_info->eviction_fence = new_fence;
+ replace_eviction_fence(ef, dma_fence_get(&new_fence->base));
+ } else {
+ WARN_ONCE(*ef != &process_info->eviction_fence->base,
+ "KFD eviction fence doesn't match KGD process_info");
}
- dma_fence_put(&process_info->eviction_fence->base);
- process_info->eviction_fence = new_fence;
- *ef = dma_fence_get(&new_fence->base);
/* Attach new eviction fence to all BOs except pinned ones */
list_for_each_entry(mem, &process_info->kfd_bo_list,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index c10d050e1a61..71df51fcc1b0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -664,7 +664,8 @@ int kfd_process_create_wq(void)
if (!kfd_process_wq)
kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
if (!kfd_restore_wq)
- kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
+ kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
+ WQ_FREEZABLE);
if (!kfd_process_wq || !kfd_restore_wq) {
kfd_process_destroy_wq();
@@ -1642,6 +1643,7 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
struct amdgpu_fpriv *drv_priv;
struct amdgpu_vm *avm;
struct kfd_process *p;
+ struct dma_fence *ef;
struct kfd_node *dev;
int ret;
@@ -1661,11 +1663,12 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, avm,
&p->kgd_process_info,
- &p->ef);
+ &ef);
if (ret) {
pr_err("Failed to create process VM object\n");
return ret;
}
+ RCU_INIT_POINTER(p->ef, ef);
pdd->drm_priv = drm_file->private_data;
ret = kfd_process_device_reserve_ib_mem(pdd);
@@ -1908,6 +1911,21 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
return -EINVAL;
}
+static int signal_eviction_fence(struct kfd_process *p)
+{
+ struct dma_fence *ef;
+ int ret;
+
+ rcu_read_lock();
+ ef = dma_fence_get_rcu_safe(&p->ef);
+ rcu_read_unlock();
+
+ ret = dma_fence_signal(ef);
+ dma_fence_put(ef);
+
+ return ret;
+}
+
static void evict_process_worker(struct work_struct *work)
{
int ret;
@@ -1920,31 +1938,46 @@ static void evict_process_worker(struct work_struct *work)
* lifetime of this thread, kfd_process p will be valid
*/
p = container_of(dwork, struct kfd_process, eviction_work);
- WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
- "Eviction fence mismatch\n");
-
- /* Narrow window of overlap between restore and evict work
- * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos
- * unreserves KFD BOs, it is possible to evicted again. But
- * restore has few more steps of finish. So lets wait for any
- * previous restore work to complete
- */
- flush_delayed_work(&p->restore_work);
pr_debug("Started evicting pasid 0x%x\n", p->pasid);
ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
if (!ret) {
- dma_fence_signal(p->ef);
- dma_fence_put(p->ef);
- p->ef = NULL;
- queue_delayed_work(kfd_restore_wq, &p->restore_work,
+ /* If another thread already signaled the eviction fence,
+ * they are responsible for stopping the queues and scheduling
+ * the restore work.
+ */
+ if (!signal_eviction_fence(p))
+ queue_delayed_work(kfd_restore_wq, &p->restore_work,
msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
+ else
+ kfd_process_restore_queues(p);
pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
} else
pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
}
+static int restore_process_helper(struct kfd_process *p)
+{
+ int ret = 0;
+
+ /* VMs may not have been acquired yet during debugging. */
+ if (p->kgd_process_info) {
+ ret = amdgpu_amdkfd_gpuvm_restore_process_bos(
+ p->kgd_process_info, &p->ef);
+ if (ret)
+ return ret;
+ }
+
+ ret = kfd_process_restore_queues(p);
+ if (!ret)
+ pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
+ else
+ pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
+
+ return ret;
+}
+
static void restore_process_worker(struct work_struct *work)
{
struct delayed_work *dwork;
@@ -1970,24 +2003,15 @@ static void restore_process_worker(struct work_struct *work)
*/
p->last_restore_timestamp = get_jiffies_64();
- /* VMs may not have been acquired yet during debugging. */
- if (p->kgd_process_info)
- ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
- &p->ef);
+
+ ret = restore_process_helper(p);
if (ret) {
pr_debug("Failed to restore BOs of pasid 0x%x, retry after
%d ms\n",
p->pasid, PROCESS_BACK_OFF_TIME_MS);
ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
WARN(!ret, "reschedule restore work failed\n");
- return;
}
-
- ret = kfd_process_restore_queues(p);
- if (!ret)
- pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
- else
- pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
}
void kfd_suspend_all_processes(void)
@@ -1998,14 +2022,9 @@ void kfd_suspend_all_processes(void)
WARN(debug_evictions, "Evicting all processes");
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
- cancel_delayed_work_sync(&p->eviction_work);
- flush_delayed_work(&p->restore_work);
-
if (kfd_process_evict_queues(p,
KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
pr_err("Failed to suspend process 0x%x\n", p->pasid);
- dma_fence_signal(p->ef);
- dma_fence_put(p->ef);
- p->ef = NULL;
+ signal_eviction_fence(p);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
}
@@ -2017,7 +2036,7 @@ int kfd_resume_all_processes(void)
int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
- if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
+ if (restore_process_helper(p)) {
pr_err("Restore process %d failed during resume\n",
p->pasid);
ret = -EFAULT;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index b23ba92a794c..42b5279c7010 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1871,7 +1871,7 @@ static void svm_range_restore_work(struct work_struct *work)
/* If validation failed, reschedule another attempt */
if (evicted_ranges) {
pr_debug("reschedule to restore svm range\n");
- schedule_delayed_work(&svms->restore_work,
+ queue_delayed_work(system_freezable_wq, &svms->restore_work,
msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
kfd_smi_event_queue_restore_rescheduled(mm);
@@ -1947,7 +1947,7 @@ svm_range_evict(struct svm_range *prange, struct mm_struct *mm,
pr_debug("failed to quiesce KFD\n");
pr_debug("schedule to restore svm %p ranges\n", svms);
- schedule_delayed_work(&svms->restore_work,
+ queue_delayed_work(system_freezable_wq, &svms->restore_work,
msecs_to_jiffies(AMDGPU_SVM_RANGE_RESTORE_DELAY_MS));
} else {
unsigned long s, l;