Re: [PATCH v11 24/28] drm/amdgpu: resume gfx userqueues

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 17/09/2024 14:30, Christian König wrote:
Am 09.09.24 um 22:06 schrieb Shashank Sharma:
This patch adds support for userqueue resume. What it typically does is
this:
- adds a new delayed work for resuming all the queues.
- schedules this delayed work from the suspend work.
- validates the BOs and replaces the eviction fence before resuming all
   the queues running under this instance of userq manager.

V2: Addressed Christian's review comments:
     - declare local variables like ret at the bottom.
     - lock all the objects first, then start attaching the new fence.
     - don't replace old eviction fence, just attach new eviction fence.
     - no error logs for drm_exec_lock failures
     - no need to reserve bos after drm_exec_locked
     - schedule the resume worker immediately (not after 100 ms)
     - check for NULL BO (Arvind)

Cc: Alex Deucher <alexander.deucher@xxxxxxx>
Cc: Christian Koenig <christian.koenig@xxxxxxx>
Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx>
Signed-off-by: Arvind Yadav <arvind.yadav@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 120 ++++++++++++++++++
  .../gpu/drm/amd/include/amdgpu_userqueue.h    |   1 +
  2 files changed, 121 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
index 979174f80993..e7f7354e0c0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c
@@ -405,6 +405,122 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data,
      return r;
  }
  +static int
+amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr)
+{
+    struct amdgpu_device *adev = uq_mgr->adev;
+    const struct amdgpu_userq_funcs *userq_funcs;
+    struct amdgpu_usermode_queue *queue;
+    int queue_id, ret;
+
+    userq_funcs = adev->userq_funcs[AMDGPU_HW_IP_GFX];
+
+    /* Resume all the queues for this process */
+    idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) {
+        ret = userq_funcs->resume(uq_mgr, queue);
+        if (ret)
+            DRM_ERROR("Failed to resume queue %d\n", queue_id);
+    }
+
+    return ret;
+}
+
+/*
+ * Lock, validate and re-fence the BOs of the VM that belongs to @uq_mgr.
+ *
+ * Phase 1 locks the VM's PD (amdgpu_vm_lock_pd()) and every BO found on the
+ * VM's done and invalidated lists. Phase 2 validates the BOs on the
+ * invalidated list, calls amdgpu_vm_handle_moved() and finally
+ * amdgpu_eviction_fence_replace_fence().
+ *
+ * Returns 0 on success or the error code of the first step that failed.
+ */
+static int
+amdgpu_userqueue_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
+{
+    struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
+    struct amdgpu_vm *vm = &fpriv->vm;
+    struct amdgpu_bo_va *bo_va, *tmp;
+    struct drm_exec exec;
+    struct amdgpu_bo *bo;
+    int ret;
+
+    drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
+    drm_exec_until_all_locked(&exec) {
+        /* NOTE(review): the 2 is presumably the number of fence slots to reserve - confirm */
+        ret = amdgpu_vm_lock_pd(vm, &exec, 2);
+        drm_exec_retry_on_contention(&exec);
+        if (unlikely(ret)) {
+            DRM_ERROR("Failed to lock PD\n");

I would drop those error messages in the low level function.

The most likely cause (except for contention) why locking a BO fails is because we were interrupted, and for that we actually don't want to print anything.

Apart from that I really need to wrap my head around the VM code once more, but that here should probably work for now.

Noted, I will remove the error message.

- Shashank


Regards,
Christian.

+            goto unlock_all;
+        }
+
+        /*
+         * Lock the done list: every bo_va on vm->done that still has a
+         * BO attached; entries whose BO pointer is NULL are skipped.
+         */
+        list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status) {
+            bo = bo_va->base.bo;
+            if (!bo)
+                continue;
+
+            ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
+            drm_exec_retry_on_contention(&exec);
+            if (unlikely(ret))
+                goto unlock_all;
+        }
+
+        /*
+         * Lock the invalidated list in the same way, again skipping
+         * entries without a BO.
+         */
+        list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) {
+            bo = bo_va->base.bo;
+            if (!bo)
+                continue;
+
+            ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
+            drm_exec_retry_on_contention(&exec);
+            if (unlikely(ret))
+                goto unlock_all;
+        }
+    }
+
+    /*
+     * Now validate BOs: walk the invalidated list once more and stop at
+     * the first BO that fails to validate.
+     */
+    list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) {
+        bo = bo_va->base.bo;
+        if (!bo)
+            continue;
+
+        ret = amdgpu_userqueue_validate_vm_bo(NULL, bo);
+        if (ret) {
+            DRM_ERROR("Failed to validate BO\n");
+            goto unlock_all;
+        }
+    }
+
+    /* Handle the moved BOs, passing the ticket of the exec context used above */
+    ret = amdgpu_vm_handle_moved(uq_mgr->adev, vm, &exec.ticket);
+    if (ret) {
+        DRM_ERROR("Failed to handle moved BOs\n");
+        goto unlock_all;
+    }
+
+    /* A failure here is only logged; ret is still handed back to the caller. */
+    ret = amdgpu_eviction_fence_replace_fence(fpriv);
+    if (ret)
+        DRM_ERROR("Failed to replace eviction fence\n");
+
+unlock_all:
+    /* Shared exit: reached on success and from every error path above. */
+    drm_exec_fini(&exec);
+    return ret;
+}
+
+/*
+ * Delayed-work handler that restores the user queues of one userq manager.
+ *
+ * With userq_mutex held, the BOs are validated first and the queues are
+ * resumed only if that succeeded. A failure of either step is logged and the
+ * worker returns without retrying.
+ */
+static void amdgpu_userqueue_resume_worker(struct work_struct *work)
+{
+    struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work);
+
+    mutex_lock(&uq_mgr->userq_mutex);
+
+    if (amdgpu_userqueue_validate_bos(uq_mgr))
+        DRM_ERROR("Failed to validate BOs to restore\n");
+    else if (amdgpu_userqueue_resume_all(uq_mgr))
+        DRM_ERROR("Failed to resume all queues\n");
+
+    mutex_unlock(&uq_mgr->userq_mutex);
+}
+
  static int
  amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr)
  {
@@ -486,6 +602,9 @@ amdgpu_userqueue_suspend_worker(struct work_struct *work)
      /* Cleanup old eviction fence entry */
      amdgpu_eviction_fence_destroy(evf_mgr);
  +    /* Schedule a work to restore userqueue */
+    schedule_delayed_work(&uq_mgr->resume_work, 0);
+
  unlock:
      mutex_unlock(&uq_mgr->userq_mutex);
  }
@@ -508,6 +627,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_devi
      /* This reference is required for suspend work */
      fpriv->evf_mgr.ev_fence->uq_mgr = userq_mgr;
      INIT_DELAYED_WORK(&userq_mgr->suspend_work, amdgpu_userqueue_suspend_worker);
+    INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userqueue_resume_worker);
      return 0;
  }
  diff --git a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
index 8b3b50fa8b5b..d035b5c2b14b 100644
--- a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
+++ b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h
@@ -76,6 +76,7 @@ struct amdgpu_userq_mgr {
      struct amdgpu_device        *adev;
        struct delayed_work        suspend_work;
+    struct delayed_work        resume_work;
      int num_userqs;
  };




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux