On 17/09/2024 14:30, Christian König wrote:
Am 09.09.24 um 22:06 schrieb Shashank Sharma:
This patch adds support for userqueue resume. What it typically does is this:
- adds a new delayed work for resuming all the queues.
- schedules this delayed work from the suspend work.
- validates the BOs and replaces the eviction fence before resuming all the queues running under this instance of userq manager.

V2: Addressed Christian's review comments:
- declare local variables like ret at the bottom.
- lock all the object first, then start attaching the new fence.
- dont replace old eviction fence, just attach new eviction fence.
- no error logs for drm_exec_lock failures
- no need to reserve bos after drm_exec_locked
- schedule the resume worker immediately (not after 100 ms)
- check for NULL BO (Arvind)

Cc: Alex Deucher <alexander.deucher@xxxxxxx>
Cc: Christian Koenig <christian.koenig@xxxxxxx>
Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx>
Signed-off-by: Arvind Yadav <arvind.yadav@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c | 120 ++++++++++++++++++
.../gpu/drm/amd/include/amdgpu_userqueue.h | 1 +
2 files changed, 121 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c index 979174f80993..e7f7354e0c0e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userqueue.c @@ -405,6 +405,122 @@ int amdgpu_userq_ioctl(struct drm_device *dev, void *data, return r; } +static int +amdgpu_userqueue_resume_all(struct amdgpu_userq_mgr *uq_mgr) +{ + struct amdgpu_device *adev = uq_mgr->adev; + const struct amdgpu_userq_funcs *userq_funcs; + struct amdgpu_usermode_queue *queue; + int queue_id, ret; + + userq_funcs = adev->userq_funcs[AMDGPU_HW_IP_GFX]; + + /* Resume all the queues for this process */ + idr_for_each_entry(&uq_mgr->userq_idr, queue, queue_id) { + ret = userq_funcs->resume(uq_mgr, queue); + if (ret) + DRM_ERROR("Failed to resume queue %d\n", queue_id); + } + + return ret; +} + +static 
int +amdgpu_userqueue_validate_bos(struct amdgpu_userq_mgr *uq_mgr) +{ + struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); + struct amdgpu_vm *vm = &fpriv->vm; + struct amdgpu_bo_va *bo_va, *tmp; + struct drm_exec exec; + struct amdgpu_bo *bo; + int ret; + + drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0); + drm_exec_until_all_locked(&exec) { + ret = amdgpu_vm_lock_pd(vm, &exec, 2); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) { + DRM_ERROR("Failed to lock PD\n");

I would drop those error messages in the low-level function.

The most likely cause (except for contention) why locking a BO fails is because we were interrupted, and for that we actually don't want to print anything.

Apart from that I really need to wrap my head around the VM code once more, but that here should probably work for now.
Noted, I will remove the error message. - Shashank
Regards, Christian.

+ goto unlock_all; + } + + /* Lock the done list */ + list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status) { + bo = bo_va->base.bo; + if (!bo) + continue; + + ret = drm_exec_lock_obj(&exec, &bo->tbo.base); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) + goto unlock_all; + } + + /* Lock the invalidated list */ + list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) { + bo = bo_va->base.bo; + if (!bo) + continue; + + ret = drm_exec_lock_obj(&exec, &bo->tbo.base); + drm_exec_retry_on_contention(&exec); + if (unlikely(ret)) + goto unlock_all; + } + } + + /* Now validate BOs */ + list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status) { + bo = bo_va->base.bo; + if (!bo) + continue; + + ret = amdgpu_userqueue_validate_vm_bo(NULL, bo); + if (ret) { + DRM_ERROR("Failed to validate BO\n"); + goto unlock_all; + } + } + + /* Handle the moved BOs */ + ret = amdgpu_vm_handle_moved(uq_mgr->adev, vm, &exec.ticket); + if (ret) { + DRM_ERROR("Failed to handle moved BOs\n"); + goto unlock_all; + } + + ret = amdgpu_eviction_fence_replace_fence(fpriv); + if (ret) + DRM_ERROR("Failed to replace eviction fence\n"); + +unlock_all: + drm_exec_fini(&exec); + return ret; +} + +static void amdgpu_userqueue_resume_worker(struct work_struct *work) +{ + struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work); + int ret; + + mutex_lock(&uq_mgr->userq_mutex); + + ret = amdgpu_userqueue_validate_bos(uq_mgr); + if (ret) { + DRM_ERROR("Failed to validate BOs to restore\n"); + goto unlock; + } + + ret = amdgpu_userqueue_resume_all(uq_mgr); + if (ret) { + DRM_ERROR("Failed to resume all queues\n"); + goto unlock; + } + +unlock: + mutex_unlock(&uq_mgr->userq_mutex); +} + static int amdgpu_userqueue_suspend_all(struct amdgpu_userq_mgr *uq_mgr) { @@ -486,6 +602,9 @@ amdgpu_userqueue_suspend_worker(struct work_struct *work) /* Cleanup old eviction fence entry */ amdgpu_eviction_fence_destroy(evf_mgr); + /* Schedule a work 
to restore userqueue */ + schedule_delayed_work(&uq_mgr->resume_work, 0); + unlock: mutex_unlock(&uq_mgr->userq_mutex); }@@ -508,6 +627,7 @@ int amdgpu_userq_mgr_init(struct amdgpu_userq_mgr *userq_mgr, struct amdgpu_devi/* This reference is required for suspend work */ fpriv->evf_mgr.ev_fence->uq_mgr = userq_mgr;INIT_DELAYED_WORK(&userq_mgr->suspend_work, amdgpu_userqueue_suspend_worker); + INIT_DELAYED_WORK(&userq_mgr->resume_work, amdgpu_userqueue_resume_worker);return 0; }diff --git a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h b/drivers/gpu/drm/amd/include/amdgpu_userqueue.hindex 8b3b50fa8b5b..d035b5c2b14b 100644 --- a/drivers/gpu/drm/amd/include/amdgpu_userqueue.h +++ b/drivers/gpu/drm/amd/include/amdgpu_userqueue.h @@ -76,6 +76,7 @@ struct amdgpu_userq_mgr { struct amdgpu_device *adev; struct delayed_work suspend_work; + struct delayed_work resume_work; int num_userqs; };