It has been observed that task_info struct makes it difficult to handle amdgpu_vm during a GPU reset, due to it's parameters like task_name and process name. This patch: - removes task_info struct from amdgpu_vm and moves it into vm_mgr as an Xarray. - creates a PID vs task_info mapping to index task_info from this Xarray (similar to how it is being done for PASID to vm indexing). - adds additional changes to support these changes. Cc: Christian Koenig <christian.koenig@xxxxxxx> Cc: Alex Deucher <alexander.deucher@xxxxxxx> Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx> --- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 5 +- drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 64 ++++++++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 12 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 4 +- 9 files changed, 77 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index d34c3ef8f3ed..8568ced570c8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1539,7 +1539,7 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, if (ret) return ret; - amdgpu_vm_set_task_info(avm); + amdgpu_vm_set_task_info(adev, avm); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index fb78a8f47587..24b9a841db54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -316,7 +316,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, kvfree(chunk_array); /* Use this opportunity to fill in task info for the vm */ - amdgpu_vm_set_task_info(vm); + amdgpu_vm_set_task_info(p->adev, vm); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 56e89e76ff17..cd8aef1135e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1762,9 +1762,10 @@ static int amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused) list_for_each_entry(file, &dev->filelist, lhead) { struct amdgpu_fpriv *fpriv = file->driver_priv; struct amdgpu_vm *vm = &fpriv->vm; + struct amdgpu_task_info ti = {0, }; - seq_printf(m, "pid:%d\tProcess:%s ----------\n", - vm->task_info.pid, vm->task_info.process_name); + amdgpu_vm_get_task_info(adev, vm->pasid, &ti); + seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti.pid, ti.process_name); r = amdgpu_bo_reserve(vm->root.bo, true); if (r) break; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 6238701cde23..44fb16eba749 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5192,8 +5192,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, memset(&tmp_adev->reset_task_info, 0, sizeof(tmp_adev->reset_task_info)); if (reset_context->job && reset_context->job->vm) - tmp_adev->reset_task_info = - reset_context->job->vm->task_info; + amdgpu_vm_get_task_info(tmp_adev, + reset_context->job->vm->pasid, + &tmp_adev->reset_task_info); amdgpu_reset_capture_coredumpm(tmp_adev); #endif if (vram_lost) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 78476bc75b4e..3985b9b10b46 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -58,7 +58,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) goto exit; } - amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti); + amdgpu_vm_get_task_info(ring->adev, job->vm->pasid, &ti); DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n", job->base.sched->name, atomic_read(&ring->fence_drv.last_seq), ring->fence_drv.sync_seq); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 12414a713256..cb3bde5ce682 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1329,8 +1329,10 @@ void amdgpu_driver_postclose_kms(struct drm_device *dev, amdgpu_ctx_mgr_fini(&fpriv->ctx_mgr); amdgpu_vm_fini(adev, &fpriv->vm); - if (pasid) + if (pasid) { + amdgpu_vm_free_task_info(adev, fpriv->vm.pid); amdgpu_pasid_free_delayed(pd->tbo.base.resv, pasid); + } amdgpu_bo_unref(&pd); idr_for_each_entry(&fpriv->bo_list_handles, list, handle) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index ec1ec08d4058..95bea7690bb5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -161,7 +161,6 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct amdgpu_vm *vm, vm->pasid = pasid; } - return 0; } @@ -2439,6 +2438,7 @@ void amdgpu_vm_manager_init(struct amdgpu_device *adev) #endif xa_init_flags(&adev->vm_manager.pasids, XA_FLAGS_LOCK_IRQ); + xa_init_flags(&adev->vm_manager.tasks, XA_FLAGS_LOCK_IRQ); } /** @@ -2453,6 +2453,9 @@ void amdgpu_vm_manager_fini(struct amdgpu_device *adev) WARN_ON(!xa_empty(&adev->vm_manager.pasids)); xa_destroy(&adev->vm_manager.pasids); + WARN_ON(!xa_empty(&adev->vm_manager.tasks)); + xa_destroy(&adev->vm_manager.tasks); + amdgpu_vmid_mgr_fini(adev); } @@ -2498,6 +2501,20 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) return 0; } +/** + * amdgpu_vm_free_task_info - Finds and frees task_info ptr + * + * @adev: drm device pointer + * @pid: pid of the process in vm + */ +void amdgpu_vm_free_task_info(struct amdgpu_device *adev, u32 pid) +{ + struct amdgpu_task_info *task_info; + + task_info = xa_load(&adev->vm_manager.tasks, pid); + kfree(task_info); +} + /** * amdgpu_vm_get_task_info - Extracts task info for a PASID. * @@ -2508,14 +2525,18 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid, struct amdgpu_task_info *task_info) { + struct amdgpu_task_info *ti; struct amdgpu_vm *vm; unsigned long flags; xa_lock_irqsave(&adev->vm_manager.pasids, flags); vm = xa_load(&adev->vm_manager.pasids, pasid); - if (vm) - *task_info = vm->task_info; + if (vm) { + ti = xa_load(&adev->vm_manager.tasks, vm->pid); + if (ti) + *task_info = *ti; + } xa_unlock_irqrestore(&adev->vm_manager.pasids, flags); } @@ -2523,21 +2544,44 @@ void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid, /** * amdgpu_vm_set_task_info - Sets VMs task info. * + * @adev: drm device pointer * @vm: vm for which to set the info */ -void amdgpu_vm_set_task_info(struct amdgpu_vm *vm) +void amdgpu_vm_set_task_info(struct amdgpu_device *adev, struct amdgpu_vm *vm) { - if (vm->task_info.pid) + struct amdgpu_task_info *task_info, *old; + + if (!vm->pasid) return; - vm->task_info.pid = current->pid; - get_task_comm(vm->task_info.task_name, current); + if (vm->pid == current->pid) + return; - if (current->group_leader->mm != current->mm) + task_info = kzalloc(sizeof(struct amdgpu_task_info), GFP_KERNEL); + if (!task_info) { + DRM_ERROR("OOM while setting task info\n"); return; + } + + task_info->pid = current->pid; + get_task_comm(task_info->task_name, current); + + if (current->group_leader->mm == current->mm) { + task_info->tgid = current->group_leader->pid; + get_task_comm(task_info->process_name, current->group_leader); + } + + /* Replace and free if there is an existing task_info entry */ + old = xa_store_irq(&adev->vm_manager.tasks, task_info->pid, task_info, GFP_KERNEL); + if (xa_err(old) < 0) { + DRM_WARN("Failed to update task info, pid=0x%x pasid=0x%x\n", + task_info->pid, vm->pasid); + kfree(task_info); + return; + } - vm->task_info.tgid = current->group_leader->pid; - get_task_comm(vm->task_info.process_name, current->group_leader); + kfree(old); + vm->pid = task_info->pid; } /** diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h index ffac7413c657..33f333d864b1 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h @@ -302,6 +302,7 @@ struct amdgpu_vm { struct dma_fence *last_unlocked; unsigned int pasid; + unsigned int pid; bool reserved_vmid[AMDGPU_MAX_VMHUBS]; /* Flag to indicate if VM tables are updated by CPU or GPU (SDMA) */ @@ -325,9 +326,6 @@ struct amdgpu_vm { /* Valid while the PD is reserved or fenced */ uint64_t pd_phys_addr; - /* Some basic info about the task */ - struct amdgpu_task_info task_info; - /* Store positions of group of BOs */ struct ttm_lru_bulk_move lru_bulk_move; /* Flag to indicate if VM is used for compute */ @@ -374,6 +372,11 @@ struct amdgpu_vm_manager { * look up VM of a page fault */ struct xarray pasids; + + /* PID to task_info mapping, will be used in reset context to + * look up PID/TID of a reset + */ + struct xarray tasks; }; struct amdgpu_bo_va_mapping; @@ -465,7 +468,8 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, u32 vmid, u32 node_id, uint64_t addr, bool write_fault); -void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); +void amdgpu_vm_set_task_info(struct amdgpu_device *adev, struct amdgpu_vm *vm); +void amdgpu_vm_free_task_info(struct amdgpu_device *adev, u32 pid); void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, struct amdgpu_vm *vm); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c index 5431332bbdb8..3704556210d8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c @@ -992,16 +992,18 @@ int amdgpu_vm_ptes_update(struct amdgpu_vm_update_params *params, uint64_t upd_end = min(entry_end, frag_end); unsigned int nptes = (upd_end - frag_start) >> shift; uint64_t upd_flags = flags | AMDGPU_PTE_FRAG(frag); + struct amdgpu_task_info ti = {0, }; /* This can happen when we set higher level PDs to * silent to stop fault floods. */ nptes = max(nptes, 1u); + amdgpu_vm_get_task_info(adev, vm->pasid, &ti); trace_amdgpu_vm_update_ptes(params, frag_start, upd_end, min(nptes, 32u), dst, incr, upd_flags, - vm->task_info.tgid, + ti.tgid, vm->immediate.fence_context); amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt), cursor.level, pe_start, dst, -- 2.42.0