Currently, all kfd BOs use same destruction routine. But pinned BOs are not unpinned properly. Separate them from general routine. Signed-off-by: Lang Yu <lang.yu@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 2 + .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 10 ++ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 3 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 3 + drivers/gpu/drm/amd/amdkfd/kfd_process.c | 125 ++++++++++++++---- 5 files changed, 114 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 69de31754907..751557af09bb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -279,6 +279,8 @@ int amdgpu_amdkfd_gpuvm_sync_memory( struct kgd_dev *kgd, struct kgd_mem *mem, bool intr); int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, struct kgd_mem *mem, void **kptr, uint64_t *size); +void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_dev *kgd, struct kgd_mem *mem); + int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info, struct dma_fence **ef); int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index 054c1a224def..6acc78b02bdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -1871,6 +1871,16 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_dev *kgd, return ret; } +void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_dev *kgd, struct kgd_mem *mem) +{ + struct amdgpu_bo *bo = mem->bo; + + amdgpu_bo_reserve(bo, true); + amdgpu_bo_kunmap(bo); + amdgpu_bo_unpin(bo); + amdgpu_bo_unreserve(bo); +} + int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct kgd_dev *kgd, struct kfd_vm_fault_info *mem) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f1e7edeb4e6b..0db48ac10fde 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1051,6 +1051,9 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, pr_err("Failed to set event page\n"); return err; } + + p->signal_handle = args->event_page_offset; + } err = kfd_event_create(filp, p, args->event_type, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 6d8f9bb2d905..30f08f1606bb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -608,12 +608,14 @@ struct qcm_process_device { uint32_t sh_hidden_private_base; /* CWSR memory */ + struct kgd_mem *cwsr_mem; void *cwsr_kaddr; uint64_t cwsr_base; uint64_t tba_addr; uint64_t tma_addr; /* IB memory */ + struct kgd_mem *ib_mem; uint64_t ib_base; void *ib_kaddr; @@ -808,6 +810,7 @@ struct kfd_process { /* Event ID allocator and lookup */ struct idr event_idr; /* Event page */ + u64 signal_handle; struct kfd_signal_page *signal_page; size_t signal_mapped_size; size_t signal_event_count; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 21ec8a18cad2..c024f2e2efaa 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -72,6 +72,8 @@ static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep); static void evict_process_worker(struct work_struct *work); static void restore_process_worker(struct work_struct *work); +static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd); + struct kfd_procfs_tree { struct kobject *kobj; }; @@ -685,10 +687,15 @@ void kfd_process_destroy_wq(void) } static void kfd_process_free_gpuvm(struct kgd_mem *mem, - struct kfd_process_device *pdd) + struct kfd_process_device *pdd, void *kptr) { struct kfd_dev *dev = pdd->dev; + if (kptr) { + amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(dev->kgd, mem); + kptr = NULL; + } + amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(dev->kgd, mem, pdd->drm_priv); amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd, mem, pdd->drm_priv, NULL); @@ -702,63 +709,46 @@ static void kfd_process_free_gpuvm(struct kgd_mem *mem, */ static int kfd_process_alloc_gpuvm(struct kfd_process_device *pdd, uint64_t gpu_va, uint32_t size, - uint32_t flags, void **kptr) + uint32_t flags, struct kgd_mem **mem, void **kptr) { struct kfd_dev *kdev = pdd->dev; - struct kgd_mem *mem = NULL; - int handle; int err; err = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(kdev->kgd, gpu_va, size, - pdd->drm_priv, &mem, NULL, flags); + pdd->drm_priv, mem, NULL, flags); if (err) goto err_alloc_mem; - err = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(kdev->kgd, mem, + err = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(kdev->kgd, *mem, pdd->drm_priv, NULL); if (err) goto err_map_mem; - err = amdgpu_amdkfd_gpuvm_sync_memory(kdev->kgd, mem, true); + err = amdgpu_amdkfd_gpuvm_sync_memory(kdev->kgd, *mem, true); if (err) { pr_debug("Sync memory failed, wait interrupted by user signal\n"); goto sync_memory_failed; } - /* Create an obj handle so kfd_process_device_remove_obj_handle - * will take care of the bo removal when the process finishes. - * We do not need to take p->mutex, because the process is just - * created and the ioctls have not had the chance to run. - */ - handle = kfd_process_device_create_obj_handle(pdd, mem); - - if (handle < 0) { - err = handle; - goto free_gpuvm; - } - if (kptr) { err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(kdev->kgd, - (struct kgd_mem *)mem, kptr, NULL); + (struct kgd_mem *)*mem, kptr, NULL); if (err) { pr_debug("Map GTT BO to kernel failed\n"); - goto free_obj_handle; + goto sync_memory_failed; } } return err; -free_obj_handle: - kfd_process_device_remove_obj_handle(pdd, handle); -free_gpuvm: sync_memory_failed: - kfd_process_free_gpuvm(mem, pdd); - return err; + amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(kdev->kgd, *mem, pdd->drm_priv); err_map_mem: - amdgpu_amdkfd_gpuvm_free_memory_of_gpu(kdev->kgd, mem, pdd->drm_priv, + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(kdev->kgd, *mem, pdd->drm_priv, NULL); err_alloc_mem: + *mem = NULL; *kptr = NULL; return err; } @@ -776,6 +766,7 @@ static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd) KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE | KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE; + struct kgd_mem *mem; void *kaddr; int ret; @@ -784,15 +775,26 @@ static int kfd_process_device_reserve_ib_mem(struct kfd_process_device *pdd) /* ib_base is only set for dGPU */ ret = kfd_process_alloc_gpuvm(pdd, qpd->ib_base, PAGE_SIZE, flags, - &kaddr); + &mem, &kaddr); if (ret) return ret; + qpd->ib_mem = mem; qpd->ib_kaddr = kaddr; return 0; } +static void kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd) +{ + struct qcm_process_device *qpd = &pdd->qpd; + + if (!qpd->ib_kaddr || !qpd->ib_base) + return; + + kfd_process_free_gpuvm(qpd->ib_mem, pdd, qpd->ib_kaddr); +} + struct kfd_process *kfd_create_process(struct file *filep) { struct kfd_process *process; @@ -947,6 +949,52 @@ static void kfd_process_device_free_bos(struct kfd_process_device *pdd) } } +static void kfd_process_free_signal_bo(struct kfd_process *p) +{ + struct kfd_process_device *pdd; + struct kfd_dev *kdev; + void *mem; + int i; + + kdev = kfd_device_by_id(GET_GPU_ID(p->signal_handle)); + if (!kdev) + return; + + mutex_lock(&p->mutex); + + pdd = kfd_get_process_device_data(kdev, p); + if (!pdd) { + mutex_unlock(&p->mutex); + return; + } + + mem = kfd_process_device_translate_handle( + pdd, GET_IDR_HANDLE(p->signal_handle)); + if (!mem) { + mutex_unlock(&p->mutex); + return; + } + + mutex_unlock(&p->mutex); + + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *peer_pdd = p->pdds[i]; + + if (!peer_pdd->drm_priv) + continue; + amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( + peer_pdd->dev->kgd, mem, peer_pdd->drm_priv); + } + + amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(kdev->kgd, mem); + + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(kdev->kgd, mem, + pdd->drm_priv, NULL); + + kfd_process_device_remove_obj_handle(pdd, + GET_IDR_HANDLE(p->signal_handle)); +} + static void kfd_process_free_outstanding_kfd_bos(struct kfd_process *p) { int i; @@ -965,6 +1013,9 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) pr_debug("Releasing pdd (topology id %d) for process (pasid 0x%x)\n", pdd->dev->id, p->pasid); + kfd_process_device_destroy_cwsr_dgpu(pdd); + kfd_process_device_destroy_ib_mem(pdd); + if (pdd->drm_file) { amdgpu_amdkfd_gpuvm_release_process_vm( pdd->dev->kgd, pdd->drm_priv); @@ -1049,9 +1100,11 @@ static void kfd_process_wq_release(struct work_struct *work) { struct kfd_process *p = container_of(work, struct kfd_process, release_work); + kfd_process_remove_sysfs(p); kfd_iommu_unbind_process(p); + kfd_process_free_signal_bo(p); kfd_process_free_outstanding_kfd_bos(p); svm_range_list_fini(p); @@ -1066,6 +1119,7 @@ static void kfd_process_wq_release(struct work_struct *work) put_task_struct(p->lead_thread); kfree(p); + } static void kfd_process_ref_release(struct kref *ref) @@ -1198,6 +1252,7 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd) uint32_t flags = KFD_IOC_ALLOC_MEM_FLAGS_GTT | KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE | KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE; + struct kgd_mem *mem; void *kaddr; int ret; @@ -1206,10 +1261,11 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd) /* cwsr_base is only set for dGPU */ ret = kfd_process_alloc_gpuvm(pdd, qpd->cwsr_base, - KFD_CWSR_TBA_TMA_SIZE, flags, &kaddr); + KFD_CWSR_TBA_TMA_SIZE, flags, &mem, &kaddr); if (ret) return ret; + qpd->cwsr_mem = mem; qpd->cwsr_kaddr = kaddr; qpd->tba_addr = qpd->cwsr_base; @@ -1222,6 +1278,17 @@ static int kfd_process_device_init_cwsr_dgpu(struct kfd_process_device *pdd) return 0; } +static void kfd_process_device_destroy_cwsr_dgpu(struct kfd_process_device *pdd) +{ + struct kfd_dev *dev = pdd->dev; + struct qcm_process_device *qpd = &pdd->qpd; + + if (!dev->cwsr_enabled || !qpd->cwsr_kaddr || !qpd->cwsr_base) + return; + + kfd_process_free_gpuvm(qpd->cwsr_mem, pdd, qpd->cwsr_kaddr); +} + void kfd_process_set_trap_handler(struct qcm_process_device *qpd, uint64_t tba_addr, uint64_t tma_addr) -- 2.25.1