When doing a GPU reset, block all kfd functions that may access hardware,
including the kfd ioctls and the file close function.

v2: fix a potential recursive locking issue. kfd_ioctl_dbg_register can call
into pqm_create_queue, which would cause recursive locking. So remove the
read_lock from the process queue manager and take the read_lock in the
related ioctls instead.

v3: put pqm_query_dev_by_qid under the protection of p->mutex

Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx>
Acked-by: Christian König <christian.koenig@xxxxxxx>

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 6802c616e10e..283ba9435233 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -40,6 +40,7 @@
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
 #include "kfd_smi_events.h"
+#include "amdgpu.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -298,6 +299,9 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 	}
 
 	mutex_lock(&p->mutex);
+	err = amdgpu_read_lock(dev->ddev, true);
+	if (err)
+		goto err_read_lock;
 
 	pdd = kfd_bind_process_to_device(dev, p);
 	if (IS_ERR(pdd)) {
@@ -326,6 +330,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 	 */
 	args->doorbell_offset |= doorbell_offset_in_process;
 
+	amdgpu_read_unlock(dev->ddev);
 	mutex_unlock(&p->mutex);
 
 	pr_debug("Queue id %d was created successfully\n", args->queue_id);
@@ -343,6 +348,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
 
 err_create_queue:
 err_bind_process:
+	amdgpu_read_unlock(dev->ddev);
+err_read_lock:
 	mutex_unlock(&p->mutex);
 	return err;
 }
@@ -352,6 +359,7 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
 {
 	int retval;
 	struct kfd_ioctl_destroy_queue_args *args = data;
+	struct kfd_dev *dev;
 
 	pr_debug("Destroying queue id %d for pasid 0x%x\n",
 			args->queue_id,
@@ -359,8 +367,20 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
 
 	mutex_lock(&p->mutex);
 
+	dev = pqm_query_dev_by_qid(&p->pqm, args->queue_id);
+	if (!dev) {
+		retval = -EINVAL;
+		goto err_query_dev;
+	}
+
+	retval = amdgpu_read_lock(dev->ddev, true);
+	if (retval)
+		goto err_read_lock;
 	retval = pqm_destroy_queue(&p->pqm, args->queue_id);
 
+	amdgpu_read_unlock(dev->ddev);
+err_read_lock:
+err_query_dev:
 	mutex_unlock(&p->mutex);
 	return retval;
 }
@@ -371,6 +391,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
 	int retval;
 	struct kfd_ioctl_update_queue_args *args = data;
 	struct queue_properties properties;
+	struct kfd_dev *dev;
 
 	if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) {
 		pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n");
@@ -404,10 +425,21 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
 
 	mutex_lock(&p->mutex);
 
+	dev = pqm_query_dev_by_qid(&p->pqm, args->queue_id);
+	if (!dev) {
+		retval = -EINVAL;
+		goto err_query_dev;
+	}
+
+	retval = amdgpu_read_lock(dev->ddev, true);
+	if (retval)
+		goto err_read_lock;
 	retval = pqm_update_queue(&p->pqm, args->queue_id, &properties);
 
+	amdgpu_read_unlock(dev->ddev);
+err_read_lock:
+err_query_dev:
 	mutex_unlock(&p->mutex);
-
 	return retval;
 }
@@ -420,6 +452,7 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
 	struct queue_properties properties;
 	uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr;
 	size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32);
+	struct kfd_dev *dev;
 
 	if ((args->num_cu_mask % 32) != 0) {
 		pr_debug("num_cu_mask 0x%x must be a multiple of 32",
@@ -456,8 +489,20 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
 
 	mutex_lock(&p->mutex);
 
+	dev = pqm_query_dev_by_qid(&p->pqm, args->queue_id);
+	if (!dev) {
+		retval = -EINVAL;
+		goto err_query_dev;
+	}
+
+	retval = amdgpu_read_lock(dev->ddev, true);
+	if (retval)
+		goto err_read_lock;
 	retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties);
 
+	amdgpu_read_unlock(dev->ddev);
+err_read_lock:
+err_query_dev:
 	mutex_unlock(&p->mutex);
 
 	if (retval)
@@ -471,14 +516,27 @@ static int kfd_ioctl_get_queue_wave_state(struct file *filep,
 {
 	struct kfd_ioctl_get_queue_wave_state_args *args = data;
 	int r;
+	struct kfd_dev *dev;
 
 	mutex_lock(&p->mutex);
 
+	dev = pqm_query_dev_by_qid(&p->pqm, args->queue_id);
+	if (!dev) {
+		r = -EINVAL;
+		goto err_query_dev;
+	}
+
+	r = amdgpu_read_lock(dev->ddev, true);
+	if (r)
+		goto err_read_lock;
 	r = pqm_get_wave_state(&p->pqm, args->queue_id,
 			(void __user *)args->ctl_stack_address,
 			&args->ctl_stack_used_size,
 			&args->save_area_used_size);
 
+	amdgpu_read_unlock(dev->ddev);
+err_read_lock:
+err_query_dev:
 	mutex_unlock(&p->mutex);
 
 	return r;
@@ -509,6 +567,10 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
 
 	mutex_lock(&p->mutex);
 
+	err = amdgpu_read_lock(dev->ddev, true);
+	if (err)
+		goto err_read_lock;
+
 	pdd = kfd_bind_process_to_device(dev, p);
 	if (IS_ERR(pdd)) {
 		err = -ESRCH;
@@ -531,6 +593,9 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
 		err = -EINVAL;
 
 out:
+	amdgpu_read_unlock(dev->ddev);
+
+err_read_lock:
 	mutex_unlock(&p->mutex);
 
 	return err;
@@ -550,6 +615,10 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
 
 	mutex_lock(&p->mutex);
 
+	err = amdgpu_read_lock(dev->ddev, true);
+	if (err)
+		goto err_read_lock;
+
 	pdd = kfd_bind_process_to_device(dev, p);
 	if (IS_ERR(pdd)) {
 		err = -ESRCH;
@@ -559,6 +628,9 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
 	kfd_process_set_trap_handler(&pdd->qpd, args->tba_addr, args->tma_addr);
 
 out:
+	amdgpu_read_unlock(dev->ddev);
+
+err_read_lock:
 	mutex_unlock(&p->mutex);
 
 	return err;
@@ -584,6 +656,11 @@ static int kfd_ioctl_dbg_register(struct file *filep,
 	}
 
 	mutex_lock(&p->mutex);
+
+	status = amdgpu_read_lock(dev->ddev, true);
+	if (status)
+		goto err_read_lock;
+
 	mutex_lock(kfd_get_dbgmgr_mutex());
 
 	/*
@@ -613,6 +690,9 @@ static int kfd_ioctl_dbg_register(struct file *filep,
 
 out:
 	mutex_unlock(kfd_get_dbgmgr_mutex());
+	amdgpu_read_unlock(dev->ddev);
+
+err_read_lock:
 	mutex_unlock(&p->mutex);
 
 	return status;
@@ -634,6 +714,10 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
 		return -EINVAL;
 	}
 
+	status = amdgpu_read_lock(dev->ddev, true);
+	if (status)
+		return status;
+
 	mutex_lock(kfd_get_dbgmgr_mutex());
 
 	status = kfd_dbgmgr_unregister(dev->dbgmgr, p);
@@ -644,6 +728,8 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
 
 	mutex_unlock(kfd_get_dbgmgr_mutex());
 
+	amdgpu_read_unlock(dev->ddev);
+
 	return status;
 }
@@ -743,15 +829,19 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep,
 	/* Currently HSA Event is not supported for DBG */
 	aw_info.watch_event = NULL;
 
+	status = amdgpu_read_lock(dev->ddev, true);
+	if (status)
+		goto out;
+
 	mutex_lock(kfd_get_dbgmgr_mutex());
 
 	status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info);
 
 	mutex_unlock(kfd_get_dbgmgr_mutex());
 
+	amdgpu_read_unlock(dev->ddev);
 out:
 	kfree(args_buff);
-
 	return status;
 }
@@ -822,6 +912,10 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep,
 				*((uint32_t *)(&args_buff[args_idx]));
 	wac_info.dbgWave_msg.MemoryVA = NULL;
 
+	status = amdgpu_read_lock(dev->ddev, true);
+	if (status)
+		goto pro_end;
+
 	mutex_lock(kfd_get_dbgmgr_mutex());
 
 	pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n",
@@ -835,6 +929,9 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep,
 
 	mutex_unlock(kfd_get_dbgmgr_mutex());
 
+	amdgpu_read_unlock(dev->ddev);
+
+pro_end:
 	kfree(args_buff);
 
 	return status;
@@ -847,10 +944,11 @@ static int kfd_ioctl_get_clock_counters(struct file *filep,
 	struct kfd_dev *dev;
 
 	dev = kfd_device_by_id(args->gpu_id);
-	if (dev)
+	if (dev && !amdgpu_read_lock(dev->ddev, true)) {
 		/* Reading GPU clock counter from KGD */
 		args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(dev->kgd);
-	else
+		amdgpu_read_unlock(dev->ddev);
+	} else
 		/* Node without GPU resource */
 		args->gpu_clock_counter = 0;
@@ -1056,13 +1154,20 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
 		}
 		mutex_unlock(&p->mutex);
 
+		err = amdgpu_read_lock(kfd->ddev, true);
+		if (err)
+			return err;
+
 		err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(kfd->kgd,
 						mem, &kern_addr, &size);
 		if (err) {
 			pr_err("Failed to map event page to kernel\n");
+			amdgpu_read_unlock(kfd->ddev);
 			return err;
 		}
 
+		amdgpu_read_unlock(kfd->ddev);
+
 		err = kfd_event_page_set(p, kern_addr, size);
 		if (err) {
 			pr_err("Failed to set event page\n");
@@ -1144,11 +1249,17 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
 
 	mutex_unlock(&p->mutex);
 
+	err = amdgpu_read_lock(dev->ddev, true);
+	if (err)
+		return err;
+
 	if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
 	    pdd->qpd.vmid != 0 && dev->kfd2kgd->set_scratch_backing_va)
 		dev->kfd2kgd->set_scratch_backing_va(
 			dev->kgd, args->va_addr, pdd->qpd.vmid);
 
+	amdgpu_read_unlock(dev->ddev);
+
 	return 0;
 
 bind_process_to_device_fail:
@@ -1217,6 +1328,10 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
 
 	mutex_lock(&p->mutex);
 
+	ret = amdgpu_read_lock(dev->ddev, true);
+	if (ret)
+		goto err_read_lock;
+
 	pdd = kfd_get_process_device_data(dev, p);
 	if (!pdd) {
 		ret = -EINVAL;
@@ -1231,12 +1346,16 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
 	ret = kfd_process_device_init_vm(pdd, drm_file);
 	if (ret)
 		goto err_unlock;
+
+	amdgpu_read_unlock(dev->ddev);
 	/* On success, the PDD keeps the drm_file reference */
 	mutex_unlock(&p->mutex);
 
 	return 0;
 
 err_unlock:
+	amdgpu_read_unlock(dev->ddev);
+err_read_lock:
 	mutex_unlock(&p->mutex);
 	fput(drm_file);
 	return ret;
@@ -1289,6 +1408,10 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 
 	mutex_lock(&p->mutex);
 
+	err = amdgpu_read_lock(dev->ddev, true);
+	if (err)
+		goto err_read_lock;
+
 	pdd = kfd_bind_process_to_device(dev, p);
 	if (IS_ERR(pdd)) {
 		err = PTR_ERR(pdd);
@@ -1331,6 +1454,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 	if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
 		WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + args->size);
 
+	amdgpu_read_unlock(dev->ddev);
 	mutex_unlock(&p->mutex);
 
 	args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
@@ -1348,6 +1472,8 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
 err_free:
 	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem, NULL);
 err_unlock:
+	amdgpu_read_unlock(dev->ddev);
+err_read_lock:
 	mutex_unlock(&p->mutex);
 	return err;
 }
@@ -1368,6 +1494,10 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
 
 	mutex_lock(&p->mutex);
 
+	ret = amdgpu_read_lock(dev->ddev, true);
+	if (ret)
+		goto err_read_lock;
+
 	pdd = kfd_get_process_device_data(dev, p);
 	if (!pdd) {
 		pr_err("Process device data doesn't exist\n");
@@ -1395,6 +1525,8 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
 	WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size);
 
 err_unlock:
+	amdgpu_read_unlock(dev->ddev);
+err_read_lock:
 	mutex_unlock(&p->mutex);
 	return ret;
 }
@@ -1465,13 +1597,21 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 			err = PTR_ERR(peer_pdd);
 			goto get_mem_obj_from_handle_failed;
 		}
+
+		err = amdgpu_read_lock(peer->ddev, true);
+		if (err)
+			goto map_memory_to_gpu_failed;
+
 		err = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
 			peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm);
 		if (err) {
 			pr_err("Failed to map to gpu %d/%d\n",
 					i, args->n_devices);
+			amdgpu_read_unlock(peer->ddev);
 			goto map_memory_to_gpu_failed;
 		}
+
+		amdgpu_read_unlock(peer->ddev);
 		args->n_success = i+1;
 	}
@@ -1491,7 +1631,10 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
 		peer_pdd = kfd_get_process_device_data(peer, p);
 		if (WARN_ON_ONCE(!peer_pdd))
 			continue;
-		kfd_flush_tlb(peer_pdd);
+		if (!amdgpu_read_lock(peer->ddev, true)) {
+			kfd_flush_tlb(peer_pdd);
+			amdgpu_read_unlock(peer->ddev);
+		}
 	}
 
 	kfree(devices_arr);
@@ -1572,13 +1715,20 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
 			err = -ENODEV;
 			goto get_mem_obj_from_handle_failed;
 		}
+
+		err = amdgpu_read_lock(peer->ddev, true);
+		if (err)
+			goto unmap_memory_from_gpu_failed;
+
 		err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
 			peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm);
 		if (err) {
 			pr_err("Failed to unmap from gpu %d/%d\n",
 					i, args->n_devices);
+			amdgpu_read_unlock(peer->ddev);
 			goto unmap_memory_from_gpu_failed;
 		}
+		amdgpu_read_unlock(peer->ddev);
 		args->n_success = i+1;
 	}
 	kfree(devices_arr);
@@ -1624,7 +1774,13 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
 		goto out_unlock;
 	}
 
+	retval = amdgpu_read_lock(dev->ddev, true);
+	if (retval)
+		goto out_unlock;
+
 	retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
+
+	amdgpu_read_unlock(dev->ddev);
 	mutex_unlock(&p->mutex);
 
 	args->first_gws = 0;
@@ -1711,6 +1867,9 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
 		return PTR_ERR(dmabuf);
 
 	mutex_lock(&p->mutex);
+	r = amdgpu_read_lock(dev->ddev, true);
+	if (r)
+		goto err_read_lock;
 
 	pdd = kfd_bind_process_to_device(dev, p);
 	if (IS_ERR(pdd)) {
@@ -1731,6 +1890,7 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
 		goto err_free;
 	}
 
+	amdgpu_read_unlock(dev->ddev);
 	mutex_unlock(&p->mutex);
 
 	dma_buf_put(dmabuf);
@@ -1741,6 +1901,8 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
 err_free:
 	amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem, NULL);
 err_unlock:
+	amdgpu_read_unlock(dev->ddev);
+err_read_lock:
 	mutex_unlock(&p->mutex);
 	dma_buf_put(dmabuf);
 	return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d8c8b5ff449a..5ea25c7dff0d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1011,7 +1011,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
 		       void __user *ctl_stack,
 		       u32 *ctl_stack_used_size,
 		       u32 *save_area_used_size);
-
+struct kfd_dev *pqm_query_dev_by_qid(struct process_queue_manager *pqm,
+				unsigned int qid);
 int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
 			      unsigned int fence_value,
 			      unsigned int timeout_ms);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index f5237997fa18..d02ca231ad83 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -898,11 +898,15 @@ static void kfd_process_device_free_bos(struct kfd_process_device *pdd)
 				per_device_list) {
 			if (!peer_pdd->vm)
 				continue;
+			amdgpu_read_lock(peer_pdd->dev->ddev, false);
 			amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
 				peer_pdd->dev->kgd, mem, peer_pdd->vm);
+			amdgpu_read_unlock(peer_pdd->dev->ddev);
 		}
 
+		amdgpu_read_lock(pdd->dev->ddev, false);
 		amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->kgd, mem, NULL);
+		amdgpu_read_unlock(pdd->dev->ddev);
 		kfd_process_device_remove_obj_handle(pdd, id);
 	}
 }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index eb1635ac8988..2b2308c0b006 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -64,6 +64,23 @@ static int find_available_queue_slot(struct process_queue_manager *pqm,
 	return 0;
 }
 
+struct kfd_dev *pqm_query_dev_by_qid(struct process_queue_manager *pqm,
+					unsigned int qid)
+{
+	struct process_queue_node *pqn;
+
+	pqn = get_queue_by_qid(pqm, qid);
+	if (!pqn) {
+		pr_err("Queue id does not match any known queue\n");
+		return NULL;
+	}
+
+	if (pqn->q)
+		return pqn->q->device;
+
+	return NULL;
+}
+
 void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
 {
 	struct kfd_dev *dev = pdd->dev;
-- 
2.17.1
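
For readers following the locking change, the shape that every hardware-touching
ioctl takes after this patch is sketched below, using kfd_ioctl_destroy_queue as
the example. This is a condensed illustration assembled from the hunks above, not
a literal excerpt from the tree: take p->mutex, resolve the queue id to its
kfd_dev, take the per-device amdgpu reset read lock, perform the hardware access,
and release both locks in reverse order on every exit path.

	/*
	 * Condensed sketch of the post-patch lock ordering (not a literal
	 * excerpt): p->mutex first, then amdgpu_read_lock(), then the
	 * hardware-touching pqm call, released in reverse order.
	 */
	static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
					   void *data)
	{
		struct kfd_ioctl_destroy_queue_args *args = data;
		struct kfd_dev *dev;
		int retval;

		mutex_lock(&p->mutex);

		/* v3: the qid -> kfd_dev lookup must stay under p->mutex */
		dev = pqm_query_dev_by_qid(&p->pqm, args->queue_id);
		if (!dev) {
			retval = -EINVAL;
			goto err_query_dev;
		}

		/* Blocks while a GPU reset is in flight; interruptible in ioctl context */
		retval = amdgpu_read_lock(dev->ddev, true);
		if (retval)
			goto err_read_lock;

		retval = pqm_destroy_queue(&p->pqm, args->queue_id);

		amdgpu_read_unlock(dev->ddev);
	err_read_lock:
	err_query_dev:
		mutex_unlock(&p->mutex);
		return retval;
	}

Taking the reset read lock inside each ioctl rather than inside the process queue
manager (the v2 change) keeps the pqm_* helpers free of that lock, so an ioctl
that already holds it can still call into another pqm path, as kfd_ioctl_dbg_register
can with pqm_create_queue, without recursing on the lock.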