On 2024-10-18 10:09, Chen, Xiaogang wrote:
On 10/17/2024 4:04 PM, Felix Kuehling wrote:On 2024-10-15 17:21, Xiaogang.Chen wrote:From: Xiaogang Chen <xiaogang.chen@xxxxxxx>The purpose of this patch is having kfd driver function as expected during AMDgpu device plug/unplug.When an AMD gpu device got unplug kfd driver stops all queues from this device. If there are user processes still ref the render node this device is marked as invalid. kfd driver will return error to following requests to the device from all existing user processes. Existing user processes can still use remaininggpu devices during/after unplug event.After all refs to the device have been closed from user space kfd drivertopology got updated by removing correspodent kfd nodes.User space can use remaining gpu devices that are valid at same time. When all AMD gpu devices got removed kfd driver will not allow open /dev/kfd request.Unplugged AMD gpu devices can be re-plugged. kfd driver will use added devicesand function as usual. Signed-off-by: Xiaogang Chen<Xiaogang.Chen@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 5 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +-drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 78 +++++++++++++++++++drivers/gpu/drm/amd/amdkfd/kfd_device.c | 43 ++++++++++ drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c | 6 ++ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 7 ++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 13 +++- .../amd/amdkfd/kfd_process_queue_manager.c | 24 ++++++ 9 files changed, 183 insertions(+), 3 deletions(-)diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.cindex b545940e512b..651ae0775f80 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c@@ -248,6 +248,11 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry); } +void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd) +{ + kgd2kfd_teardown_kfd_device(kfd); +} + void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm) { if (adev->kfd.dev)diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.hindex 7e0a22072536..bd241f569b79 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -152,6 +152,7 @@ struct amdkfd_process_info { int amdgpu_amdkfd_init(void); void amdgpu_amdkfd_fini(void); +void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd);void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm); @@ -431,6 +432,7 @@ int kgd2kfd_check_and_lock_kfd(void); void kgd2kfd_unlock_kfd(void); int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id); int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id); +void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd); #else static inline int kgd2kfd_init(void) {@@ -511,5 +513,10 @@ static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id){ return 0; } + +void kgd2kfd_teardown_processes(void) +{ +} + #endif #endif /* AMDGPU_AMDKFD_H_INCLUDED */diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.cindex 1e47655e02c6..4529d7a88b98 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c@@ -3315,7 +3315,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE); amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE); - amdgpu_amdkfd_suspend(adev, false); + if (adev->kfd.dev) + amdgpu_amdkfd_teardown_kfd_device(adev->kfd.dev); /* Workaroud for ASICs need to disable SMC first */ amdgpu_device_smu_fini_early(adev);diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.cindex a1f191a5984b..d246f72ae0e9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c@@ -327,6 +327,13 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,err = -EINVAL; goto err_pdd; } + + if (!is_kfd_process_device_valid(pdd)) { + pr_debug("gpu 0x%x is not available\n", args->gpu_id); + err = -EINVAL; + goto err_pdd; + } +Instead of duplicating this in all the ioctl functions, could this check be done in kfd_process_device_data_by_id?Yes, that makes code simpler. Also, need add same check on kfd_get_process_device_data.
Maybe not. kfd_get_process_device_data gets a kfd_node as parameter, that callers typically get from get from a call to kfd_device_by_id. Maybe the check should be in kfd_get_device_by_id so it doesn't return invalid devices.
dev = pdd->dev; pdd = kfd_bind_process_to_device(dev, p);@@ -578,6 +585,12 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,goto err_pdd; } + if (!is_kfd_process_device_valid(pdd)) { + pr_debug("gpu 0x%x is not available\n", args->gpu_id); + err = -EINVAL; + goto err_pdd; + } + pdd = kfd_bind_process_to_device(pdd->dev, p); if (IS_ERR(pdd)) { err = -ESRCH;@@ -621,6 +634,11 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,goto err_pdd; } + if (!is_kfd_process_device_valid(pdd)) { + err = -EINVAL; + goto err_pdd; + } + pdd = kfd_bind_process_to_device(pdd->dev, p); if (IS_ERR(pdd)) { err = -ESRCH;@@ -704,6 +722,9 @@ static int kfd_ioctl_get_process_apertures(struct file *filp,for (i = 0; i < p->n_pdds; i++) { struct kfd_process_device *pdd = p->pdds[i]; + if (!is_kfd_process_device_valid(pdd)) + continue; + pAperture = &args->process_apertures[args->num_of_nodes]; pAperture->gpu_id = pdd->dev->id;@@ -779,6 +800,9 @@ static int kfd_ioctl_get_process_apertures_new(struct file *filp,for (i = 0; i < min(p->n_pdds, args->num_of_nodes); i++) { struct kfd_process_device *pdd = p->pdds[i]; + if (!is_kfd_process_device_valid(pdd)) + continue; + pa[i].gpu_id = pdd->dev->id; pa[i].lds_base = pdd->lds_base; pa[i].lds_limit = pdd->lds_limit;@@ -901,6 +925,11 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep,goto bind_process_to_device_fail; } + if (!is_kfd_process_device_valid(pdd)) { + err = PTR_ERR(pdd); + goto bind_process_to_device_fail; + } + pdd->qpd.sh_hidden_private_base = args->va_addr; mutex_unlock(&p->mutex);@@ -981,6 +1010,11 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,goto err_pdd; } + if (!is_kfd_process_device_valid(pdd)) { + ret = -EINVAL; + goto err_pdd; + } + if (pdd->drm_file) { ret = pdd->drm_file == drm_file ? 0 : -EBUSY; goto err_drm_file;@@ -1031,6 +1065,10 @@ static int kfd_ioctl_get_available_memory(struct file *filep,if (!pdd) return -EINVAL; + + if (!is_kfd_process_device_valid(pdd)) + return -EINVAL; +args->available = amdgpu_amdkfd_get_available_memory(pdd->dev->adev,pdd->dev->node_id); kfd_unlock_pdd(pdd);@@ -1090,6 +1128,11 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,goto err_pdd; } + if (!is_kfd_process_device_valid(pdd)) { + err = -EINVAL; + goto err_pdd; + } + dev = pdd->dev; if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) &&@@ -1202,6 +1245,12 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,goto err_pdd; } + if (!is_kfd_process_device_valid(pdd)) { + pr_err("Process device is not valid\n"); + ret = -EINVAL; + goto err_pdd; + } + mem = kfd_process_device_translate_handle( pdd, GET_IDR_HANDLE(args->handle)); if (!mem) {@@ -1266,6 +1315,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,err = -EINVAL; goto get_process_device_data_failed; } + + if (!is_kfd_process_device_valid(pdd)) { + err = -EINVAL; + goto get_process_device_data_failed; + } + dev = pdd->dev; pdd = kfd_bind_process_to_device(dev, p);@@ -1384,6 +1439,11 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,goto bind_process_to_device_failed; } + if (!is_kfd_process_device_valid(pdd)) { + err = -EINVAL; + goto bind_process_to_device_failed; + } + mem = kfd_process_device_translate_handle(pdd, GET_IDR_HANDLE(args->handle)); if (!mem) {@@ -1567,6 +1627,11 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,goto err_unlock; } + if (!is_kfd_process_device_valid(pdd)) { + r = PTR_ERR(pdd); + goto err_unlock; + } +r = amdgpu_amdkfd_gpuvm_import_dmabuf_fd(pdd->dev->adev, args->dmabuf_fd,args->va_addr, pdd->drm_priv, (struct kgd_mem **)&mem, &size,@@ -1616,6 +1681,11 @@ static int kfd_ioctl_export_dmabuf(struct file *filep,goto err_unlock; } + if (!is_kfd_process_device_valid(pdd)) { + ret = -EINVAL; + goto err_unlock; + } + mem = kfd_process_device_translate_handle(pdd, GET_IDR_HANDLE(args->handle)); if (!mem) {@@ -1660,6 +1730,9 @@ static int kfd_ioctl_smi_events(struct file *filep,if (!pdd) return -EINVAL; + if (!is_kfd_process_device_valid(pdd)) + return -EINVAL; + return kfd_smi_event_open(pdd->dev, &args->anon_fd); }@@ -2990,6 +3063,11 @@ static int kfd_ioctl_set_debug_trap(struct file *filep, struct kfd_process *p, vr = -ENODEV; goto unlock_out; } + + if (!is_kfd_process_device_valid(pdd)) { + r = -ENODEV; + goto unlock_out; + } } switch (args->op) {diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.cindex fad1c8f2bc83..019567249110 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -893,6 +893,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, svm_range_set_max_pages(kfd->adev); kfd->init_complete = true; + kfd->valid = true;dev_info(kfd_device, "added device %x:%x\n", kfd->adev->pdev->vendor,kfd->adev->pdev->device); @@ -919,6 +920,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, void kgd2kfd_device_exit(struct kfd_dev *kfd) { + struct kfd_process *p; + unsigned int i, j; + unsigned int temp; + if (kfd->init_complete) { /* Cleanup KFD nodes */ kfd_cleanup_nodes(kfd, kfd->num_nodes); @@ -929,6 +934,20 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem); } + /* now this kfd_dev has been completely removed from kfd driver+ * before kfree kfd iterate all existing kfd processes, if kfd process+ * uses any kfd node from this kfd set its ref to NULL + */ + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) { + for (i = 0; i < kfd->num_nodes; i++) + for (j = 0; j < p->n_pdds; j++) { + if (kfd->nodes[i] == p->pdds[j]->dev) { + p->pdds[j]->dev = NULL;Could this be done in teardown_kfd_device? Then you may not need a separate "valid" for is_kfd_process_device_valid. And any accidental access to a device associated with an invalid pdd would automatically trigger a kernel error message with a backtrace.At teardown_kfd_device the adev has not been removed, ex: there are user apps still refer the render node. kfd dev(kfd nodes) is released at kfd_cleanup_nodes of kgd2kfd_device_exit when all ref to adev got released(user apps close render node). During that time kfd nodes(pdd->dev) are valid. We still can access kfd nodes data structure though their queues got stopped and kfd_node->kfd has been markded as invalid.
I'm not sure why that matters. The fact that the there are still other pointers to the dev doesn't mean the pointer in the pdd must remain valid.
On the other hand, there should be code in kgd2kfd_teardown_kfd_device or kgd2kfd_device_exit to clean up _all_ the other pointers to the invalid kfd_dev and kfd_nodes. AFAICT the kfd_dev and kfd_nodes are not reference counted, and there is no guarantee that these structures still exist by the time the processes terminate and run their cleanup code. You can't rely on kfd_dev->valid after the kfd_dev itself has been freed with kfree in kgd2kfd_device_exit. By that time all pointers to the kfd_dev and its nodes must have been cleaned up.
A quick survey of the header files shows * kfd_dev pointers in kfd_node, kfd_device_queue_manager * kfd_node pointers in kfd_dev, kfd_bo, queue, kernel_queue, kfd_process_device, kfd_*_properties referenced in lists in kfd_topology_device, svm_range_bo, mqd_manager
pci base driver will find that the device has been unplugged, will not call amdgpu driver's callback for pci device that has been removed. So that would not happen.+ break; + } + } + } + kfree(kfd); }@@ -1485,6 +1504,30 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)return node->dqm->ops.halt(node->dqm); } +/* tear down this kfd deve */ +void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd) +{ + struct kfd_process *p; + struct kfd_node *dev; + unsigned int i; + unsigned int temp; + + kfd->valid = false; + /* stop queues from kfd nodes in this kfd dev */ + for (i = 0; i < kfd->num_nodes; i++) { + dev = kfd->nodes[i]; + dev->dqm->ops.stop(dev->dqm); + }If the GPU was unplugged already, what's the point of this? Won't this trigger a timeout?
This has nothing to do with PCIe callbacks. dev->dqm->ops.stop tries to talk to the HWS firmware to remove queues. That will hand or time out if the GPU has been unplugged.
Regards, Felix
ok, will iterate all existing kfd processes. If any kfd node from this kfd dev got used by a kfd process send the event to correspondent user process.++ /* signal a gpu device is being teared down to user spalce processes by+ * KFD_EVENT_TYPE_HW_EXCEPTION event + */ + hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) + kfd_signal_hw_exception_event(p->pasid);This sends exceptions to all processes. It should only do this for processes that use the unplugged device (i.e. have a pdd that uses the device). This excludes processes that don't have the device in their cgroup.ok, this change is not related to the patch. I thought it is better to update kfd topology at last step after all kfd process resources got released. I will remove this change.+ + return; +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWSdiff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.cindex dbcb60eb54b2..b8dd80ee17be 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c@@ -378,6 +378,12 @@ int kfd_init_apertures(struct kfd_process *process)continue; } + /* kfd device that this kfd node belogns is not valid */ + if (!dev->kfd->valid) { + id++; + continue; + } + pdd = kfd_create_process_device_data(dev, process); if (!pdd) { dev_err(dev->adev->dev,diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.hindex 6a5bf88cc232..97e7692ce569 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -371,6 +371,9 @@ struct kfd_dev {/* bitmap for dynamic doorbell allocation from doorbell object */unsigned long *doorbell_bitmap; + + /* this kfd_dev valid or not */ + bool valid; }; enum kfd_mempool {@@ -1055,6 +1058,10 @@ int kfd_process_restore_queues(struct kfd_process *p);void kfd_suspend_all_processes(void); int kfd_resume_all_processes(void);+static inline bool is_kfd_process_device_valid(struct kfd_process_device *pdd) {+ return (pdd && pdd->dev && pdd->dev->kfd && pdd->dev->kfd->valid); +} +struct kfd_process_device *kfd_process_device_data_by_id(struct kfd_process *process,uint32_t gpu_id);diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.cindex d07acf1b2f93..c06eb9d8008e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c@@ -1157,8 +1157,6 @@ static void kfd_process_wq_release(struct work_struct *work)ef = rcu_access_pointer(p->ef); dma_fence_signal(ef); - kfd_process_remove_sysfs(p); - kfd_process_kunmap_signal_bo(p); kfd_process_free_outstanding_kfd_bos(p); svm_range_list_fini(p);@@ -1173,6 +1171,11 @@ static void kfd_process_wq_release(struct work_struct *work)put_task_struct(p->lead_thread); + /* the last step is removing process entries under /sys + * to indicate the process has been terminated. + */This comment doesn't provide any useful information. What would be useful is, why this needs to be the last step? Without that, I see no good reason for this change.I will change kfd_get_process_device_data that will include this check, then this message will be merged at !pdd case.+ kfd_process_remove_sysfs(p); + kfree(p); }@@ -1536,6 +1539,12 @@ static struct kfd_process *create_process(const struct task_struct *thread)if (err != 0) goto err_init_apertures; + /* no any kfd_process_device can be created */ + if (!process->n_pdds) { + err = -ENODEV; + goto err_init_apertures; + } +/* Check XNACK support after PDDs are created in kfd_init_apertures */process->xnack_enabled = kfd_process_xnack_mode(process, false);diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.cindex c76db22a1000..eaf4ba65466c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c@@ -124,6 +124,11 @@ int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,return -EINVAL; } + if (!is_kfd_process_device_valid(pdd)) { + pr_debug("device 0x%x is not available\n",dev->node_id); + return -EINVAL; + } + /* Only allow one queue per process can have GWS assigned */ if (gws && pdd->qpd.num_gws) return -EBUSY;@@ -498,6 +503,11 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)if (WARN_ON(!dev)) return -ENODEV; + if (!dev->kfd || !dev->kfd->valid) { + pr_err("Process device is not valid\n");Would you expect to see this message during process termination after a hot-unplug? Should this really be an error message, or would an info or debug message be more appropriate?+ return -1;This should be a proper error code. -1 is -EPERM.Same as above.+ } + pdd = kfd_get_process_device_data(dev, pqm->process); if (!pdd) { pr_err("Process device data doesn't exist\n");@@ -567,6 +577,10 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm,pdd = kfd_get_process_device_data(q->device, q->process); if (!pdd) return -ENODEV; + + if (!is_kfd_process_device_valid(pdd)) + return -ENODEV; + vm = drm_priv_to_vm(pdd->drm_priv); err = amdgpu_bo_reserve(vm->root.bo, false); if (err)@@ -612,6 +626,11 @@ int pqm_update_mqd(struct process_queue_manager *pqm,return -EFAULT; } + if (!pqn->q->device->kfd->valid) { + pr_debug("device where queue %d exists is not valid\n", qid); + return -EFAULT; + } + /* CUs are masked for debugger requirements so deny user mask */ if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr) return -EBUSY;@@ -679,6 +698,11 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,return -EFAULT; } + if (!pqn->q->device->kfd->valid) { + pr_debug("device where queue %d exists is not valid\n", qid); + return -EFAULT;EFAULT means "bad address". Probably not the right error code here.Will use -EINVAL. Thanks XiaogangRegards, Felix+ } +return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm,pqn->q, ctl_stack,