Re: [PATCH v2] drm/amdkfd: Add kfd driver function to support hot plug/unplug amdgpu devices

"Chen, Xiaogang" <xiaogang.chen@xxxxxxx> · Fri, 18 Oct 2024 09:09:23 -0500

On 10/17/2024 4:04 PM, Felix Kuehling wrote:

On 2024-10-15 17:21, Xiaogang.Chen wrote:
From: Xiaogang Chen <xiaogang.chen@xxxxxxx>

The purpose of this patch is having kfd driver function as expected 
during AMD
gpu device plug/unplug.

When an AMD gpu device got unplug kfd driver stops all queues from 
this device.
If there are user processes still ref the render node this device is 
marked as
invalid. kfd driver will return error to following requests to the 
device from
all existing user processes. Existing user processes can still use 
remaining
gpu devices during/after unplug event.

After all refs to the device have been closed from user space kfd driver
topology got updated by removing correspodent kfd nodes.

User space can use remaining gpu devices that are valid at same time. 
When all
AMD gpu devices got removed kfd driver will not allow open /dev/kfd 
request.

Unplugged AMD gpu devices can be re-plugged. kfd driver will use 
added devices
and function as usual.

Signed-off-by: Xiaogang Chen<Xiaogang.Chen@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c    |  5 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  7 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    |  3 +-
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 78 +++++++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_device.c       | 43 ++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c  |  6 ++
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  7 ++
  drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 13 +++-
  .../amd/amdkfd/kfd_process_queue_manager.c    | 24 ++++++
  9 files changed, 183 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index b545940e512b..651ae0775f80 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -248,6 +248,11 @@ void amdgpu_amdkfd_interrupt(struct 
amdgpu_device *adev,
          kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
  }
  +void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd)
+{
+       kgd2kfd_teardown_kfd_device(kfd);
+}
+
  void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
  {
      if (adev->kfd.dev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 7e0a22072536..bd241f569b79 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -152,6 +152,7 @@ struct amdkfd_process_info {
    int amdgpu_amdkfd_init(void);
  void amdgpu_amdkfd_fini(void);
+void amdgpu_amdkfd_teardown_kfd_device(struct kfd_dev *kfd);
    void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
  int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm);
@@ -431,6 +432,7 @@ int kgd2kfd_check_and_lock_kfd(void);
  void kgd2kfd_unlock_kfd(void);
  int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
  int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
+void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd);
  #else
  static inline int kgd2kfd_init(void)
  {
@@ -511,5 +513,10 @@ static inline int kgd2kfd_stop_sched(struct 
kfd_dev *kfd, uint32_t node_id)
  {
      return 0;
  }
+
+void kgd2kfd_teardown_processes(void)
+{
+}
+
  #endif
  #endif /* AMDGPU_AMDKFD_H_INCLUDED */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e47655e02c6..4529d7a88b98 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3315,7 +3315,8 @@ static int amdgpu_device_ip_fini_early(struct 
amdgpu_device *adev)
      amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
      amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
  -    amdgpu_amdkfd_suspend(adev, false);
+    if (adev->kfd.dev)
+        amdgpu_amdkfd_teardown_kfd_device(adev->kfd.dev);
        /* Workaroud for ASICs need to disable SMC first */
      amdgpu_device_smu_fini_early(adev);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index a1f191a5984b..d246f72ae0e9 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -327,6 +327,13 @@ static int kfd_ioctl_create_queue(struct file 
*filep, struct kfd_process *p,
          err = -EINVAL;
          goto err_pdd;
      }
+
+    if (!is_kfd_process_device_valid(pdd)) {
+        pr_debug("gpu 0x%x is not available\n", args->gpu_id);
+        err = -EINVAL;
+        goto err_pdd;
+    }
+

Instead of duplicating this in all the ioctl functions, could this 
check be done in kfd_process_device_data_by_id?

Yes, that makes code simpler. Also, need add same check on 
kfd_get_process_device_data.



      dev = pdd->dev;
        pdd = kfd_bind_process_to_device(dev, p);
@@ -578,6 +585,12 @@ static int kfd_ioctl_set_memory_policy(struct 
file *filep,
          goto err_pdd;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        pr_debug("gpu 0x%x is not available\n", args->gpu_id);
+        err = -EINVAL;
+        goto err_pdd;
+    }
+
      pdd = kfd_bind_process_to_device(pdd->dev, p);
      if (IS_ERR(pdd)) {
          err = -ESRCH;
@@ -621,6 +634,11 @@ static int kfd_ioctl_set_trap_handler(struct 
file *filep,
          goto err_pdd;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        err = -EINVAL;
+        goto err_pdd;
+    }
+
      pdd = kfd_bind_process_to_device(pdd->dev, p);
      if (IS_ERR(pdd)) {
          err = -ESRCH;
@@ -704,6 +722,9 @@ static int kfd_ioctl_get_process_apertures(struct 
file *filp,
      for (i = 0; i < p->n_pdds; i++) {
          struct kfd_process_device *pdd = p->pdds[i];
  +        if (!is_kfd_process_device_valid(pdd))
+            continue;
+
          pAperture =
&args->process_apertures[args->num_of_nodes];
          pAperture->gpu_id = pdd->dev->id;
@@ -779,6 +800,9 @@ static int 
kfd_ioctl_get_process_apertures_new(struct file *filp,
      for (i = 0; i < min(p->n_pdds, args->num_of_nodes); i++) {
          struct kfd_process_device *pdd = p->pdds[i];
  +        if (!is_kfd_process_device_valid(pdd))
+            continue;
+
          pa[i].gpu_id = pdd->dev->id;
          pa[i].lds_base = pdd->lds_base;
          pa[i].lds_limit = pdd->lds_limit;
@@ -901,6 +925,11 @@ static int 
kfd_ioctl_set_scratch_backing_va(struct file *filep,
          goto bind_process_to_device_fail;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        err = PTR_ERR(pdd);
+        goto bind_process_to_device_fail;
+    }
+
      pdd->qpd.sh_hidden_private_base = args->va_addr;
        mutex_unlock(&p->mutex);
@@ -981,6 +1010,11 @@ static int kfd_ioctl_acquire_vm(struct file 
*filep, struct kfd_process *p,
          goto err_pdd;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        ret = -EINVAL;
+        goto err_pdd;
+    }
+
      if (pdd->drm_file) {
          ret = pdd->drm_file == drm_file ? 0 : -EBUSY;
          goto err_drm_file;
@@ -1031,6 +1065,10 @@ static int 
kfd_ioctl_get_available_memory(struct file *filep,
        if (!pdd)
          return -EINVAL;
+
+    if (!is_kfd_process_device_valid(pdd))
+        return -EINVAL;
+
      args->available = 
amdgpu_amdkfd_get_available_memory(pdd->dev->adev,
                              pdd->dev->node_id);
      kfd_unlock_pdd(pdd);
@@ -1090,6 +1128,11 @@ static int 
kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
          goto err_pdd;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        err = -EINVAL;
+        goto err_pdd;
+    }
+
      dev = pdd->dev;
        if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) &&
@@ -1202,6 +1245,12 @@ static int kfd_ioctl_free_memory_of_gpu(struct 
file *filep,
          goto err_pdd;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        pr_err("Process device is not valid\n");
+        ret = -EINVAL;
+        goto err_pdd;
+    }
+
      mem = kfd_process_device_translate_handle(
          pdd, GET_IDR_HANDLE(args->handle));
      if (!mem) {
@@ -1266,6 +1315,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct 
file *filep,
          err = -EINVAL;
          goto get_process_device_data_failed;
      }
+
+    if (!is_kfd_process_device_valid(pdd)) {
+        err = -EINVAL;
+        goto get_process_device_data_failed;
+    }
+
      dev = pdd->dev;
        pdd = kfd_bind_process_to_device(dev, p);
@@ -1384,6 +1439,11 @@ static int 
kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
          goto bind_process_to_device_failed;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        err = -EINVAL;
+        goto bind_process_to_device_failed;
+    }
+
      mem = kfd_process_device_translate_handle(pdd,
                          GET_IDR_HANDLE(args->handle));
      if (!mem) {
@@ -1567,6 +1627,11 @@ static int kfd_ioctl_import_dmabuf(struct file 
*filep,
          goto err_unlock;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        r = PTR_ERR(pdd);
+        goto err_unlock;
+    }
+
      r = amdgpu_amdkfd_gpuvm_import_dmabuf_fd(pdd->dev->adev, 
args->dmabuf_fd,
                           args->va_addr, pdd->drm_priv,
                           (struct kgd_mem **)&mem, &size,
@@ -1616,6 +1681,11 @@ static int kfd_ioctl_export_dmabuf(struct file 
*filep,
          goto err_unlock;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        ret = -EINVAL;
+        goto err_unlock;
+    }
+
      mem = kfd_process_device_translate_handle(pdd,
                          GET_IDR_HANDLE(args->handle));
      if (!mem) {
@@ -1660,6 +1730,9 @@ static int kfd_ioctl_smi_events(struct file 
*filep,
      if (!pdd)
          return -EINVAL;
  +    if (!is_kfd_process_device_valid(pdd))
+        return -EINVAL;
+
      return kfd_smi_event_open(pdd->dev, &args->anon_fd);
  }
  @@ -2990,6 +3063,11 @@ static int kfd_ioctl_set_debug_trap(struct 
file *filep, struct kfd_process *p, v
              r = -ENODEV;
              goto unlock_out;
          }
+
+        if (!is_kfd_process_device_valid(pdd)) {
+            r = -ENODEV;
+            goto unlock_out;
+        }
      }
        switch (args->op) {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index fad1c8f2bc83..019567249110 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -893,6 +893,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
      svm_range_set_max_pages(kfd->adev);
        kfd->init_complete = true;
+    kfd->valid = true;
      dev_info(kfd_device, "added device %x:%x\n", 
kfd->adev->pdev->vendor,
           kfd->adev->pdev->device);
  @@ -919,6 +920,10 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
    void kgd2kfd_device_exit(struct kfd_dev *kfd)
  {
+    struct kfd_process *p;
+    unsigned int i, j;
+    unsigned int temp;
+
      if (kfd->init_complete) {
          /* Cleanup KFD nodes */
          kfd_cleanup_nodes(kfd, kfd->num_nodes);
@@ -929,6 +934,20 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
          amdgpu_amdkfd_free_gtt_mem(kfd->adev, &kfd->gtt_mem);
      }
  +    /* now this kfd_dev has been completely removed from kfd driver
+     * before kfree kfd iterate all existing kfd processes, if kfd 
process
+     * uses any kfd node from this kfd set its ref to NULL
+     */
+    hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+        for (i = 0; i < kfd->num_nodes; i++)
+            for (j = 0; j < p->n_pdds; j++) {
+                if (kfd->nodes[i] == p->pdds[j]->dev) {
+                    p->pdds[j]->dev = NULL;

Could this be done in teardown_kfd_device? Then you may not need a 
separate "valid" for is_kfd_process_device_valid. And any accidental 
access to a device associated with an invalid pdd would automatically 
trigger a kernel error message with a backtrace.

At teardown_kfd_device the adev has not been removed, ex: there are user 
apps still refer the render node. kfd dev(kfd nodes) is released at 
kfd_cleanup_nodes of kgd2kfd_device_exit when all ref to adev got 
released(user apps close render node). During that time kfd 
nodes(pdd->dev) are valid. We still can access kfd nodes data structure 
though their queues got stopped and kfd_node->kfd has been markded as 
invalid.



+                    break;
+                }
+            }
+    }
+
      kfree(kfd);
  }
  @@ -1485,6 +1504,30 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, 
uint32_t node_id)
      return node->dqm->ops.halt(node->dqm);
  }
  +/* tear down this kfd deve */
+void kgd2kfd_teardown_kfd_device(struct kfd_dev *kfd)
+{
+    struct kfd_process *p;
+    struct kfd_node *dev;
+    unsigned int i;
+    unsigned int temp;
+
+    kfd->valid = false;
+    /* stop queues from kfd nodes in this kfd dev */
+    for (i = 0; i < kfd->num_nodes; i++) {
+        dev = kfd->nodes[i];
+        dev->dqm->ops.stop(dev->dqm);
+    }

If the GPU was unplugged already, what's the point of this? Won't this 
trigger a timeout?

pci base driver will find that the device has been unplugged, will not 
call amdgpu driver's callback for pci device that has been removed. So 
that would not happen.

+
+    /* signal a gpu device is being teared down to user spalce 
processes by
+     * KFD_EVENT_TYPE_HW_EXCEPTION event
+     */
+    hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes)
+        kfd_signal_hw_exception_event(p->pasid);

This sends exceptions to all processes. It should only do this for 
processes that use the unplugged device (i.e. have a pdd that uses the 
device). This excludes processes that don't have the device in their 
cgroup.
ok, will iterate all existing kfd processes. If any kfd node from this 
kfd dev got used by a kfd process send the event to correspondent user 
process.


+
+    return;
+}
+
  #if defined(CONFIG_DEBUG_FS)
    /* This function will send a package to HIQ to hang the HWS
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
index dbcb60eb54b2..b8dd80ee17be 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c
@@ -378,6 +378,12 @@ int kfd_init_apertures(struct kfd_process *process)
              continue;
          }
  +        /* kfd device that this kfd node belogns is not valid */
+        if (!dev->kfd->valid) {
+            id++;
+            continue;
+        }
+
          pdd = kfd_create_process_device_data(dev, process);
          if (!pdd) {
              dev_err(dev->adev->dev,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 6a5bf88cc232..97e7692ce569 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -371,6 +371,9 @@ struct kfd_dev {
        /* bitmap for dynamic doorbell allocation from doorbell 
object */
      unsigned long *doorbell_bitmap;
+
+    /* this kfd_dev valid or not */
+    bool valid;
  };
    enum kfd_mempool {
@@ -1055,6 +1058,10 @@ int kfd_process_restore_queues(struct 
kfd_process *p);
  void kfd_suspend_all_processes(void);
  int kfd_resume_all_processes(void);
  +static inline bool is_kfd_process_device_valid(struct 
kfd_process_device *pdd) {
+    return (pdd && pdd->dev && pdd->dev->kfd && pdd->dev->kfd->valid);
+}
+
  struct kfd_process_device *kfd_process_device_data_by_id(struct 
kfd_process *process,
                               uint32_t gpu_id);
  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d07acf1b2f93..c06eb9d8008e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1157,8 +1157,6 @@ static void kfd_process_wq_release(struct 
work_struct *work)
      ef = rcu_access_pointer(p->ef);
      dma_fence_signal(ef);
  -    kfd_process_remove_sysfs(p);
-
      kfd_process_kunmap_signal_bo(p);
      kfd_process_free_outstanding_kfd_bos(p);
      svm_range_list_fini(p);
@@ -1173,6 +1171,11 @@ static void kfd_process_wq_release(struct 
work_struct *work)
        put_task_struct(p->lead_thread);
  +    /* the last step is removing process entries under /sys
+     * to indicate the process has been terminated.
+     */

This comment doesn't provide any useful information. What would be 
useful is, why this needs to be the last step? Without that, I see no 
good reason for this change.
ok, this change is not related to the patch. I thought it is better to 
update kfd topology at last step after all kfd process resources got 
released. I will remove this change.


+    kfd_process_remove_sysfs(p);
+
      kfree(p);
  }
  @@ -1536,6 +1539,12 @@ static struct kfd_process 
*create_process(const struct task_struct *thread)
      if (err != 0)
          goto err_init_apertures;
  +    /* no any kfd_process_device can be created */
+    if (!process->n_pdds) {
+        err = -ENODEV;
+        goto err_init_apertures;
+    }
+
      /* Check XNACK support after PDDs are created in 
kfd_init_apertures */
      process->xnack_enabled = kfd_process_xnack_mode(process, false);
  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index c76db22a1000..eaf4ba65466c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -124,6 +124,11 @@ int pqm_set_gws(struct process_queue_manager 
*pqm, unsigned int qid,
          return -EINVAL;
      }
  +    if (!is_kfd_process_device_valid(pdd)) {
+        pr_debug("device 0x%x is not available\n",dev->node_id);
+        return -EINVAL;
+    }
+
      /* Only allow one queue per process can have GWS assigned */
      if (gws && pdd->qpd.num_gws)
          return -EBUSY;
@@ -498,6 +503,11 @@ int pqm_destroy_queue(struct 
process_queue_manager *pqm, unsigned int qid)
      if (WARN_ON(!dev))
          return -ENODEV;
  +    if (!dev->kfd || !dev->kfd->valid) {
+        pr_err("Process device is not valid\n");

Would you expect to see this message during process termination after 
a hot-unplug? Should this really be an error message, or would an info 
or debug message be more appropriate?
I will change kfd_get_process_device_data that will include this check, 
then this message will be merged at !pdd case.


+        return -1;

This should be a proper error code. -1 is -EPERM.

Same as above.

+    }
+
      pdd = kfd_get_process_device_data(dev, pqm->process);
      if (!pdd) {
          pr_err("Process device data doesn't exist\n");
@@ -567,6 +577,10 @@ int pqm_update_queue_properties(struct 
process_queue_manager *pqm,
          pdd = kfd_get_process_device_data(q->device, q->process);
          if (!pdd)
              return -ENODEV;
+
+        if (!is_kfd_process_device_valid(pdd))
+            return -ENODEV;
+         vm = drm_priv_to_vm(pdd->drm_priv);
          err = amdgpu_bo_reserve(vm->root.bo, false);
          if (err)
@@ -612,6 +626,11 @@ int pqm_update_mqd(struct process_queue_manager 
*pqm,
          return -EFAULT;
      }
  +    if (!pqn->q->device->kfd->valid) {
+        pr_debug("device where queue %d exists is not valid\n", qid);
+        return -EFAULT;
+    }
+
      /* CUs are masked for debugger requirements so deny user mask  */
      if (pqn->q->properties.is_dbg_wa && minfo && minfo->cu_mask.ptr)
          return -EBUSY;
@@ -679,6 +698,11 @@ int pqm_get_wave_state(struct 
process_queue_manager *pqm,
          return -EFAULT;
      }
  +    if (!pqn->q->device->kfd->valid) {
+        pr_debug("device where queue %d exists is not valid\n", qid);
+        return -EFAULT;

EFAULT means "bad address". Probably not the right error code here.

Will use -EINVAL.

Thanks

Xiaogang


Regards,
  Felix


+    }
+
      return pqn->q->device->dqm->ops.get_wave_state(pqn->q->device->dqm,
                                 pqn->q,
                                 ctl_stack,