Re: [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap

James Zhu <jamesz@xxxxxxx> · Thu, 23 Nov 2023 14:52:35 -0500

On 2023-11-23 14:08, Felix Kuehling wrote:
On 2023-11-23 13:27, James Zhu wrote:

On 2023-11-22 17:31, Felix Kuehling wrote:

On 2023-11-03 09:11, James Zhu wrote:
Enable a delay work to trigger pc sampling trap.

Signed-off-by: James Zhu <James.Zhu@xxxxxxx>
---
  drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  3 ++
  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 39 
++++++++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  1 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  1 +
  4 files changed, 44 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index bcaeedac8fe0..fb21902e433a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -35,6 +35,7 @@
  #include "kfd_migrate.h"
  #include "amdgpu.h"
  #include "amdgpu_xcp.h"
+#include "kfd_pc_sampling.h"
    #define MQD_SIZE_ALIGNED 768
  @@ -537,6 +538,8 @@ static void kfd_pc_sampling_init(struct 
kfd_node *dev)
  {
      mutex_init(&dev->pcs_data.mutex);
idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
+ INIT_WORK(&dev->pcs_data.hosttrap_entry.base.pc_sampling_work,
+        kfd_pc_sample_handler);
  }
    static void kfd_pc_sampling_exit(struct kfd_node *dev)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 2c4ac5b4cc4b..e8f0559b618e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -38,6 +38,43 @@ struct supported_pc_sample_info 
supported_formats[] = {
      { IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 },
  };
  +void kfd_pc_sample_handler(struct work_struct *work)
+{
+    struct amdgpu_device *adev;
+    struct kfd_node *node;
+    uint32_t timeout = 0;
+
+    node = container_of(work, struct kfd_node,
+ pcs_data.hosttrap_entry.base.pc_sampling_work);
+
+    mutex_lock(&node->pcs_data.mutex);
+    if (node->pcs_data.hosttrap_entry.base.active_count &&
+ node->pcs_data.hosttrap_entry.base.pc_sample_info.value &&
+        node->kfd2kgd->trigger_pc_sample_trap) {
+        switch 
(node->pcs_data.hosttrap_entry.base.pc_sample_info.type) {
+        case KFD_IOCTL_PCS_TYPE_TIME_US:
+            timeout = 
(uint32_t)node->pcs_data.hosttrap_entry.base.pc_sample_info.value;
+            break;
+        default:
+            pr_debug("PC Sampling type %d not supported.",
+ node->pcs_data.hosttrap_entry.base.pc_sample_info.type);
+        }
+    }
+    mutex_unlock(&node->pcs_data.mutex);
+    if (!timeout)
+        return;
+
+    adev = node->adev;
+    while 
(!READ_ONCE(node->pcs_data.hosttrap_entry.base.stop_enable)) {

This worker basically runs indefinitely (controlled by user mode).

+ node->kfd2kgd->trigger_pc_sample_trap(adev, 
node->vm_info.last_vmid_kfd,
+ &node->pcs_data.hosttrap_entry.base.target_simd,
+ &node->pcs_data.hosttrap_entry.base.target_wave_slot,
+ node->pcs_data.hosttrap_entry.base.pc_sample_info.method);
+        pr_debug_ratelimited("triggered a host trap.");
+        usleep_range(timeout, timeout + 10);

This will cause drift of the interval. Instead what you should do, 
is calculate the wait time at the end of every iteration based on 
the current time and the interval.
[JZ] I am wondering what degree of accuracy is requested  on 
interval, there is HW time stamp with each pc sampling data packet,


+    }
+}
+
  static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
                      struct kfd_ioctl_pc_sample_args __user 
*user_args)
  {
@@ -101,6 +138,7 @@ static int kfd_pc_sample_start(struct 
kfd_process_device *pdd,
          } else {
kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
+ 
schedule_work(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);

Scheduling a worker that runs indefinitely on the system workqueue 
is probably a bad idea. It could block other work items 
indefinitely. I think you are misusing the work queue API here. What 
you really want is probably, to crease a kernel thread.
[JZ] Yes, you are right. How about use  alloc_workqueue to create 
queue instead of system queue, is alloc_workqueue more efficient than 
kernel thread creation?

A work queue can create many kernel threads to handle the execution of 
work items. You really only need a single kernel thread per GPU for 
time-based PC sampling. IMO the work queue just adds a bunch of 
overhead. Using a work queue for something that runs indefinitely 
feels like an abuse of the API. I don't have much experience with 
creating kernel threads directly. See include/linux/kthread.h. If you 
want to look for an example, it seems drivers/gpu/drm/scheduler uses 
the kthread API.
[JZ] then let me switch to kthread

Regards,
  Felix



Regards,
  Felix


              break;
          }
      }
@@ -123,6 +161,7 @@ static int kfd_pc_sample_stop(struct 
kfd_process_device *pdd,
      mutex_unlock(&pdd->dev->pcs_data.mutex);
        if (pc_sampling_stop) {
+ 
cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
index 4eeded4ea5b6..cb93909e6bd3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
@@ -30,5 +30,6 @@
    int kfd_pc_sample(struct kfd_process_device *pdd,
                      struct kfd_ioctl_pc_sample_args __user *args);
+void kfd_pc_sample_handler(struct work_struct *work);
    #endif /* KFD_PC_SAMPLING_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index badaa4d68cc4..b7062033fda4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -263,6 +263,7 @@ struct kfd_dev_pc_sampling_data {
      uint32_t target_wave_slot;  /* target wave slot for trap */
      bool stop_enable;           /* pc sampling stop in process */
      struct idr pc_sampling_idr;
+    struct work_struct pc_sampling_work;
      struct kfd_pc_sample_info pc_sample_info;
  };