Re: [PATCH 20/24] drm/amdkfd: enable pc sampling work to trigger trap

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 2023-11-23 14:08, Felix Kuehling wrote:
On 2023-11-23 13:27, James Zhu wrote:

On 2023-11-22 17:31, Felix Kuehling wrote:

On 2023-11-03 09:11, James Zhu wrote:
Enable a delay work to trigger pc sampling trap.

Signed-off-by: James Zhu <James.Zhu@xxxxxxx>
---
  drivers/gpu/drm/amd/amdkfd/kfd_device.c      |  3 ++
  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 39 ++++++++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h |  1 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h        |  1 +
  4 files changed, 44 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index bcaeedac8fe0..fb21902e433a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -35,6 +35,7 @@
  #include "kfd_migrate.h"
  #include "amdgpu.h"
  #include "amdgpu_xcp.h"
+#include "kfd_pc_sampling.h"
    #define MQD_SIZE_ALIGNED 768
  @@ -537,6 +538,8 @@ static void kfd_pc_sampling_init(struct kfd_node *dev)
  {
      mutex_init(&dev->pcs_data.mutex);
idr_init_base(&dev->pcs_data.hosttrap_entry.base.pc_sampling_idr, 1);
+ INIT_WORK(&dev->pcs_data.hosttrap_entry.base.pc_sampling_work,
+        kfd_pc_sample_handler);
  }
    static void kfd_pc_sampling_exit(struct kfd_node *dev)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index 2c4ac5b4cc4b..e8f0559b618e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -38,6 +38,43 @@ struct supported_pc_sample_info supported_formats[] = {
      { IP_VERSION(9, 4, 2), &sample_info_hosttrap_9_0_0 },
  };
  +void kfd_pc_sample_handler(struct work_struct *work)
+{
+    struct amdgpu_device *adev;
+    struct kfd_node *node;
+    uint32_t timeout = 0;
+
+    node = container_of(work, struct kfd_node,
+ pcs_data.hosttrap_entry.base.pc_sampling_work);
+
+    mutex_lock(&node->pcs_data.mutex);
+    if (node->pcs_data.hosttrap_entry.base.active_count &&
+ node->pcs_data.hosttrap_entry.base.pc_sample_info.value &&
+        node->kfd2kgd->trigger_pc_sample_trap) {
+        switch (node->pcs_data.hosttrap_entry.base.pc_sample_info.type) {
+        case KFD_IOCTL_PCS_TYPE_TIME_US:
+            timeout = (uint32_t)node->pcs_data.hosttrap_entry.base.pc_sample_info.value;
+            break;
+        default:
+            pr_debug("PC Sampling type %d not supported.",
+ node->pcs_data.hosttrap_entry.base.pc_sample_info.type);
+        }
+    }
+    mutex_unlock(&node->pcs_data.mutex);
+    if (!timeout)
+        return;
+
+    adev = node->adev;
+    while (!READ_ONCE(node->pcs_data.hosttrap_entry.base.stop_enable)) {

This worker basically runs indefinitely (controlled by user mode).

+ node->kfd2kgd->trigger_pc_sample_trap(adev, node->vm_info.last_vmid_kfd,
+ &node->pcs_data.hosttrap_entry.base.target_simd,
+ &node->pcs_data.hosttrap_entry.base.target_wave_slot,
+ node->pcs_data.hosttrap_entry.base.pc_sample_info.method);
+        pr_debug_ratelimited("triggered a host trap.");
+        usleep_range(timeout, timeout + 10);

This will cause drift of the interval. Instead what you should do, is calculate the wait time at the end of every iteration based on the current time and the interval.
[JZ] I am wondering what degree of accuracy is requested  on interval, there is HW time stamp with each pc sampling data packet,


+    }
+}
+
  static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
                      struct kfd_ioctl_pc_sample_args __user *user_args)
  {
@@ -101,6 +138,7 @@ static int kfd_pc_sample_start(struct kfd_process_device *pdd,
          } else {
kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, true);
+ schedule_work(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);

Scheduling a worker that runs indefinitely on the system workqueue is probably a bad idea. It could block other work items indefinitely. I think you are misusing the work queue API here. What you really want is probably, to crease a kernel thread.
[JZ] Yes, you are right. How about use  alloc_workqueue to create queue instead of system queue, is alloc_workqueue more efficient than kernel thread creation?

A work queue can create many kernel threads to handle the execution of work items. You really only need a single kernel thread per GPU for time-based PC sampling. IMO the work queue just adds a bunch of overhead. Using a work queue for something that runs indefinitely feels like an abuse of the API. I don't have much experience with creating kernel threads directly. See include/linux/kthread.h. If you want to look for an example, it seems drivers/gpu/drm/scheduler uses the kthread API.
[JZ] then let me switch to kthread

Regards,
  Felix



Regards,
  Felix


              break;
          }
      }
@@ -123,6 +161,7 @@ static int kfd_pc_sample_stop(struct kfd_process_device *pdd,
      mutex_unlock(&pdd->dev->pcs_data.mutex);
        if (pc_sampling_stop) {
+ cancel_work_sync(&pdd->dev->pcs_data.hosttrap_entry.base.pc_sampling_work);
kfd_process_set_trap_pc_sampling_flag(&pdd->qpd,
pdd->dev->pcs_data.hosttrap_entry.base.pc_sample_info.method, false);
  diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
index 4eeded4ea5b6..cb93909e6bd3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.h
@@ -30,5 +30,6 @@
    int kfd_pc_sample(struct kfd_process_device *pdd,
                      struct kfd_ioctl_pc_sample_args __user *args);
+void kfd_pc_sample_handler(struct work_struct *work);
    #endif /* KFD_PC_SAMPLING_H_ */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index badaa4d68cc4..b7062033fda4 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -263,6 +263,7 @@ struct kfd_dev_pc_sampling_data {
      uint32_t target_wave_slot;  /* target wave slot for trap */
      bool stop_enable;           /* pc sampling stop in process */
      struct idr pc_sampling_idr;
+    struct work_struct pc_sampling_work;
      struct kfd_pc_sample_info pc_sample_info;
  };



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux