Re: [PATCH 1/3] amd/amdkfd: add a function to wait no process running in kfd

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 3/25/2024 10:18 AM, Zhigang Luo wrote:
Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.


Signed-off-by: Zhigang Luo <Zhigang.Luo@xxxxxxx>
Change-Id: I2a98d513c26107ac76ecf20e951c188afbc7ede6
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 20 ++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +++++++++-
 drivers/gpu/drm/amd/amdkfd/kfd_device.c    | 11 +++++++++++
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index d5fde8adf19b..e02bfcec608b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -261,6 +261,26 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm)
        return r;
 }

+int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev)
+{
+       unsigned long end_jiffies;
+
+       if (!adev->kfd.dev)
+               return 0;
+
+       end_jiffies = msecs_to_jiffies(AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS) + jiffies;
+       while (!kgd2kfd_is_processes_table_empty(adev->kfd.dev)) {
+               if (time_after(jiffies, end_jiffies)) {
+                       dev_err(adev->dev, "wait no process running timeout\n");
+
+                       return -ETIME;
+               }
+               schedule();
+       }
+
+       return 0;
+}
+
 int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev)
 {
        int r = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index caee36e52a09..d46dccc5bbf7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -38,6 +38,8 @@
 #include "amdgpu_vm.h"
 #include "amdgpu_xcp.h"

+#define AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS 10000
+
 extern uint64_t amdgpu_amdkfd_total_mem_size;

 enum TLB_FLUSH_TYPE {
@@ -169,7 +171,7 @@ void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle);
 bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);

 bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid);
-
+int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev);
 int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev);

 int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev);
@@ -411,6 +413,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
 void kgd2kfd_device_exit(struct kfd_dev *kfd);
 void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm);
 int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm);
+bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd);
 int kgd2kfd_pre_reset(struct kfd_dev *kfd);
 int kgd2kfd_post_reset(struct kfd_dev *kfd);
 void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
@@ -454,6 +457,11 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm)
        return 0;
 }

+static inline bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd)
+{
+       return true;
+}
+
 static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 {
        return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 041ec3de55e7..2bec79e0c721 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -894,6 +894,17 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd)
        kfree(kfd);
 }

+bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd)
+{
+       bool is_empty;
+
+       mutex_lock(&kfd_processes_mutex);
+       is_empty = hash_empty(kfd_processes_table);
+       mutex_unlock(&kfd_processes_mutex);
+
+       return is_empty;
+}
+

The hash table kfd_processes_table being empty does not mean that all kfd processes have been terminated. A kfd process is terminated through a dedicated workqueue, kfd_process_wq, which runs asynchronously from kfd_processes_table becoming empty. The resources and data structures that kfd processes use may therefore still not have been released even though kfd_processes_table is empty.

I think a reliable way to know that all kfd processes have been terminated is to check the kobject under /sys: /sys/class/kfd/kfd/proc. When this directory is empty, we know that either there are no kfd processes or all kfd processes have been terminated.

Regards

Xiaogang

 int kgd2kfd_pre_reset(struct kfd_dev *kfd)
 {
        struct kfd_node *node;
--
2.25.1


[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux