On 3/25/2024 10:18 AM, Zhigang Luo
wrote:
Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding. Signed-off-by: Zhigang Luo <Zhigang.Luo@xxxxxxx> Change-Id: I2a98d513c26107ac76ecf20e951c188afbc7ede6 --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 20 ++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 10 +++++++++- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 11 +++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index d5fde8adf19b..e02bfcec608b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -261,6 +261,26 @@ int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm) return r; } +int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev) +{ + unsigned long end_jiffies; + + if (!adev->kfd.dev) + return 0; + + end_jiffies = msecs_to_jiffies(AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS) + jiffies; + while (!kgd2kfd_is_processes_table_empty(adev->kfd.dev)) { + if (time_after(jiffies, end_jiffies)) { + dev_err(adev->dev, "wait no process running timeout\n"); + + return -ETIME; + } + schedule(); + } + + return 0; +} + int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev) { int r = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index caee36e52a09..d46dccc5bbf7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -38,6 +38,8 @@ #include "amdgpu_vm.h" #include "amdgpu_xcp.h" +#define AMDKFD_WAIT_NO_PROCESS_RUNNING_TIMEOUT_MS 10000 + extern uint64_t amdgpu_amdkfd_total_mem_size; enum TLB_FLUSH_TYPE { @@ -169,7 +171,7 @@ void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle); bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev); bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid); - 
+int amdgpu_amdkfd_wait_no_process_running(struct amdgpu_device *adev); int amdgpu_amdkfd_pre_reset(struct amdgpu_device *adev); int amdgpu_amdkfd_post_reset(struct amdgpu_device *adev); @@ -411,6 +413,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, void kgd2kfd_device_exit(struct kfd_dev *kfd); void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm); int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm); +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd); int kgd2kfd_pre_reset(struct kfd_dev *kfd); int kgd2kfd_post_reset(struct kfd_dev *kfd); void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry); @@ -454,6 +457,11 @@ static inline int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) return 0; } +static inline bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd) +{ + return true; +} + static inline int kgd2kfd_pre_reset(struct kfd_dev *kfd) { return 0; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 041ec3de55e7..2bec79e0c721 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -894,6 +894,17 @@ void kgd2kfd_device_exit(struct kfd_dev *kfd) kfree(kfd); } +bool kgd2kfd_is_processes_table_empty(struct kfd_dev *kfd) +{ + bool is_empty; + + mutex_lock(&kfd_processes_mutex); + is_empty = hash_empty(kfd_processes_table); + mutex_unlock(&kfd_processes_mutex); + + return is_empty; +} +
The hash table kfd_processes_table
being empty does not mean that all kfd processes have been terminated.
A kfd process is torn down through a dedicated workqueue,
kfd_process_wq, which runs asynchronously with respect to
kfd_processes_table becoming empty. The resources and data structures
that kfd processes use may still not have been released even though
kfd_processes_table is empty.
I think a more reliable way to know that all kfd processes have been
terminated is to check the kobjects under
/sys: /sys/class/kfd/kfd/proc. When this directory is empty, we
know that either there are no kfd processes or all kfd processes have
been terminated.
Regards
Xiaogang
int kgd2kfd_pre_reset(struct kfd_dev *kfd) { struct kfd_node *node; -- 2.25.1