Yes, you are right. It was not called by it. OK I have made a version which does that with some atomic counters. Please read later in the diff.
I actually tried that earlier, and it did not work. Application still keeps running, and you have to send a kill to the user process.
I have made the following version. It waits for processes to terminate synchronously after sending SIGBUS. After that it does the real work of amdgpu_pci_remove.
However, it hangs at amdgpu_device_ip_fini_early when it is trying to deinit ip_block 6 <sdma_v4_0> (https://gitlab.freedesktop.org/agd5f/linux/-/blob/amd-staging-drm-next/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c#L2818).
I assume that there are still some inflight dma, therefore fini of this ip block thus hangs?
The following is an excerpt of the dmesg: please excuse for putting my own pr_info, but I hope you get my point of where it hangs.
[ 392.344735] amdgpu: all processes has been fully released
[ 392.346557] amdgpu: amdgpu_acpi_fini done
[ 392.346568] amdgpu 0000:b3:00.0: amdgpu: amdgpu: finishing device.
[ 392.349238] amdgpu: amdgpu_device_ip_fini_early enter ip_blocks = 9
[ 392.349248] amdgpu: Free mem_obj = 000000007bf54275, range_start = 14, range_end = 14
[ 392.350299] amdgpu: Free mem_obj = 00000000a85bc878, range_start = 12, range_end = 12
[ 392.350304] amdgpu: Free mem_obj = 00000000b8019e32, range_start = 13, range_end = 13
[ 392.350308] amdgpu: Free mem_obj = 000000002d296168, range_start = 4, range_end = 11
[ 392.350313] amdgpu: Free mem_obj = 000000001fc4f934, range_start = 0, range_end = 3
[ 392.350322] amdgpu: amdgpu_amdkfd_suspend(adev, false) done
[ 392.350672] amdgpu: hw_fini of IP block[8] <jpeg_v2_5> done 0
[ 392.350679] amdgpu: hw_fini of IP block[7] <vcn_v2_5> done 0
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 8fa9b86ac9d2..c0b27f722281 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -188,6 +188,12 @@ void amdgpu_amdkfd_interrupt(struct amdgpu_device *adev,
kgd2kfd_interrupt(adev->kfd.dev, ih_ring_entry);
}
+void amdgpu_amdkfd_kill_all_processes(struct amdgpu_device *adev)
+{
+ if (adev->kfd.dev)
+ kgd2kfd_kill_all_user_processes(adev->kfd.dev);
+}
+
void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm)
{
if (adev->kfd.dev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 27c74fcec455..f4e485d60442 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -141,6 +141,7 @@ struct amdkfd_process_info {
int amdgpu_amdkfd_init(void);
void amdgpu_amdkfd_fini(void);
+void amdgpu_amdkfd_kill_all_processes(struct amdgpu_device *adev);
void amdgpu_amdkfd_suspend(struct amdgpu_device *adev, bool run_pm);
int amdgpu_amdkfd_resume_iommu(struct amdgpu_device *adev);
int amdgpu_amdkfd_resume(struct amdgpu_device *adev, bool run_pm, bool sync);
@@ -405,6 +406,7 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd,
const struct kgd2kfd_shared_resources *gpu_resources);
void kgd2kfd_device_exit(struct kfd_dev *kfd);
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force);
+void kgd2kfd_kill_all_user_processes(struct kfd_dev *kfd);
int kgd2kfd_resume_iommu(struct kfd_dev *kfd);
int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm, bool sync);
int kgd2kfd_pre_reset(struct kfd_dev *kfd);
@@ -443,6 +445,9 @@ static inline void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force)
{
}
+void kgd2kfd_kill_all_user_processes(struct kfd_dev *kfd){
+}
+
static int __maybe_unused kgd2kfd_resume_iommu(struct kfd_dev *kfd)
{
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 3d5fc0751829..af6fe5080cfa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2101,6 +2101,9 @@ amdgpu_pci_remove(struct pci_dev *pdev)
{
struct drm_device *dev = pci_get_drvdata(pdev);
+ /* kill all kfd processes before drm_dev_unplug */
+ amdgpu_amdkfd_kill_all_processes(drm_to_adev(dev));
+
#ifdef HAVE_DRM_DEV_UNPLUG
drm_dev_unplug(dev);
#else
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 5504a18b5a45..480c23bef5e2 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -691,6 +691,12 @@ bool kfd_is_locked(void)
return (atomic_read(&kfd_locked) > 0);
}
+inline void kgd2kfd_kill_all_user_processes(struct kfd_dev* dev)
+{
+ kfd_kill_all_user_processes();
+}
+
+
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force)
{
if (!kfd->init_complete)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 55c9e1922714..a35a2cb5bb9f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1064,6 +1064,7 @@ static inline struct kfd_process_device *kfd_process_device_from_gpuidx(
void kfd_unref_process(struct kfd_process *p);
int kfd_process_evict_queues(struct kfd_process *p, bool force);
int kfd_process_restore_queues(struct kfd_process *p);
+void kfd_kill_all_user_processes(void);
void kfd_suspend_all_processes(bool force);
/*
* kfd_resume_all_processes:
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 6cdc855abb6d..17e769e6951d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -46,6 +46,9 @@ struct mm_struct;
#include "kfd_trace.h"
#include "kfd_debug.h"
+static atomic_t kfd_process_locked = ATOMIC_INIT(0);
+static atomic_t kfd_inflight_kills = ATOMIC_INIT(0);
+
/*
* List of struct kfd_process (field kfd_process).
* Unique/indexed by mm_struct*
@@ -802,6 +805,9 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
struct kfd_process *process;
int ret;
+ if ( atomic_read(&kfd_process_locked) > 0 )
+ return ERR_PTR(-EINVAL);
+
if (!(thread->mm && mmget_not_zero(thread->mm)))
return ERR_PTR(-EINVAL);
@@ -1126,6 +1132,10 @@ static void kfd_process_wq_release(struct work_struct *work)
put_task_struct(p->lead_thread);
kfree(p);
+
+ if ( atomic_read(&kfd_process_locked) > 0 ){
+ atomic_dec(&kfd_inflight_kills);
+ }
}
static void kfd_process_ref_release(struct kref *ref)
@@ -2186,6 +2196,35 @@ static void restore_process_worker(struct work_struct *work)
pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
}
+void kfd_kill_all_user_processes(void)
+{
+ struct kfd_process *p;
+ /* struct amdkfd_process_info *p_info; */
+ unsigned int temp;
+ int idx;
+ atomic_inc(&kfd_process_locked);
+
+ idx = srcu_read_lock(&kfd_processes_srcu);
+ pr_info("Killing all processes\n");
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ dev_warn(kfd_device,
+ "Sending SIGBUS to process %d (pasid 0x%x)",
+ p->lead_thread->pid, p->pasid);
+ send_sig(SIGBUS, p->lead_thread, 0);
+ atomic_inc(&kfd_inflight_kills);
+ }
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+
+ while ( atomic_read(&kfd_inflight_kills) > 0 ){
+ dev_warn(kfd_device,
+ "kfd_processes_table is not empty, going to sleep for 10ms\n");
+ msleep(10);
+ }
+
+ atomic_dec(&kfd_process_locked);
+ pr_info("all processes has been fully released");
+}
+
void kfd_suspend_all_processes(bool force)
{
struct kfd_process *p;
Regards,
Shuotao
|