Am 06.03.23 um 22:58 schrieb David Belanger:
Handle case when module is unloaded (kfd_exit) before a process space
(mm_struct) is released.
Well that should never ever happen in the first place. It sounds like we
are missing grabbing module references.
Regards,
Christian.
Signed-off-by: David Belanger <david.belanger@xxxxxxx>
---
drivers/gpu/drm/amd/amdkfd/kfd_module.c | 4 ++
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 57 ++++++++++++++++++++++++
2 files changed, 61 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_module.c b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
index 09b966dc3768..8ef4bd9e4f7d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_module.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_module.c
@@ -26,6 +26,9 @@
#include "kfd_priv.h"
#include "amdgpu_amdkfd.h"
+void kfd_cleanup_processes(void);
+
+
static int kfd_init(void)
{
int err;
@@ -77,6 +80,7 @@ static int kfd_init(void)
static void kfd_exit(void)
{
+ kfd_cleanup_processes();
kfd_debugfs_fini();
kfd_process_destroy_wq();
kfd_procfs_shutdown();
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index ebabe92f7edb..b5b28a32639d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1181,6 +1181,17 @@ static void kfd_process_notifier_release(struct mmu_notifier *mn,
return;
mutex_lock(&kfd_processes_mutex);
+ /*
+ * Do early return if p is not in the table.
+ *
+ * This could potentially happen if this function is called concurrently
+ * by mmu_notifier and by kfd_cleanup_pocesses.
+ *
+ */
+ if (!hash_hashed(&p->kfd_processes)) {
+ mutex_unlock(&kfd_processes_mutex);
+ return;
+ }
hash_del_rcu(&p->kfd_processes);
mutex_unlock(&kfd_processes_mutex);
synchronize_srcu(&kfd_processes_srcu);
@@ -1200,6 +1211,52 @@ static const struct mmu_notifier_ops kfd_process_mmu_notifier_ops = {
.free_notifier = kfd_process_free_notifier,
};
+
+void kfd_cleanup_processes(void)
+{
+ struct kfd_process *p;
+ unsigned int temp;
+
+ /*
+ * Iterate over remaining processes in table, calling notifier release
+ * to free up notifier and process resources.
+ *
+ * This code handles the case when driver is unloaded before all mm_struct
+ * are released.
+ */
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ if (p) {
+ /*
+ * Obtain a reference on p to avoid a late mmu_notifier release
+ * call triggering freeing the process.
+ */
+
+ kref_get(&p->ref);
+
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+
+ kfd_process_notifier_release(&p->mmu_notifier, p->mm);
+
+ kfd_unref_process(p);
+
+ idx = srcu_read_lock(&kfd_processes_srcu);
+ }
+ }
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+
+ /*
+ * Must be called after all mmu_notifier_put are done and before
+ * kfd_process_wq is released.
+ *
+ * Ensures that all outstanding free_notifier gets called, triggering the release
+ * of the process.
+ */
+ mmu_notifier_synchronize();
+}
+
+
static int kfd_process_init_cwsr_apu(struct kfd_process *p, struct file *filep)
{
unsigned long offset;