Yes, I only had one Radeon VII in my system, so this 4th test should have been skipped. I am ignoring this issue.
I finally got some time to continue working on the kfd hotplug patch.
The following patch seems to work for kfd hotplug on Radeon VII. After hot-unplug, the tf process exits because of a VM fault. A new tf process runs without issues after the device is plugged back in.
It has the following fixes.
Please take a look and let me know if it is acceptable.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 1f8161cd507f..2f7858692067 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -33,6 +33,7 @@
#include <uapi/linux/kfd_ioctl.h>
#include "amdgpu_ras.h"
#include "amdgpu_umc.h"
+#include <drm/drm_drv.h>
/* Total memory size in system memory and all GPU VRAM. Used to
* estimate worst case amount of memory to reserve for page tables
@@ -681,9 +682,10 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, bool idle)
{
- amdgpu_dpm_switch_power_profile(adev,
- PP_SMC_POWER_PROFILE_COMPUTE,
- !idle);
+ if (!drm_dev_is_unplugged(adev_to_drm(adev)))
+ amdgpu_dpm_switch_power_profile(adev,
+ PP_SMC_POWER_PROFILE_COMPUTE,
+ !idle);
}
bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 vmid)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
index 4b153daf283d..fb4c9e55eace 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c
@@ -46,6 +46,7 @@
#include <linux/firmware.h>
#include <linux/module.h>
#include <drm/drm.h>
+#include <drm/drm_drv.h>
#include "amdgpu.h"
#include "amdgpu_amdkfd.h"
@@ -104,6 +105,9 @@ static bool amdgpu_mn_invalidate_hsa(struct mmu_interval_notifier *mni,
struct amdgpu_bo *bo = container_of(mni, struct amdgpu_bo, notifier);
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+ if (drm_dev_is_unplugged(adev_to_drm(adev)))
+ return true;
+
if (!mmu_notifier_range_blockable(range))
return false;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index cac56f830aed..fbbaaabf3a67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1509,7 +1509,6 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
}
}
- amdgpu_ras_sysfs_remove_all(adev);
return 0;
}
/* ras fs end */
@@ -2557,8 +2556,6 @@ void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
if (!ras_block)
return;
- amdgpu_ras_sysfs_remove(adev, ras_block);
-
ras_obj = container_of(ras_block, struct amdgpu_ras_block_object, ras_comm);
if (ras_obj->ras_cb)
amdgpu_ras_interrupt_remove_handler(adev, ras_block);
@@ -2659,6 +2656,7 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
/* Need disable ras on all IPs here before ip [hw/sw]fini */
amdgpu_ras_disable_all_features(adev, 0);
amdgpu_ras_recovery_fini(adev);
+ amdgpu_ras_sysfs_remove_all(adev);
return 0;
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index f1a225a20719..4b789bec9670 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -714,16 +714,37 @@ bool kfd_is_locked(void)
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
{
+ struct kfd_process *p;
+ struct amdkfd_process_info *p_info;
+ unsigned int temp;
+
if (!kfd->init_complete)
return;
/* for runtime suspend, skip locking kfd */
- if (!run_pm) {
+ if (!run_pm && !drm_dev_is_unplugged(kfd->ddev)) {
/* For first KFD device suspend all the KFD processes */
if (atomic_inc_return(&kfd_locked) == 1)
kfd_suspend_all_processes();
}
+ if (drm_dev_is_unplugged(kfd->ddev)){
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+ pr_debug("cancel restore_userptr_work\n");
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ if (kfd_process_gpuidx_from_gpuid(p, kfd->id) >= 0) {
+ p_info = p->kgd_process_info;
+ pr_debug("cancel processes, pid = %d for gpu_id = %d", pid_nr(p_info->pid), kfd->id);
+ cancel_delayed_work_sync(&p_info->restore_userptr_work);
+
+ /* send exception signals to the kfd events waiting in user space */
+ kfd_signal_hw_exception_event(p->pasid);
+ kfd_signal_vm_fault_event(kfd, p->pasid, NULL);
+ }
+ }
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+ }
+
kfd->dqm->ops.stop(kfd->dqm);
kfd_iommu_suspend(kfd);
}
Regards,
Shuotao
|