[ 711.769977] WARNING: CPU: 23 PID: 344 at .../amdgpu-rocm5.0.2/src/amd/amdgpu/amdgpu_object.c:1336 amdgpu_bo_release_notify+0x150/0x160 [amdgpu]
[ 711.770528] Modules linked in: amdgpu(OE) amdttm(OE) amd_sched(OE) amdkcl(OE) iommu_v2 nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo xt_addrtype br_netfilter xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat nf_nat xt_conntrack nf_conntrack
nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter overlay binfmt_misc intel_rapl_msr i40iw intel_rapl_common skx_edac nfit x86_pkg_temp_thermal intel_powerclamp
coretemp kvm_intel rpcrdma kvm sunrpc ipmi_ssif ib_umad ib_ipoib rdma_ucm irqbypass rapl joydev acpi_ipmi input_leds intel_cstate ipmi_si ipmi_devintf mei_me mei intel_pch_thermal ipmi_msghandler ioatdma mac_hid lpc_ich dca acpi_power_meter acpi_pad sch_fq_codel
ib_iser rdma_cm iw_cm ib_cm iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi pci_stub ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor
[ 711.779359] raid6_pq libcrc32c raid1 raid0 multipath linear mlx5_ib ib_uverbs ib_core ast drm_vram_helper i2c_algo_bit drm_ttm_helper ttm drm_kms_helper syscopyarea crct10dif_pclmul crc32_pclmul ghash_clmulni_intel sysfillrect uas hid_generic sysimgblt
aesni_intel mlx5_core fb_sys_fops crypto_simd usbhid cryptd drm i40e pci_hyperv_intf usb_storage glue_helper mlxfw hid ahci libahci wmi
[ 711.779752] CPU: 23 PID: 344 Comm: kworker/23:1 Tainted: G W OE 5.11.0+ #1
[ 711.779755] Hardware name: Supermicro SYS-4029GP-TRT2/X11DPG-OT-CPU, BIOS 2.1 08/14/2018
[ 711.779756] Workqueue: kfd_process_wq kfd_process_wq_release [amdgpu]
[ 711.779955] RIP: 0010:amdgpu_bo_release_notify+0x150/0x160 [amdgpu]
[ 711.780141] Code: e8 b5 af 34 f4 e9 1f ff ff ff 48 39 c2 74 07 0f 0b e9 69 ff ff ff 4c 89 e7 e8 3c b4 16 00 e9 5c ff ff ff e8 a2 ce fd f3 eb cf <0f> 0b eb cb e8 d7 02 34 f4 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55
[ 711.780143] RSP: 0018:ffffa8100dd67c30 EFLAGS: 00010282
[ 711.780145] RAX: 00000000ffffffea RBX: ffff89980e792058 RCX: 0000000000000000
[ 711.780147] RDX: 0000000000000000 RSI: ffff89a8f9ad8870 RDI: ffff89a8f9ad8870
[ 711.780148] RBP: ffffa8100dd67c50 R08: 0000000000000000 R09: fffffffffff99b18
[ 711.780149] R10: ffffa8100dd67bd0 R11: ffffa8100dd67908 R12: ffff89980e792000
[ 711.780151] R13: ffff89980e792058 R14: ffff89980e7921bc R15: dead000000000100
[ 711.780152] FS: 0000000000000000(0000) GS:ffff89a8f9ac0000(0000) knlGS:0000000000000000
[ 711.780154] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 711.780156] CR2: 00007ffddac6f71f CR3: 00000030bb80a003 CR4: 00000000007706e0
[ 711.780157] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 711.780159] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 711.780160] PKRU: 55555554
[ 711.780161] Call Trace:
[ 711.780163] ttm_bo_release+0x2ae/0x320 [amdttm]
[ 711.780169] amdttm_bo_put+0x30/0x40 [amdttm]
[ 711.780357] amdgpu_bo_unref+0x1e/0x30 [amdgpu]
[ 711.780543] amdgpu_gem_object_free+0x8b/0x160 [amdgpu]
[ 711.781119] drm_gem_object_free+0x1d/0x30 [drm]
[ 711.781489] amdgpu_amdkfd_gpuvm_free_memory_of_gpu+0x34a/0x380 [amdgpu]
[ 711.782044] kfd_process_device_free_bos+0xe0/0x130 [amdgpu]
[ 711.782611] kfd_process_wq_release+0x286/0x380 [amdgpu]
[ 711.783172] process_one_work+0x236/0x420
[ 711.783543] worker_thread+0x34/0x400
[ 711.783911] ? process_one_work+0x420/0x420
[ 711.784279] kthread+0x126/0x140
[ 711.784653] ? kthread_park+0x90/0x90
[ 711.785018] ret_from_fork+0x22/0x30
[ 711.785387] ---[ end trace d8f50f6594817c84 ]---
[ 711.798716] [drm] amdgpu: ttm finalized
[186885.764079] amdgpu 0000:43:00.0: amdgpu: amdgpu: finishing device.
[186885.766916] [drm] free PSP TMR buffer
[186893.868173] amdgpu: amdgpu_vm_bo_update failed
[186893.868235] amdgpu: validate_invalid_user_pages: update PTE failed
[186893.876154] amdgpu: amdgpu_vm_bo_update failed
[186893.876190] amdgpu: validate_invalid_user_pages: update PTE failed
[186893.884150] amdgpu: amdgpu_vm_bo_update failed
[186893.884185] amdgpu: validate_invalid_user_pages: update PTE failed
I tried something like *kfd_process_ref_release* which I think did what you described, but it did not work. Instead I tried to kill the process from the kernel, but the amdgpu could **only** be hot-plugged in back successfully only if there was no rocm kernel running when it was plugged out. If not, amdgpu_probe will just hang later. (Maybe because amdgpu was
plugged out while running state, it leaves a bad HW state which causes probe to hang).
I don’t know if this is a viable solution worth pursuing, but I attached the diff anyway.
Another solution could be let compute stack user mode detect a topology change via generation_count change, and abort gracefully there.
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 4e7d9cb09a69..79b4c9b84cd0 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -697,12 +697,15 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm, bool force)
return;
/* for runtime suspend, skip locking kfd */
- if (!run_pm) {
+ if (!run_pm && !drm_dev_is_unplugged(kfd->ddev)) {
/* For first KFD device suspend all the KFD processes */
if (atomic_inc_return(&kfd_locked) == 1)
kfd_suspend_all_processes(force);
}
+ if (drm_dev_is_unplugged(kfd->ddev))
+ kfd_kill_all_user_processes();
+
kfd->dqm->ops.stop(kfd->dqm);
kfd_iommu_suspend(kfd);
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 55c9e1922714..84cbcd857856 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1065,6 +1065,7 @@ void kfd_unref_process(struct kfd_process *p);
int kfd_process_evict_queues(struct kfd_process *p, bool force);
int kfd_process_restore_queues(struct kfd_process *p);
void kfd_suspend_all_processes(bool force);
+void kfd_kill_all_user_processes(void);
/*
* kfd_resume_all_processes:
* bool sync: If kfd_resume_all_processes() should wait for the
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 6cdc855abb6d..fb0c753b682c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -2206,6 +2206,24 @@ void kfd_suspend_all_processes(bool force)
srcu_read_unlock(&kfd_processes_srcu, idx);
}
+
+void kfd_kill_all_user_processes(void)
+{
+ struct kfd_process *p;
+ struct amdkfd_process_info *p_info;
+ unsigned int temp;
+ int idx = srcu_read_lock(&kfd_processes_srcu);
+
+ pr_info("Killing all processes\n");
+ hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
+ p_info = p->kgd_process_info;
+ pr_info("Killing processes, pid = %d", pid_nr(p_info->pid));
+ kill_pid(p_info->pid, SIGBUS, 1);
+ }
+ srcu_read_unlock(&kfd_processes_srcu, idx);
+}
+
+
int kfd_resume_all_processes(bool sync)
{
struct kfd_process *p;
|