Am 19.12.24 um 11:38 schrieb Arunpravin Paneer Selvam:
Fix the wait IOCTL lockup issue.
v2:(Christian)
- The problem is that you need to lock all BOs under a single
drm_exec_until_all_locked() loop. Otherwise not all BOs
would be locked again on contention.
v3: Locking vm pd when accessing the GEM BO's solves the lockup
issues.
[Dec 6 17:53] watchdog: BUG: soft lockup - CPU#4 stuck for 26s! [Xwayland:cs0:2634]
[ +0.000002] RIP: 0010:amdgpu_userq_wait_ioctl+0x3ce/0xfe0 [amdgpu]
[ +0.000003] RSP: 0018:ffffada901f4fc38 EFLAGS: 00000202
[ +0.000003] RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001
[ +0.000001] RDX: ffff9fc28b974048 RSI: 0000000000000010 RDI: ffffada901f4fce8
[ +0.000002] RBP: ffffada901f4fd58 R08: ffff9fc28b974148 R09: 0000000000000000
[ +0.000002] R10: ffffada901f4fbf0 R11: 0000000000000001 R12: ffff9fc28ed1ac00
[ +0.000002] R13: 0000000000000000 R14: 0000000000000010 R15: ffffada901f4fe00
[ +0.000002] FS: 00007f3a00a00640(0000) GS:ffff9fc99e800000(0000) knlGS:0000000000000000
[ +0.000002] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ +0.000002] CR2: 00007f8e92137020 CR3: 000000013380e000 CR4: 0000000000350ef0
[ +0.000002] DR0: ffffffff90921610 DR1: ffffffff90921611 DR2: ffffffff90921612
[ +0.000001] DR3: ffffffff90921613 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ +0.000002] Call Trace:
[ +0.000002] <IRQ>
[ +0.000004] ? show_regs+0x69/0x80
[ +0.000005] ? watchdog_timer_fn+0x271/0x300
[ +0.000005] ? __pfx_watchdog_timer_fn+0x10/0x10
[ +0.000003] ? __hrtimer_run_queues+0x114/0x2a0
[ +0.000006] ? hrtimer_interrupt+0x110/0x240
[ +0.000005] ? __sysvec_apic_timer_interrupt+0x73/0x180
[ +0.000004] ? sysvec_apic_timer_interrupt+0xaa/0xd0
[ +0.000004] </IRQ>
[ +0.000002] <TASK>
[ +0.000002] ? asm_sysvec_apic_timer_interrupt+0x1b/0x20
[ +0.000006] ? amdgpu_userq_wait_ioctl+0x3ce/0xfe0 [amdgpu]
[ +0.000162] ? amdgpu_userq_wait_ioctl+0x3cc/0xfe0 [amdgpu]
[ +0.000156] ? finish_task_switch.isra.0+0x94/0x290
[ +0.000010] ? __pfx_amdgpu_userq_wait_ioctl+0x10/0x10 [amdgpu]
[ +0.000161] drm_ioctl_kernel+0xaa/0x100 [drm]
[ +0.000025] drm_ioctl+0x29d/0x500 [drm]
[ +0.000017] ? __pfx_amdgpu_userq_wait_ioctl+0x10/0x10 [amdgpu]
[ +0.000160] ? srso_return_thunk+0x5/0x5f
[ +0.000004] ? srso_return_thunk+0x5/0x5f
[ +0.000003] ? _raw_spin_unlock_irqrestore+0x27/0x50
[ +0.000004] amdgpu_drm_ioctl+0x4b/0x80 [amdgpu]
[ +0.000141] __x64_sys_ioctl+0x95/0xd0
[ +0.000005] x64_sys_call+0x1205/0x20d0
[ +0.000003] do_syscall_64+0x4d/0x120
[ +0.000004] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ +0.000002] RIP: 0033:0x7f3a0bb1a94f
[ +0.000002] RSP: 002b:00007f3a009ff870 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ +0.000003] RAX: ffffffffffffffda RBX: 00007f3a009ff9f0 RCX: 00007f3a0bb1a94f
[ +0.000001] RDX: 00007f3a009ff9f0 RSI: 00000000c0406458 RDI: 000000000000000c
[ +0.000002] RBP: 00007f3a009ff8f0 R08: 0000000000000001 R09: 0000000000000000
[ +0.000002] R10: 0000000000000002 R11: 0000000000000246 R12: 0000561824bf39e0
[ +0.000002] R13: 00000000c0406458 R14: 000000000000000c R15: 0000561824a25b60
[ +0.000005] </TASK>
[ +27.998164] watchdog: BUG: soft lockup - CPU#4 stuck for 52s! [Xwayland:cs0:2634]
[ +0.000002] RIP: 0010:drm_exec_unlock_all+0x76/0xc0 [drm_exec]
[ +0.000002] RSP: 0018:ffffada901f4fbf8 EFLAGS: 00000246
[ +0.000003] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000001
[ +0.000002] RDX: ffff9fc28b974048 RSI: 0000000000000010 RDI: 0000000000000000
[ +0.000001] RBP: ffffada901f4fc10 R08: ffff9fc28b974148 R09: 0000000000000000
[ +0.000002] R10: ffffada901f4fbf0 R11: 0000000000000001 R12: ffff9fc28ed1ac00
[ +0.000002] R13: 00000000ffffffff R14: ffffada901f4fce8 R15: ffffada901f4fe00
[ +0.000002] FS: 00007f3a00a00640(0000) GS:ffff9fc99e800000(0000) knlGS:0000000000000000
[ +0.000002] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ +0.000002] CR2: 00007f8e92137020 CR3: 000000013380e000 CR4: 0000000000350ef0
[ +0.000002] DR0: ffffffff90921610 DR1: ffffffff90921611 DR2: ffffffff90921612
[ +0.000002] DR3: ffffffff90921613 DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ +0.000002] Call Trace:
[ +0.000002] <IRQ>
[ +0.000003] ? show_regs+0x69/0x80
[ +0.000006] ? watchdog_timer_fn+0x271/0x300
[ +0.000004] ? __pfx_watchdog_timer_fn+0x10/0x10
[ +0.000003] ? __hrtimer_run_queues+0x114/0x2a0
[ +0.000006] ? hrtimer_interrupt+0x110/0x240
[ +0.000005] ? __sysvec_apic_timer_interrupt+0x73/0x180
[ +0.000004] ? sysvec_apic_timer_interrupt+0xaa/0xd0
[ +0.000004] </IRQ>
[ +0.000002] <TASK>
[ +0.000002] ? asm_sysvec_apic_timer_interrupt+0x1b/0x20
[ +0.000006] ? drm_exec_unlock_all+0x76/0xc0 [drm_exec]
[ +0.000004] drm_exec_cleanup+0x77/0x90 [drm_exec]
[ +0.000004] amdgpu_userq_wait_ioctl+0x3cc/0xfe0 [amdgpu]
[ +0.000206] ? finish_task_switch.isra.0+0x94/0x290
[ +0.000010] ? __pfx_amdgpu_userq_wait_ioctl+0x10/0x10 [amdgpu]
[ +0.000159] drm_ioctl_kernel+0xaa/0x100 [drm]
[ +0.000026] drm_ioctl+0x29d/0x500 [drm]
[ +0.000017] ? __pfx_amdgpu_userq_wait_ioctl+0x10/0x10 [amdgpu]
[ +0.000163] ? srso_return_thunk+0x5/0x5f
[ +0.000005] ? srso_return_thunk+0x5/0x5f
[ +0.000002] ? _raw_spin_unlock_irqrestore+0x27/0x50
[ +0.000005] amdgpu_drm_ioctl+0x4b/0x80 [amdgpu]
[ +0.000143] __x64_sys_ioctl+0x95/0xd0
[ +0.000005] x64_sys_call+0x1205/0x20d0
[ +0.000004] do_syscall_64+0x4d/0x120
[ +0.000003] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ +0.000002] RIP: 0033:0x7f3a0bb1a94f
[ +0.000002] RSP: 002b:00007f3a009ff870 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
[ +0.000003] RAX: ffffffffffffffda RBX: 00007f3a009ff9f0 RCX: 00007f3a0bb1a94f
[ +0.000002] RDX: 00007f3a009ff9f0 RSI: 00000000c0406458 RDI: 000000000000000c
[ +0.000002] RBP: 00007f3a009ff8f0 R08: 0000000000000001 R09: 0000000000000000
[ +0.000001] R10: 0000000000000002 R11: 0000000000000246 R12: 0000561824bf39e0
[ +0.000002] R13: 00000000c0406458 R14: 000000000000000c R15: 0000561824a25b60
[ +0.000006] </TASK>
Signed-off-by: Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@xxxxxxx>
---
.../gpu/drm/amd/amdgpu/amdgpu_userq_fence.c | 45 ++++++++++++-------
1 file changed, 29 insertions(+), 16 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index 6876f24e5eda..35d306fc069d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -463,22 +463,6 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
goto put_gobj_write;
}
- drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
- (num_read_bo_handles + num_write_bo_handles));
-
- /* Lock all BOs with retry handling */
- drm_exec_until_all_locked(&exec) {
- r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1);
- drm_exec_retry_on_contention(&exec);
- if (r)
- goto exec_fini;
-
- r = drm_exec_prepare_array(&exec, gobj_write, num_write_bo_handles, 1);
- drm_exec_retry_on_contention(&exec);
- if (r)
- goto exec_fini;
- }
-
r = amdgpu_userq_fence_read_wptr(queue, &wptr);
if (r)
goto exec_fini;
@@ -495,6 +479,27 @@ int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,
queue->last_fence = dma_fence_get(fence);
mutex_unlock(&uq_mgr->userq_mutex);
+ drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,
+ (num_read_bo_handles + num_write_bo_handles));
+
+ /* Lock all BOs with retry handling */
+ drm_exec_until_all_locked(&exec) {
+ r = amdgpu_vm_lock_pd(queue->vm, &exec, 0);
Well that absolutely makes no sense at all.
First of all we need to lock the BOs *before* we create the fence. Then
there is no need to lock the VM PD here.
I have no idea what you are trying to do here.
Regards,
Christian.
+ drm_exec_retry_on_contention(&exec);
+ if (r)
+ goto exec_fini;
+
+ r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1);
+ drm_exec_retry_on_contention(&exec);
+ if (r)
+ goto exec_fini;
+
+ r = drm_exec_prepare_array(&exec, gobj_write, num_write_bo_handles, 1);
+ drm_exec_retry_on_contention(&exec);
+ if (r)
+ goto exec_fini;
+ }
+
for (i = 0; i < num_read_bo_handles; i++) {
if (!gobj_read || !gobj_read[i]->resv)
continue;
@@ -558,6 +563,7 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
u32 num_syncobj, num_read_bo_handles, num_write_bo_handles;
struct drm_amdgpu_userq_fence_info *fence_info = NULL;
struct drm_amdgpu_userq_wait *wait_info = data;
+ struct amdgpu_fpriv *fpriv = filp->driver_priv;
struct drm_gem_object **gobj_write;
struct drm_gem_object **gobj_read;
struct dma_fence **fences = NULL;
@@ -635,6 +641,13 @@ int amdgpu_userq_wait_ioctl(struct drm_device *dev, void *data,
/* Lock all BOs with retry handling */
drm_exec_until_all_locked(&exec) {
+ r = amdgpu_vm_lock_pd(&fpriv->vm, &exec, 0);
+ drm_exec_retry_on_contention(&exec);
+ if (r) {
+ drm_exec_fini(&exec);
+ goto put_gobj_write;
+ }
+
r = drm_exec_prepare_array(&exec, gobj_read, num_read_bo_handles, 1);
drm_exec_retry_on_contention(&exec);
if (r) {