Am 12.04.22 um 09:16 schrieb xinhui pan:
VM might already be freed when amdgpu_vm_tlb_seq_cb() is called. We see
the calltrace below.
Fix it by adding vm.delayed_tlb_flush and check this value in vm_fini().
BUG kmalloc-4k (Not tainted): Poison overwritten
Shit, I feared that this could happen.
Please don't use a counter+schedule, but rather keep the last flush
fence around and wait for it to signal during vm destruction.
Since fences signal in order you can then wait on that one and make sure
that the last cb has run.
Thanks,
Christian.
0xffff9c88630414e8-0xffff9c88630414e8 @offset=5352. First byte 0x6c
instead of 0x6b Allocated in amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
age=44 cpu=0 pid=2343
__slab_alloc.isra.0+0x4f/0x90
kmem_cache_alloc_trace+0x6b8/0x7a0
amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
drm_file_alloc+0x222/0x3e0 [drm]
drm_open+0x11d/0x410 [drm]
drm_stub_open+0xdc/0x230 [drm]
chrdev_open+0xa5/0x1e0
do_dentry_open+0x16c/0x3c0
vfs_open+0x2d/0x30
path_openat+0x70a/0xa90
do_filp_open+0xb2/0x120
do_sys_openat2+0x245/0x330
do_sys_open+0x46/0x80
__x64_sys_openat+0x20/0x30
do_syscall_64+0x38/0xc0
entry_SYSCALL_64_after_hwframe+0x44/0xae
Freed in amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu] age=22 cpu=1
pid=2485
kfree+0x4a2/0x580
amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu]
drm_file_free+0x24e/0x3c0 [drm]
drm_close_helper.isra.0+0x90/0xb0 [drm]
drm_release+0x97/0x1a0 [drm]
__fput+0xb6/0x280
____fput+0xe/0x10
task_work_run+0x64/0xb0
do_exit+0x406/0xcf0
do_group_exit+0x50/0xc0
__x64_sys_exit_group+0x18/0x20
do_syscall_64+0x38/0xc0
entry_SYSCALL_64_after_hwframe+0x44/0xae
Signed-off-by: xinhui pan <xinhui.pan@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 4 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 1 +
2 files changed, 5 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 645ce28277c2..0e89f42283c6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -789,6 +789,7 @@ static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
tlb_cb = container_of(cb, typeof(*tlb_cb), cb);
atomic64_inc(&tlb_cb->vm->tlb_seq);
+ atomic64_dec(&tlb_cb->vm->delayed_tlb_flush);
kfree(tlb_cb);
}
@@ -932,6 +933,7 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, struct amdgpu_vm *vm,
if (flush_tlb || params.table_freed) {
tlb_cb->vm = vm;
+ atomic64_inc(&vm->delayed_tlb_flush);
if (!fence || !*fence ||
dma_fence_add_callback(*fence, &tlb_cb->cb,
amdgpu_vm_tlb_seq_cb))
@@ -2257,6 +2259,8 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
amdgpu_vm_set_pasid(adev, vm, 0);
dma_fence_wait(vm->last_unlocked, false);
dma_fence_put(vm->last_unlocked);
+ while (atomic64_read(&vm->delayed_tlb_flush))
+ schedule();
list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 1a814fbffff8..4dc9f493d355 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -286,6 +286,7 @@ struct amdgpu_vm {
/* Last finished delayed update */
atomic64_t tlb_seq;
+ atomic64_t delayed_tlb_flush;
/* Last unlocked submission to the scheduler entities */
struct dma_fence *last_unlocked;