Hi, I found another KASAN report while trying to count inotify consumers with this script: https://github.com/fatso83/dotfiles/blob/master/utils/scripts/inotify-consumers When I ran this script as root, I got this backtrace for the first time. Somehow it is related to amdgpu_bo_get_memory. general protection fault, probably for non-canonical address 0xdffffc0000000002: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017] CPU: 30 PID: 100871 Comm: grep Tainted: G W L ------- --- 6.4.0-0.rc0.20230503git348551ddaf31.12.fc39.x86_64+debug #1 Hardware name: Micro-Star International Co., Ltd. MS-7D73/MPG B650I EDGE WIFI (MS-7D73), BIOS 1.32 04/28/2023 RIP: 0010:amdgpu_bo_get_memory+0x80/0x360 [amdgpu] Code: 48 c1 ea 03 80 3c 02 00 0f 85 90 02 00 00 4c 8b ad 90 02 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7d 10 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 34 02 00 00 41 8b 45 10 83 f8 RSP: 0018:ffffc90039e178c0 EFLAGS: 00010212 RAX: dffffc0000000000 RBX: ffffc90039e17988 RCX: 0000000000000000 RDX: 0000000000000002 RSI: ffffc90039e17988 RDI: 0000000000000010 RBP: ffff88812fcda000 R08: 0000000000000000 R09: ffffffff8a0ca6af R10: fffffbfff14194d5 R11: 0000000000000000 R12: 0000000000840000 R13: 0000000000000000 R14: ffff88810f686160 R15: ffff88811fb6d218 FS: 00007f1f8e87a740(0000) GS:ffff888f99000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055fac423f000 CR3: 00000004ba39c000 CR4: 0000000000750ee0 PKRU: 55555554 Call Trace: <TASK> amdgpu_vm_get_memory+0x430/0x5c0 [amdgpu] amdgpu_show_fdinfo+0x30a/0x900 [amdgpu] ? __pfx_amdgpu_show_fdinfo+0x10/0x10 [amdgpu] ? __pfx_seq_printf+0x10/0x10 seq_show+0x48a/0x710 seq_read_iter+0x40d/0x11c0 seq_read+0x190/0x230 ? __pfx_seq_read+0x10/0x10 ? selinux_file_permission+0x356/0x440 ? fsnotify_perm.part.0+0x146/0x4e0 vfs_read+0x1cd/0x860 ? __pfx_vfs_read+0x10/0x10 ? __do_sys_newfstatat+0x94/0xe0 ? 
__pfx___do_sys_newfstatat+0x10/0x10 ? __fget_light+0x51/0x230 ksys_read+0x10a/0x1e0 ? __pfx_ksys_read+0x10/0x10 ? syscall_enter_from_user_mode+0x26/0x90 do_syscall_64+0x5d/0x90 ? do_syscall_64+0x6c/0x90 ? lockdep_hardirqs_on+0x81/0x110 ? do_syscall_64+0x6c/0x90 ? lockdep_hardirqs_on+0x81/0x110 ? do_syscall_64+0x6c/0x90 ? do_syscall_64+0x6c/0x90 ? do_syscall_64+0x6c/0x90 ? do_syscall_64+0x6c/0x90 ? lockdep_hardirqs_on+0x81/0x110 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7f1f8e702b91 Code: d5 fe ff ff 55 48 8d 3d d5 4c 0a 00 48 89 e5 e8 35 fe 01 00 0f 1f 44 00 00 f3 0f 1e fa 80 3d ed c7 13 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec RSP: 002b:00007ffc2783de88 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000018000 RCX: 00007f1f8e702b91 RDX: 0000000000018000 RSI: 000055fac4228000 RDI: 0000000000000003 RBP: 00007ffc2783deb0 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000001000 R11: 0000000000000246 R12: 0000000000018000 R13: 000055fac4228000 R14: 0000000000000003 R15: 000055fac4227110 </TASK> Modules linked in: overlay tun uinput rfcomm snd_seq_dummy snd_hrtimer nf_conntrack_netbios_ns nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink qrtr bnep sunrpc binfmt_misc mt76x2u mt76x2_common mt7921e mt76x02_usb snd_hda_codec_hdmi mt7921_common mt76_usb intel_rapl_msr btusb intel_rapl_common mt76x02_lib mt76_connac_lib btrtl uvcvideo snd_hda_intel btbcm edac_mce_amd btintel snd_intel_dspcfg btmtk uvc mt76 snd_intel_sdw_acpi videobuf2_vmalloc snd_hda_codec videobuf2_memops bluetooth snd_usb_audio mac80211 videobuf2_v4l2 snd_hda_core snd_usbmidi_lib videobuf2_common kvm_amd snd_rawmidi snd_hwdep videodev snd_seq vfat kvm libarc4 snd_seq_device irqbypass mc fat snd_pcm xpad cfg80211 rapl ff_memless wmi_bmof 
snd_timer pcspkr snd k10temp i2c_piix4 acpi_cpufreq joydev soundcore rfkill apple_mfi_fastcharge gpio_amdpt gpio_generic loop zram amdgpu i2c_algo_bit drm_ttm_helper ttm crct10dif_pclmul crc32_pclmul crc32c_intel drm_suballoc_helper polyval_clmulni iommu_v2 polyval_generic hid_apple drm_buddy ccp nvme gpu_sched drm_display_helper ghash_clmulni_intel nvme_core ucsi_ccg sha512_ssse3 typec_ucsi typec r8169 cec sp5100_tco nvme_common video wmi ip6_tables ip_tables fuse ---[ end trace 0000000000000000 ]--- RIP: 0010:amdgpu_bo_get_memory+0x80/0x360 [amdgpu] Code: 48 c1 ea 03 80 3c 02 00 0f 85 90 02 00 00 4c 8b ad 90 02 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7d 10 48 89 fa 48 c1 ea 03 <0f> b6 04 02 84 c0 74 08 3c 03 0f 8e 34 02 00 00 41 8b 45 10 83 f8 RSP: 0018:ffffc90039e178c0 EFLAGS: 00010212 RAX: dffffc0000000000 RBX: ffffc90039e17988 RCX: 0000000000000000 RDX: 0000000000000002 RSI: ffffc90039e17988 RDI: 0000000000000010 RBP: ffff88812fcda000 R08: 0000000000000000 R09: ffffffff8a0ca6af R10: fffffbfff14194d5 R11: 0000000000000000 R12: 0000000000840000 R13: 0000000000000000 R14: ffff88810f686160 R15: ffff88811fb6d218 FS: 00007f1f8e87a740(0000) GS:ffff888f99000000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055fac423f000 CR3: 00000004ba39c000 CR4: 0000000000750ee0 PKRU: 55555554 note: grep[100871] exited with preempt_count 1 ================================================================== BUG: KASAN: slab-use-after-free in mutex_can_spin_on_owner+0x191/0x1c0 Read of size 4 at addr ffff88843e1ebe74 by task grep/101238 CPU: 23 PID: 101238 Comm: grep Tainted: G D W L ------- --- 6.4.0-0.rc0.20230503git348551ddaf31.12.fc39.x86_64+debug #1 Hardware name: Micro-Star International Co., Ltd. MS-7D73/MPG B650I EDGE WIFI (MS-7D73), BIOS 1.32 04/28/2023 Call Trace: <TASK> dump_stack_lvl+0x76/0xd0 print_report+0xcf/0x670 ? mutex_can_spin_on_owner+0x191/0x1c0 ? mutex_can_spin_on_owner+0x191/0x1c0 kasan_report+0xa8/0xe0 ? 
mutex_can_spin_on_owner+0x191/0x1c0 mutex_can_spin_on_owner+0x191/0x1c0 __ww_mutex_lock.constprop.0+0x605/0x3730 ? number+0x628/0x910 ? amdgpu_show_fdinfo+0x2ea/0x900 [amdgpu] ? __pfx___ww_mutex_lock.constprop.0+0x10/0x10 ? __pfx_number+0x10/0x10 ? __kmalloc_node+0x65/0x160 ? seq_read_iter+0x6b4/0x11c0 ? __pfx___might_resched+0x10/0x10 ? do_syscall_64+0x5d/0x90 ? ww_mutex_lock_interruptible+0x3c/0x150 ww_mutex_lock_interruptible+0x3c/0x150 amdgpu_show_fdinfo+0x2ea/0x900 [amdgpu] ? __pfx_amdgpu_show_fdinfo+0x10/0x10 [amdgpu] ? __pfx_seq_printf+0x10/0x10 seq_show+0x48a/0x710 seq_read_iter+0x40d/0x11c0 seq_read+0x190/0x230 ? cp_new_stat+0x468/0x590 ? __pfx_seq_read+0x10/0x10 ? selinux_file_permission+0x356/0x440 ? fsnotify_perm.part.0+0x146/0x4e0 vfs_read+0x1cd/0x860 ? __do_sys_newfstatat+0x94/0xe0 ? __pfx_vfs_read+0x10/0x10 ? __fget_light+0x51/0x230 ksys_read+0x10a/0x1e0 ? __pfx_ksys_read+0x10/0x10 ? do_syscall_64+0x6c/0x90 ? syscall_enter_from_user_mode+0x26/0x90 ? rcu_is_watching+0x15/0xb0 ? syscall_enter_from_user_mode+0x26/0x90 do_syscall_64+0x5d/0x90 ? do_syscall_64+0x6c/0x90 ? do_syscall_64+0x6c/0x90 ? 
do_syscall_64+0x6c/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fc962d02b91 Code: d5 fe ff ff 55 48 8d 3d d5 4c 0a 00 48 89 e5 e8 35 fe 01 00 0f 1f 44 00 00 f3 0f 1e fa 80 3d ed c7 13 00 00 74 13 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 4f c3 66 0f 1f 44 00 00 55 48 89 e5 48 83 ec RSP: 002b:00007fff56419e08 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 0000000000018000 RCX: 00007fc962d02b91 RDX: 0000000000018000 RSI: 000055bd4ef4b000 RDI: 0000000000000003 RBP: 00007fff56419e30 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000001000 R11: 0000000000000246 R12: 0000000000018000 R13: 000055bd4ef4b000 R14: 0000000000000003 R15: 000055bd4ef4a110 </TASK> Allocated by task 100870: kasan_save_stack+0x33/0x60 kasan_set_track+0x25/0x30 __kasan_slab_alloc+0x6e/0x70 kmem_cache_alloc_node+0x18c/0x420 copy_process+0x3be/0x6940 kernel_clone+0xc8/0x6d0 __do_sys_clone+0xa1/0xe0 do_syscall_64+0x5d/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc Last potentially related work creation: kasan_save_stack+0x33/0x60 __kasan_record_aux_stack+0x97/0xb0 __call_rcu_common.constprop.0+0xf8/0x1af0 release_task+0xd46/0x17a0 wait_consider_task+0x10e6/0x3680 do_wait+0x6a7/0xa30 kernel_wait4+0x105/0x1e0 __do_sys_wait4+0x10d/0x120 do_syscall_64+0x5d/0x90 entry_SYSCALL_64_after_hwframe+0x72/0xdc Second to last potentially related work creation: kasan_save_stack+0x33/0x60 __kasan_record_aux_stack+0x97/0xb0 task_work_add+0x88/0x220 scheduler_tick+0x2a7/0xaa0 update_process_times+0x157/0x1d0 tick_sched_handle+0x67/0x130 tick_sched_timer+0xb1/0xe0 __hrtimer_run_queues+0x4d5/0x910 hrtimer_interrupt+0x2f5/0x810 __sysvec_apic_timer_interrupt+0x147/0x3f0 sysvec_apic_timer_interrupt+0x8e/0xc0 asm_sysvec_apic_timer_interrupt+0x1a/0x20 The buggy address belongs to the object at ffff88843e1ebe40 which belongs to the cache task_struct of size 15624 The buggy address is located 52 bytes inside of freed 15624-byte region [ffff88843e1ebe40, ffff88843e1efb48) The buggy address 
belongs to the physical page: page:000000002b34b3bc refcount:1 mapcount:0 mapping:0000000000000000 index:0xffff88843e1ebe40 pfn:0x43e1e8 head:000000002b34b3bc order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff888726d08961 flags: 0x17ffffc0010200(slab|head|node=0|zone=2|lastcpupid=0x1fffff) page_type: 0xffffffff() raw: 0017ffffc0010200 ffff888100052080 ffffea0006493a10 ffffea0014033c10 raw: ffff88843e1ebe40 0000000000020001 00000001ffffffff ffff888726d08961 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88843e1ebd00: 00 fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88843e1ebd80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff88843e1ebe00: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ^ ffff88843e1ebe80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88843e1ebf00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== $ /usr/src/kernels/6.4.0-0.rc0.20230503git348551ddaf31.12.fc39.x86_64/scripts/faddr2line /lib/debug/lib/modules/6.4.0-0.rc0.20230503git348551ddaf31.12.fc39.x86_64/kernel/drivers/gpu/drm/amd/amdgpu/amdgpu.ko.debug amdgpu_bo_get_memory+0x80 amdgpu_bo_get_memory+0x80/0xc0: amdgpu_bo_get_memory at /usr/src/debug/kernel-6.3-12728-g348551ddaf31/linux-6.4.0-0.rc0.20230503git348551ddaf31.12.fc39.x86_64/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c:1293 $ cat -s -n /usr/src/debug/kernel-6.3-12728-g348551ddaf31/linux-6.4.0-0.rc0.20230503git348551ddaf31.12.fc39.x86_64/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | head -1300 | tail -20 1281 case AMDGPU_GEM_DOMAIN_GTT: 1282 stats->gtt += size; 1283 break; 1284 case AMDGPU_GEM_DOMAIN_CPU: 1285 default: 1286 stats->cpu += size; 1287 break; 1288 } 1289 1290 if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) { 1291 stats->requested_vram += size; 1292 if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) 1293 stats->requested_visible_vram += size; 1294 1295 if (domain != 
AMDGPU_GEM_DOMAIN_VRAM) { 1296 stats->evicted_vram += size; 1297 if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) 1298 stats->evicted_visible_vram += size; 1299 } 1300 } else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) { $ git blame drivers/gpu/drm/amd/amdgpu/amdgpu_object.c -L 1281,1300 874442541133f (Roy Sun 2021-04-26 14:27:01 +0800 1281) case AMDGPU_GEM_DOMAIN_GTT: d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1282) stats->gtt += size; 874442541133f (Roy Sun 2021-04-26 14:27:01 +0800 1283) break; 874442541133f (Roy Sun 2021-04-26 14:27:01 +0800 1284) case AMDGPU_GEM_DOMAIN_CPU: 874442541133f (Roy Sun 2021-04-26 14:27:01 +0800 1285) default: d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1286) stats->cpu += size; 874442541133f (Roy Sun 2021-04-26 14:27:01 +0800 1287) break; 874442541133f (Roy Sun 2021-04-26 14:27:01 +0800 1288) } d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1289) d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1290) if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_VRAM) { d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1291) stats->requested_vram += size; d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1292) if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1293) stats->requested_visible_vram += size; d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1294) d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1295) if (domain != AMDGPU_GEM_DOMAIN_VRAM) { d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1296) stats->evicted_vram += size; d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1297) if (bo->flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1298) stats->evicted_visible_vram += size; d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1299) } d6530c33a978c (Marek Olšák 2023-01-30 01:52:40 -0500 1300) } else if (bo->preferred_domains & AMDGPU_GEM_DOMAIN_GTT) { git blame says 
that this code is related to this commit: commit d6530c33a978c6d170125b3a2ca1d218b1863e52 Author: Marek Olšák <marek.olsak@xxxxxxx> Date: Mon Jan 30 01:52:40 2023 -0500 drm/amdgpu: expose more memory stats in fdinfo This will be used for performance investigations. Signed-off-by: Marek Olšák <marek.olsak@xxxxxxx> Reviewed-by: Christian König <christian.koenig@xxxxxxx> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c index 99a7855ab1bc..c57252f004e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c @@ -60,12 +60,13 @@ void amdgpu_show_fdinfo(struct seq_file *m, struct file *f) struct amdgpu_fpriv *fpriv = file->driver_priv; struct amdgpu_vm *vm = &fpriv->vm; - uint64_t vram_mem = 0, gtt_mem = 0, cpu_mem = 0; + struct amdgpu_mem_stats stats; ktime_t usage[AMDGPU_HW_IP_NUM]; uint32_t bus, dev, fn, domain; unsigned int hw_ip; int ret; I should say that this bug may not be easy to reproduce. To help reproduce it: 1. I ran the script above in a loop: $ for i in {1..99999}; do sudo curl -s https://raw.githubusercontent.com/fatso83/dotfiles/master/utils/scripts/inotify-consumers | bash; done 2. Launched Google Chrome with 26 open windows. 3. Played the game Division 2. With a little time and luck, I get the desired backtrace again and again. I am ready to answer any questions and am open to testing any patches. Thanks. -- Best Regards, Mike Gavrilov.
Attachment:
BUG-KASAN-null-ptr-deref-in-range-[0x0000000000000010-0x0000000000000017]-7900XTX-01.tar.xz
Description: application/xz
Attachment:
BUG-KASAN-null-ptr-deref-in-range-[0x0000000000000010-0x0000000000000017]-7900XTX-02.tar.xz
Description: application/xz
Attachment:
BUG-KASAN-null-ptr-deref-in-range-[0x0000000000000010-0x0000000000000017]-7900XTX-03.tar.xz
Description: application/xz