On Tue, Apr 11, 2023 at 10:40 PM Mikhail Gavrilov <mikhail.v.gavrilov@xxxxxxxxx> wrote: > > Hi, > KASAN continues to find problems in the drm_sched_job_cleanup code in 6.3-rc6. > I did not get any feedback in the thread > https://lore.kernel.org/lkml/CABXGCsMVUB2RA4D+k5CnA0_2521TOX++D4NmOukKi4X2-Q_RfQ@xxxxxxxxxxxxxx/ > Therefore, I decided to start a separate thread. Since the problems > are different, the symptoms are also different. > > Reproduction scenario. > After launching one of the listed games: > - Cyberpunk 2077 > - Forza Horizon 4 > - Forza Horizon 5 > - Sackboy: A Big Adventure > > First, after some time (maybe after several attempts), a bug > message from KASAN appears: > ================================================================== > BUG: KASAN: null-ptr-deref in drm_sched_job_cleanup+0x96/0x290 [gpu_sched] > Read of size 4 at addr 0000000000000078 by task ForzaHorizon4.e/31587 > > CPU: 15 PID: 31587 Comm: ForzaHorizon4.e Tainted: G W L > ------- --- 6.3.0-0.rc6.49.fc39.x86_64+debug #1 > Hardware name: System manufacturer System Product Name/ROG STRIX > X570-I GAMING, BIOS 4601 02/02/2023 > Call Trace: > <TASK> > dump_stack_lvl+0x72/0xc0 > kasan_report+0xa4/0xe0 > ? drm_sched_job_cleanup+0x96/0x290 [gpu_sched] > kasan_check_range+0x104/0x1b0 > drm_sched_job_cleanup+0x96/0x290 [gpu_sched] > ? __pfx_drm_sched_job_cleanup+0x10/0x10 [gpu_sched] > ? slab_free_freelist_hook+0x11e/0x1d0 > ? amdgpu_cs_parser_fini+0x363/0x5a0 [amdgpu] > amdgpu_job_free+0x40/0x1b0 [amdgpu] > amdgpu_cs_parser_fini+0x3c9/0x5a0 [amdgpu] > ? __pfx_amdgpu_cs_parser_fini+0x10/0x10 [amdgpu] > amdgpu_cs_ioctl+0x3d9/0x5630 [amdgpu] > ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] > ? __kmem_cache_free+0xbc/0x2e0 > ? mark_lock+0x101/0x16e0 > ? __lock_acquire+0xe54/0x59f0 > ? kasan_save_stack+0x3f/0x50 > ? __pfx_lock_release+0x10/0x10 > ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] > drm_ioctl_kernel+0x1f8/0x3d0 > ? __pfx_drm_ioctl_kernel+0x10/0x10 > drm_ioctl+0x4c1/0xaa0 > ? 
__pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] > ? __pfx_drm_ioctl+0x10/0x10 > ? _raw_spin_unlock_irqrestore+0x62/0x80 > ? lockdep_hardirqs_on+0x7d/0x100 > ? _raw_spin_unlock_irqrestore+0x4b/0x80 > amdgpu_drm_ioctl+0xce/0x1b0 [amdgpu] > __x64_sys_ioctl+0x12d/0x1a0 > do_syscall_64+0x5c/0x90 > ? do_syscall_64+0x68/0x90 > ? lockdep_hardirqs_on+0x7d/0x100 > ? do_syscall_64+0x68/0x90 > ? do_syscall_64+0x68/0x90 > ? lockdep_hardirqs_on+0x7d/0x100 > ? do_syscall_64+0x68/0x90 > ? asm_exc_page_fault+0x22/0x30 > ? lockdep_hardirqs_on+0x7d/0x100 > entry_SYSCALL_64_after_hwframe+0x72/0xdc > RIP: 0033:0x7fb8a270881d > Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 > 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 > 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00 > RSP: 002b:00000000467ad060 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 > RAX: ffffffffffffffda RBX: 00000000467ad358 RCX: 00007fb8a270881d > RDX: 00000000467ad140 RSI: 00000000c0186444 RDI: 000000000000005a > RBP: 00000000467ad0b0 R08: 00007fb7f00d3eb0 R09: 00000000467ad100 > R10: 00007fb88c68fb20 R11: 0000000000000246 R12: 00000000467ad140 > R13: 00000000c0186444 R14: 000000000000005a R15: 00007fb7f00d3e50 > </TASK> > ================================================================== > > Finally, the games listed above stop working: they get > stuck after a kernel warning: > general protection fault, probably for non-canonical address > 0xdffffc000000000f: 0000 [#1] PREEMPT SMP KASAN NOPTI > KASAN: null-ptr-deref in range [0x0000000000000078-0x000000000000007f] > CPU: 15 PID: 31587 Comm: ForzaHorizon4.e Tainted: G B W L > ------- --- 6.3.0-0.rc6.49.fc39.x86_64+debug #1 > Hardware name: System manufacturer System Product Name/ROG STRIX > X570-I GAMING, BIOS 4601 02/02/2023 > RIP: 0010:drm_sched_job_cleanup+0xa7/0x290 [gpu_sched] > Code: d6 01 00 00 4c 8b 75 20 be 04 00 00 00 4d 8d 66 78 4c 89 e7 e8 > ba 4d 4e c9 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 
c1 ea 03 <0f> b6 > 14 02 4c 89 e0 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 8a > RSP: 0018:ffffc9003676f5a8 EFLAGS: 00010216 > RAX: dffffc0000000000 RBX: ffff88816f81f020 RCX: 0000000000000001 > RDX: 000000000000000f RSI: 0000000000000008 RDI: ffffffff9053e5e0 > RBP: ffff88816f81f000 R08: 0000000000000001 R09: ffffffff9053e5e7 > R10: fffffbfff20a7cbc R11: 6e696c6261736944 R12: 0000000000000078 > R13: 1ffff92006cedeb5 R14: 0000000000000000 R15: ffffc9003676f870 > FS: 000000004680f6c0(0000) GS:ffff888fa5c00000(0000) knlGS:0000000029910000 > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > CR2: 00007fb854d6f010 CR3: 000000017b2d6000 CR4: 0000000000350ee0 > Call Trace: > <TASK> > ? __pfx_drm_sched_job_cleanup+0x10/0x10 [gpu_sched] > ? slab_free_freelist_hook+0x11e/0x1d0 > ? amdgpu_cs_parser_fini+0x363/0x5a0 [amdgpu] > amdgpu_job_free+0x40/0x1b0 [amdgpu] > amdgpu_cs_parser_fini+0x3c9/0x5a0 [amdgpu] > ? __pfx_amdgpu_cs_parser_fini+0x10/0x10 [amdgpu] > amdgpu_cs_ioctl+0x3d9/0x5630 [amdgpu] > ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] > ? __kmem_cache_free+0xbc/0x2e0 > ? mark_lock+0x101/0x16e0 > ? __lock_acquire+0xe54/0x59f0 > ? kasan_save_stack+0x3f/0x50 > ? __pfx_lock_release+0x10/0x10 > ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] > drm_ioctl_kernel+0x1f8/0x3d0 > ? __pfx_drm_ioctl_kernel+0x10/0x10 > drm_ioctl+0x4c1/0xaa0 > ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] > ? __pfx_drm_ioctl+0x10/0x10 > ? _raw_spin_unlock_irqrestore+0x62/0x80 > ? lockdep_hardirqs_on+0x7d/0x100 > ? _raw_spin_unlock_irqrestore+0x4b/0x80 > amdgpu_drm_ioctl+0xce/0x1b0 [amdgpu] > __x64_sys_ioctl+0x12d/0x1a0 > do_syscall_64+0x5c/0x90 > ? do_syscall_64+0x68/0x90 > ? lockdep_hardirqs_on+0x7d/0x100 > ? do_syscall_64+0x68/0x90 > ? do_syscall_64+0x68/0x90 > ? lockdep_hardirqs_on+0x7d/0x100 > ? do_syscall_64+0x68/0x90 > ? asm_exc_page_fault+0x22/0x30 > ? 
lockdep_hardirqs_on+0x7d/0x100 > entry_SYSCALL_64_after_hwframe+0x72/0xdc > RIP: 0033:0x7fb8a270881d > Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 > 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 > 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00 > RSP: 002b:00000000467ad060 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 > RAX: ffffffffffffffda RBX: 00000000467ad358 RCX: 00007fb8a270881d > RDX: 00000000467ad140 RSI: 00000000c0186444 RDI: 000000000000005a > RBP: 00000000467ad0b0 R08: 00007fb7f00d3eb0 R09: 00000000467ad100 > R10: 00007fb88c68fb20 R11: 0000000000000246 R12: 00000000467ad140 > R13: 00000000c0186444 R14: 000000000000005a R15: 00007fb7f00d3e50 > </TASK> > Modules linked in: uinput rfcomm snd_seq_dummy snd_hrtimer > nf_conntrack_netbios_ns nf_conntrack_broadcast nft_fib_inet > nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 > nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack > nf_defrag_ipv6 ip_set nf_tables nfnetlink qrtr bnep sunrpc binfmt_misc > mt76x2u mt76x2_common mt76x02_usb iwlmvm snd_hda_codec_realtek > mt76_usb intel_rapl_msr snd_hda_codec_generic snd_hda_codec_hdmi > intel_rapl_common mt76x02_lib mt76 snd_hda_intel edac_mce_amd > snd_intel_dspcfg cpi snd_usb_audio snd_hda_codec mac80211 kvm_amd > snd_usbmidi_lib btusb snd_hda_core snd_rawmidi snd_hwdep mc btrtl kvm > btbcm btintel snd_seq libarc4 iwlwifi btmtk snd_seq_device vfat > eeepc_wmi fat bluetooth asus_ec_sensors snd_pcm asus_wmi irqbypass > ledtrig_audio _keymap snd_timer xpad platform_profile wmi_bmof > ff_memless rapl joydev pcspkr snd k10temp i2c_piix4 soundcore rfkill > acpi_cpufreq loop zram amdgpu drm_ttm_helper ttm video iommu_v2 > drm_buddy gpu_sched drm_display_helper crct10dif_pclmul ucsi_ccg > crc32_pclmul crc32c_intel typec_ucsi polyval_clmulni polyval_generic > typec ghash_clmulni_intel cec ccp sha512_ssse3 sp5100_tco igb nvme > nvme_core dca i2c_algo_bit nvme_common wmi ip6_tables 
ip_tables > ---[ end trace 0000000000000000 ]--- > RIP: 0010:drm_sched_job_cleanup+0xa7/0x290 [gpu_sched] > Code: d6 01 00 00 4c 8b 75 20 be 04 00 00 00 4d 8d 66 78 4c 89 e7 e8 > ba 4d 4e c9 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <0f> b6 > 14 02 4c 89 e0 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 8a > RSP: 0018:ffffc9003676f5a8 EFLAGS: 00010216 > RAX: dffffc0000000000 RBX: ffff88816f81f020 RCX: 0000000000000001 > RDX: 000000000000000f RSI: 0000000000000008 RDI: ffffffff9053e5e0 > RBP: ffff88816f81f000 R08: 0000000000000001 R09: ffffffff9053e5e7 > R10: fffffbfff20a7cbc R11: 6e696c6261736944 R12: 0000000000000078 > R13: 1ffff92006cedeb5 R14: 0000000000000000 R15: ffffc9003676f870 > FS: 000000004680f6c0(0000) GS:ffff888fa5c00000(0000) knlGS:0000000029910000 > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > CR2: 00007fb854d6f010 CR3: 000000017b2d6000 CR4: 0000000000350ee0 > > Demonstration: > https://youtu.be/ysRc4TXuBQI > > I would be happy to join in testing patches that would fix this. > > I attached a full kernel log here. 
> I think this is how the problem that KASAN found manifests when the kernel is built without KASAN: BUG: kernel NULL pointer dereference, address: 0000000000000078 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD f975b1067 P4D f975b1067 PUD e3bdba067 PMD f94134067 PTE 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 31 PID: 40791 Comm: ForzaHorizon4.e Tainted: G L ------- --- 6.3.0-0.rc6.20230413gitde4664485abb.52.fc39.x86_64 #1 Hardware name: System manufacturer System Product Name/ROG STRIX X570-I GAMING, BIOS 4601 02/02/2023 RIP: 0010:drm_sched_job_cleanup+0x2a/0x130 [gpu_sched] Code: 0f 1f 44 00 00 55 53 48 89 fb 48 83 ec 10 48 8b 7f 20 65 48 8b 04 25 28 00 00 00 48 89 44 24 08 31 c0 48 c7 04 24 00 00 00 00 <8b> 47 78 85 c0 0f 84 c2 00 00 00 48 83 ff c0 74 1f 48 8d 57 78 b8 RSP: 0018:ffffa69d5d33fa10 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8a617d87c000 RCX: 00000000b93d601f RDX: 00000000b93d401f RSI: ad0811cd15498925 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff8a55c4986018 R09: 0000000080080000 R10: 0000000000000001 R11: 0000000000000000 R12: 00000000ffffffff R13: 0000000000000018 R14: 0000000000000000 R15: ffffa69d5d33faf8 FS: 0000000048b6f6c0(0000) GS:ffff8a64aa9c0000(0000) knlGS:000000003bc40000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000078 CR3: 000000015a164000 CR4: 0000000000350ee0 Call Trace: <TASK> amdgpu_job_free+0x15/0xc0 [amdgpu] amdgpu_cs_parser_fini+0x137/0x1a0 [amdgpu] amdgpu_cs_ioctl+0x176/0x2140 [amdgpu] ? kmem_cache_alloc+0xf1/0x310 ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] drm_ioctl_kernel+0xc9/0x170 drm_ioctl+0x269/0x4a0 ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] amdgpu_drm_ioctl+0x4a/0x80 [amdgpu] __x64_sys_ioctl+0x90/0xd0 do_syscall_64+0x5c/0x90 ? __x64_sys_ioctl+0xa8/0xd0 ? syscall_exit_to_user_mode+0x17/0x40 ? do_syscall_64+0x68/0x90 ? 
exc_page_fault+0x78/0x180 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fe76290881d Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00 RSP: 002b:0000000048b6c220 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000048b6c418 RCX: 00007fe76290881d RDX: 0000000048b6c300 RSI: 00000000c0186444 RDI: 0000000000000059 RBP: 0000000048b6c270 R08: 00007fe6a80bedc0 R09: 0000000048b6c2c0 R10: 00007fe74c678770 R11: 0000000000000246 R12: 0000000048b6c300 R13: 00000000c0186444 R14: 0000000000000059 R15: 0000000000000001 </TASK> Modules linked in: overlay tun uinput rfcomm snd_seq_dummy snd_hrtimer nf_conntrack_netbios_ns nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nft_reject nf_reject_ipv6 nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables nfnetlink qrtr bnep sunrpc binfmt_misc snd_hda_codec_realtek snd_hda_codec_generic intel_rapl_msr snd_hda_codec_hdmi intel_rapl_common mt76x2u rapl mt76x2_common snd_hda_intel mt76x02_usb iwlmvm snd_hda_codec mt76x02_lib snd_usb_audio mt76_usb snd_hda_core kvm_amd mt76 snd_intel_dspcfg snd_intel_sdw_acpi vfat fat snd_hwdep mac80211 eeepc_wmi snd_usbmidi_lib asus_wmi kvm snd_rawmidi btusb snd_seq btrtl snd_seq_device snd_pcm btbcm btintel ledtrig_audio iwlwifi irqbypass snd_timer libarc4 btmtk sparse_keymap asus_ec_sensors bluetooth snd edac_mce_amd platform_profile cfg80211 wmi_bmof pcspkr soundcore mc i2c_piix4 k10temp rfkill joydev acpi_cpufreq loop zram amdgpu drm_ttm_helper ttm iommu_v2 drm_buddy gpu_sched crc32_pclmul drm_display_helper nvme ghash_clmulni_intel ucsi_ccg polyval_clmulni igb typec_ucsi polyval_generic cec ccp nvme_core sha512_ssse3 typec crct10dif_pclmul video crc32c_intel sp5100_tco i2c_algo_bit dca nvme_common wmi ip6_tables ip_tables fuse CR2: 
0000000000000078 ---[ end trace 0000000000000000 ]--- RIP: 0010:drm_sched_job_cleanup+0x2a/0x130 [gpu_sched] Code: 0f 1f 44 00 00 55 53 48 89 fb 48 83 ec 10 48 8b 7f 20 65 48 8b 04 25 28 00 00 00 48 89 44 24 08 31 c0 48 c7 04 24 00 00 00 00 <8b> 47 78 85 c0 0f 84 c2 00 00 00 48 83 ff c0 74 1f 48 8d 57 78 b8 RSP: 0018:ffffa69d5d33fa10 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff8a617d87c000 RCX: 00000000b93d601f RDX: 00000000b93d401f RSI: ad0811cd15498925 RDI: 0000000000000000 RBP: 0000000000000000 R08: ffff8a55c4986018 R09: 0000000080080000 R10: 0000000000000001 R11: 0000000000000000 R12: 00000000ffffffff R13: 0000000000000018 R14: 0000000000000000 R15: ffffa69d5d33faf8 FS: 0000000048b6f6c0(0000) GS:ffff8a64aa9c0000(0000) knlGS:000000003bc40000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000078 CR3: 000000015a164000 CR4: 0000000000350ee0 note: ForzaHorizon4.e[40791] exited with irqs disabled To reproduce it, you need to spend more time running Cyberpunk 2077, Forza Horizon 4, Forza Horizon 5 in turn. -- Best Regards, Mike Gavrilov.
Attachment:
BUG-kernel-NULL-pointer-dereference-address-0000000000000078.tar.xz
Description: application/xz