Hi, KASAN continues to find problems in the drm_sched_job_cleanup code at 6.3-rc6. I have not received any feedback in the thread https://lore.kernel.org/lkml/CABXGCsMVUB2RA4D+k5CnA0_2521TOX++D4NmOukKi4X2-Q_RfQ@xxxxxxxxxxxxxx/ Therefore, I decided to start a separate thread. Since the problems are different, the symptoms are also different. Reproduction scenario: launch one of the following games: - Cyberpunk 2077 - Forza Horizon 4 - Forza Horizon 5 - Sackboy: A Big Adventure First, after some time (possibly after several attempts), a bug message from KASAN appears: ================================================================== BUG: KASAN: null-ptr-deref in drm_sched_job_cleanup+0x96/0x290 [gpu_sched] Read of size 4 at addr 0000000000000078 by task ForzaHorizon4.e/31587 CPU: 15 PID: 31587 Comm: ForzaHorizon4.e Tainted: G W L ------- --- 6.3.0-0.rc6.49.fc39.x86_64+debug #1 Hardware name: System manufacturer System Product Name/ROG STRIX X570-I GAMING, BIOS 4601 02/02/2023 Call Trace: <TASK> dump_stack_lvl+0x72/0xc0 kasan_report+0xa4/0xe0 ? drm_sched_job_cleanup+0x96/0x290 [gpu_sched] kasan_check_range+0x104/0x1b0 drm_sched_job_cleanup+0x96/0x290 [gpu_sched] ? __pfx_drm_sched_job_cleanup+0x10/0x10 [gpu_sched] ? slab_free_freelist_hook+0x11e/0x1d0 ? amdgpu_cs_parser_fini+0x363/0x5a0 [amdgpu] amdgpu_job_free+0x40/0x1b0 [amdgpu] amdgpu_cs_parser_fini+0x3c9/0x5a0 [amdgpu] ? __pfx_amdgpu_cs_parser_fini+0x10/0x10 [amdgpu] amdgpu_cs_ioctl+0x3d9/0x5630 [amdgpu] ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] ? __kmem_cache_free+0xbc/0x2e0 ? mark_lock+0x101/0x16e0 ? __lock_acquire+0xe54/0x59f0 ? kasan_save_stack+0x3f/0x50 ? __pfx_lock_release+0x10/0x10 ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] drm_ioctl_kernel+0x1f8/0x3d0 ? __pfx_drm_ioctl_kernel+0x10/0x10 drm_ioctl+0x4c1/0xaa0 ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] ? __pfx_drm_ioctl+0x10/0x10 ? _raw_spin_unlock_irqrestore+0x62/0x80 ? lockdep_hardirqs_on+0x7d/0x100 ? 
_raw_spin_unlock_irqrestore+0x4b/0x80 amdgpu_drm_ioctl+0xce/0x1b0 [amdgpu] __x64_sys_ioctl+0x12d/0x1a0 do_syscall_64+0x5c/0x90 ? do_syscall_64+0x68/0x90 ? lockdep_hardirqs_on+0x7d/0x100 ? do_syscall_64+0x68/0x90 ? do_syscall_64+0x68/0x90 ? lockdep_hardirqs_on+0x7d/0x100 ? do_syscall_64+0x68/0x90 ? asm_exc_page_fault+0x22/0x30 ? lockdep_hardirqs_on+0x7d/0x100 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fb8a270881d Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00 RSP: 002b:00000000467ad060 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000467ad358 RCX: 00007fb8a270881d RDX: 00000000467ad140 RSI: 00000000c0186444 RDI: 000000000000005a RBP: 00000000467ad0b0 R08: 00007fb7f00d3eb0 R09: 00000000467ad100 R10: 00007fb88c68fb20 R11: 0000000000000246 R12: 00000000467ad140 R13: 00000000c0186444 R14: 000000000000005a R15: 00007fb7f00d3e50 </TASK> ================================================================== Eventually, the games listed above stop working; they hang after the following kernel warning: general protection fault, probably for non-canonical address 0xdffffc000000000f: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000078-0x000000000000007f] CPU: 15 PID: 31587 Comm: ForzaHorizon4.e Tainted: G B W L ------- --- 6.3.0-0.rc6.49.fc39.x86_64+debug #1 Hardware name: System manufacturer System Product Name/ROG STRIX X570-I GAMING, BIOS 4601 02/02/2023 RIP: 0010:drm_sched_job_cleanup+0xa7/0x290 [gpu_sched] Code: d6 01 00 00 4c 8b 75 20 be 04 00 00 00 4d 8d 66 78 4c 89 e7 e8 ba 4d 4e c9 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <0f> b6 14 02 4c 89 e0 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 8a RSP: 0018:ffffc9003676f5a8 EFLAGS: 00010216 RAX: dffffc0000000000 RBX: ffff88816f81f020 RCX: 0000000000000001 RDX: 000000000000000f RSI: 
0000000000000008 RDI: ffffffff9053e5e0 RBP: ffff88816f81f000 R08: 0000000000000001 R09: ffffffff9053e5e7 R10: fffffbfff20a7cbc R11: 6e696c6261736944 R12: 0000000000000078 R13: 1ffff92006cedeb5 R14: 0000000000000000 R15: ffffc9003676f870 FS: 000000004680f6c0(0000) GS:ffff888fa5c00000(0000) knlGS:0000000029910000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb854d6f010 CR3: 000000017b2d6000 CR4: 0000000000350ee0 Call Trace: <TASK> ? __pfx_drm_sched_job_cleanup+0x10/0x10 [gpu_sched] ? slab_free_freelist_hook+0x11e/0x1d0 ? amdgpu_cs_parser_fini+0x363/0x5a0 [amdgpu] amdgpu_job_free+0x40/0x1b0 [amdgpu] amdgpu_cs_parser_fini+0x3c9/0x5a0 [amdgpu] ? __pfx_amdgpu_cs_parser_fini+0x10/0x10 [amdgpu] amdgpu_cs_ioctl+0x3d9/0x5630 [amdgpu] ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] ? __kmem_cache_free+0xbc/0x2e0 ? mark_lock+0x101/0x16e0 ? __lock_acquire+0xe54/0x59f0 ? kasan_save_stack+0x3f/0x50 ? __pfx_lock_release+0x10/0x10 ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] drm_ioctl_kernel+0x1f8/0x3d0 ? __pfx_drm_ioctl_kernel+0x10/0x10 drm_ioctl+0x4c1/0xaa0 ? __pfx_amdgpu_cs_ioctl+0x10/0x10 [amdgpu] ? __pfx_drm_ioctl+0x10/0x10 ? _raw_spin_unlock_irqrestore+0x62/0x80 ? lockdep_hardirqs_on+0x7d/0x100 ? _raw_spin_unlock_irqrestore+0x4b/0x80 amdgpu_drm_ioctl+0xce/0x1b0 [amdgpu] __x64_sys_ioctl+0x12d/0x1a0 do_syscall_64+0x5c/0x90 ? do_syscall_64+0x68/0x90 ? lockdep_hardirqs_on+0x7d/0x100 ? do_syscall_64+0x68/0x90 ? do_syscall_64+0x68/0x90 ? lockdep_hardirqs_on+0x7d/0x100 ? do_syscall_64+0x68/0x90 ? asm_exc_page_fault+0x22/0x30 ? 
lockdep_hardirqs_on+0x7d/0x100 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7fb8a270881d Code: 04 25 28 00 00 00 48 89 45 c8 31 c0 48 8d 45 10 c7 45 b0 10 00 00 00 48 89 45 b8 48 8d 45 d0 48 89 45 c0 b8 10 00 00 00 0f 05 <89> c2 3d 00 f0 ff ff 77 1a 48 8b 45 c8 64 48 2b 04 25 28 00 00 00 RSP: 002b:00000000467ad060 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00000000467ad358 RCX: 00007fb8a270881d RDX: 00000000467ad140 RSI: 00000000c0186444 RDI: 000000000000005a RBP: 00000000467ad0b0 R08: 00007fb7f00d3eb0 R09: 00000000467ad100 R10: 00007fb88c68fb20 R11: 0000000000000246 R12: 00000000467ad140 R13: 00000000c0186444 R14: 000000000000005a R15: 00007fb7f00d3e50 </TASK> Modules linked in: uinput rfcomm snd_seq_dummy snd_hrtimer nf_conntrack_netbios_ns nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 ip_set nf_tables nfnetlink qrtr bnep sunrpc binfmt_misc mt76x2u mt76x2_common mt76x02_usb iwlmvm snd_hda_codec_realtek mt76_usb intel_rapl_msr snd_hda_codec_generic snd_hda_codec_hdmi intel_rapl_common mt76x02_lib mt76 snd_hda_intel edac_mce_amd snd_intel_dspcfg cpi snd_usb_audio snd_hda_codec mac80211 kvm_amd snd_usbmidi_lib btusb snd_hda_core snd_rawmidi snd_hwdep mc btrtl kvm btbcm btintel snd_seq libarc4 iwlwifi btmtk snd_seq_device vfat eeepc_wmi fat bluetooth asus_ec_sensors snd_pcm asus_wmi irqbypass ledtrig_audio _keymap snd_timer xpad platform_profile wmi_bmof ff_memless rapl joydev pcspkr snd k10temp i2c_piix4 soundcore rfkill acpi_cpufreq loop zram amdgpu drm_ttm_helper ttm video iommu_v2 drm_buddy gpu_sched drm_display_helper crct10dif_pclmul ucsi_ccg crc32_pclmul crc32c_intel typec_ucsi polyval_clmulni polyval_generic typec ghash_clmulni_intel cec ccp sha512_ssse3 sp5100_tco igb nvme nvme_core dca i2c_algo_bit nvme_common wmi ip6_tables ip_tables ---[ end trace 0000000000000000 ]--- RIP: 
0010:drm_sched_job_cleanup+0xa7/0x290 [gpu_sched] Code: d6 01 00 00 4c 8b 75 20 be 04 00 00 00 4d 8d 66 78 4c 89 e7 e8 ba 4d 4e c9 4c 89 e2 48 b8 00 00 00 00 00 fc ff df 48 c1 ea 03 <0f> b6 14 02 4c 89 e0 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 8a RSP: 0018:ffffc9003676f5a8 EFLAGS: 00010216 RAX: dffffc0000000000 RBX: ffff88816f81f020 RCX: 0000000000000001 RDX: 000000000000000f RSI: 0000000000000008 RDI: ffffffff9053e5e0 RBP: ffff88816f81f000 R08: 0000000000000001 R09: ffffffff9053e5e7 R10: fffffbfff20a7cbc R11: 6e696c6261736944 R12: 0000000000000078 R13: 1ffff92006cedeb5 R14: 0000000000000000 R15: ffffc9003676f870 FS: 000000004680f6c0(0000) GS:ffff888fa5c00000(0000) knlGS:0000000029910000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fb854d6f010 CR3: 000000017b2d6000 CR4: 0000000000350ee0 Demonstration: https://youtu.be/ysRc4TXuBQI I would be happy to join in testing patches that would fix this. I attached a full kernel log here. -- Best Regards, Mike Gavrilov.
Attachment:
BUG-KASAN-null-ptr-deref-in-drm_sched_job_cleanup+0x96.tar.xz
Description: Binary data
Attachment:
BUG-KASAN-null-ptr-deref-in-drm_sched_job_cleanup+0x96-2.tar.xz
Description: application/xz
Attachment:
BUG-KASAN-null-ptr-deref-in-drm_sched_job_cleanup+0x96-3.tar.xz
Description: application/xz