Le dim. 15 janv. 2023 à 22:45, David Airlie <airlied@xxxxxxxxxx> a écrit : > > On Thu, Dec 29, 2022 at 12:58 AM Diogo Ivo <diogo.ivo@xxxxxxxxxxxxxxxxxx> wrote: > > > > Hello, > > > > Commit 2541626cfb79 breaks GM20B probe with > > the following kernel log: > > > > [ 2.153892] ------------[ cut here ]------------ > > [ 2.153897] WARNING: CPU: 1 PID: 36 at drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmmgf100.c:273 gf100_vmm_valid+0x2c4/0x390 > > [ 2.153916] Modules linked in: > > [ 2.153922] CPU: 1 PID: 36 Comm: kworker/u8:1 Not tainted 6.1.0+ #1 > > [ 2.153929] Hardware name: Google Pixel C (DT) > > [ 2.153933] Workqueue: events_unbound deferred_probe_work_func > > [ 2.153943] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) > > [ 2.153950] pc : gf100_vmm_valid+0x2c4/0x390 > > [ 2.153959] lr : gf100_vmm_valid+0xb4/0x390 > > [ 2.153966] sp : ffffffc009e134b0 > > [ 2.153969] x29: ffffffc009e134b0 x28: 0000000000000000 x27: ffffffc008fd44c8 > > [ 2.153979] x26: 00000000ffffffea x25: ffffffc0087b98d0 x24: ffffff8080f89038 > > [ 2.153987] x23: ffffff8081fadc08 x22: 0000000000000000 x21: 0000000000000000 > > [ 2.153995] x20: ffffff8080f8a000 x19: ffffffc009e13678 x18: 0000000000000000 > > [ 2.154003] x17: f37a8b93418958e6 x16: ffffffc009f0d000 x15: 0000000000000000 > > [ 2.154011] x14: 0000000000000002 x13: 000000000003a020 x12: ffffffc008000000 > > [ 2.154019] x11: 0000000102913000 x10: 0000000000000000 x9 : 0000000000000000 > > [ 2.154026] x8 : ffffffc009e136d8 x7 : ffffffc008fd44c8 x6 : ffffff80803d0f00 > > [ 2.154034] x5 : 0000000000000000 x4 : ffffff8080f88c00 x3 : 0000000000000010 > > [ 2.154041] x2 : 000000000000000c x1 : 00000000ffffffea x0 : 00000000ffffffea > > [ 2.154050] Call trace: > > [ 2.154053] gf100_vmm_valid+0x2c4/0x390 > > [ 2.154061] nvkm_vmm_map_valid+0xd4/0x204 > > [ 2.154069] nvkm_vmm_map_locked+0xa4/0x344 > > [ 2.154076] nvkm_vmm_map+0x50/0x84 > > [ 2.154083] nvkm_firmware_mem_map+0x84/0xc4 > > [ 2.154094] nvkm_falcon_fw_oneinit+0xc8/0x320 > > [ 2.154101] nvkm_acr_oneinit+0x428/0x5b0 > > [ 2.154109] nvkm_subdev_oneinit_+0x50/0x104 > > [ 2.154114] nvkm_subdev_init_+0x3c/0x12c > > [ 2.154119] nvkm_subdev_init+0x60/0xa0 > > [ 2.154125] nvkm_device_init+0x14c/0x2a0 > > [ 2.154133] nvkm_udevice_init+0x60/0x9c > > [ 2.154140] nvkm_object_init+0x48/0x1b0 > > [ 2.154144] nvkm_ioctl_new+0x168/0x254 > > [ 2.154149] nvkm_ioctl+0xd0/0x220 > > [ 2.154153] nvkm_client_ioctl+0x10/0x1c > > [ 2.154162] nvif_object_ctor+0xf4/0x22c > > [ 2.154168] nvif_device_ctor+0x28/0x70 > > [ 2.154174] nouveau_cli_init+0x150/0x590 > > [ 2.154180] nouveau_drm_device_init+0x60/0x2a0 > > [ 2.154187] nouveau_platform_device_create+0x90/0xd0 > > [ 2.154193] nouveau_platform_probe+0x3c/0x9c > > [ 2.154200] platform_probe+0x68/0xc0 > > [ 2.154207] really_probe+0xbc/0x2dc > > [ 2.154211] __driver_probe_device+0x78/0xe0 > > [ 2.154216] driver_probe_device+0xd8/0x160 > > [ 2.154221] __device_attach_driver+0xb8/0x134 > > [ 2.154226] bus_for_each_drv+0x78/0xd0 > > [ 2.154230] __device_attach+0x9c/0x1a0 > > [ 2.154234] device_initial_probe+0x14/0x20 > > [ 2.154239] bus_probe_device+0x98/0xa0 > > [ 2.154243] deferred_probe_work_func+0x88/0xc0 > > [ 2.154247] process_one_work+0x204/0x40c > > [ 2.154256] worker_thread+0x230/0x450 > > [ 2.154261] kthread+0xc8/0xcc > > [ 2.154266] ret_from_fork+0x10/0x20 > > [ 2.154273] ---[ end trace 0000000000000000 ]--- > > [ 2.154278] nouveau 57000000.gpu: pmu: map -22 > > [ 2.154285] nouveau 57000000.gpu: acr: one-time init failed, -22 > > [ 2.154559] nouveau 57000000.gpu: init failed with -22 > > [ 2.154564] nouveau: DRM-master:00000000:00000080: init failed with -22 > > [ 2.154574] nouveau 57000000.gpu: DRM-master: Device allocation failed: -22 > > [ 2.162905] nouveau: probe of 57000000.gpu failed with error -22 > > > > #regzbot introduced: 2541626cfb79 > > As a quick check can you try changing > > drivers/gpu/drm/nouveau/nvkm/core/firmware.c:nvkm_firmware_mem_target > from NVKM_MEM_TARGET_HOST to NVKM_MEM_TARGET_NCOH ? > > Dave. I'm also reproducing the error on jetson-tx1 running 6.2-rc4 (rawhide nodebug kernel) on Fedora 37 userspace. With this change, the error is different: nouveau: loading out-of-tree module taints kernel. nouveau: module verification failed: signature and/or required key missing - tainting kernel Failed to set up IOMMU for device 57000000.gpu; retaining platform DMA ops nouveau 57000000.gpu: NVIDIA GM20B (12b000a1) nouveau 57000000.gpu: imem: using IOMMU Unable to handle kernel execution of user memory at virtual address 0000000000000000 Mem abort info: ESR = 0x0000000086000004 EC = 0x21: IABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault user pgtable: 4k pages, 48-bit VAs, pgdp=000000013e136000 [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 0000000086000004 [#1] SMP Modules linked in: nouveau(OE+) drm_ttm_helper ttm snd_seq_dummy snd_hrtimer nf_conntrack_netbios_ns nf_conntrack_broadcast nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 rfkill ip_set nf_tables nfnetlink qrtr sunrpc snd_soc_tegra_audio_graph_card snd_soc_audio_graph_card snd_soc_simple_card_utils snd_soc_core snd_compress snd_hda_codec_hdmi ac97_bus snd_hda_tegra snd_pcm_dmaengine snd_hda_codec max77620_thermal vfat snd_hda_core fat snd_hwdep snd_seq cpufreq_dt tegra_xudc snd_seq_device usb_conn_gpio tegra_soctherm snd_pcm udc_core snd_timer snd at24 soundcore zram r8152 mii panel_simple mmc_block tegra_drm drm_dp_aux_bus rtc_max77686 drm_display_helper lp855x_bl cec crct10dif_ce polyval_ce polyval_generic xhci_tegra sdhci_tegra ghash_ce sdhci_pltfm phy_tegra_xusb sdhci host1x ahci_tegra gpio_keys tegra210_emc cqhci rtc_tegra i2c_tegra ip6_tables abrt-dump-journal-oops: Found oopses: 1 abrt-dump-journal-oops: Creating problem directories Can't find a meaningful backtrace for hashing in '.' Preserving oops '.' because DropNotReportableOopses is 'no' Reported 1 kernel oopses to Abrt System encountered a non-fatal error in ??() ip_tables fuse CPU: 3 PID: 15789 Comm: insmod Tainted: G OE ------- --- 6.2.0-0.rc4.31.fc38.aarch64 #1 Hardware name: nvidia,p2371-2180 NVIDIA P2371-2180/NVIDIA P2371-2180, BIOS 2022.10 10/01/2022 pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : 0x0 lr : nvkm_falcon_load_dmem+0x60/0x80 [nouveau] sp : ffff8000141034c0 x29: ffff8000141034c0 x28: ffff0000865d2008 x27: ffff0000c87b8638 x26: 00000000000000ff x25: ffff0000865d20a0 x24: 0000000000000000 x23: ffff0000f4ef34f8 x22: 000000000000002c x21: 0000000000005fd4 x20: ffff800014103534 x19: ffff0000f4ef34b8 x18: ffffffffffffffff x17: 000000040044ffff x16: 00500074b5503510 x15: ffff8000141035d8 x14: ffff0000c87b8506 x13: 0000000000000001 x12: 0000000000000018 x11: 0000000000000001 x10: fffffffffffffec0 x9 : ffff8000091efa90 x8 : ffff800014103560 x7 : 0000000000000000 x6 : 0000000434da8f65 x5 : 0000000000000000 x4 : 0000000000000000 x3 : 000000000000002c x2 : 0000000000005fd4 x1 : ffff800014103534 x0 : ffff0000f4ef34b8 Call trace: 0x0 gm20b_pmu_init+0x70/0x90 [nouveau] nvkm_pmu_init+0x28/0x40 [nouveau] nvkm_subdev_init_+0x68/0x134 [nouveau] nvkm_subdev_init+0x68/0xb0 [nouveau] nvkm_device_init+0x154/0x2ac [nouveau] nvkm_udevice_init+0x68/0xa0 [nouveau] nvkm_object_init+0x50/0x1b4 [nouveau] nvkm_ioctl_new+0x154/0x280 [nouveau] nvkm_ioctl+0xd8/0x230 [nouveau] nvkm_client_ioctl+0x18/0x24 [nouveau] nvif_object_ctor+0xec/0x1a0 [nouveau] nvif_device_ctor+0x30/0x7c [nouveau] nouveau_cli_init+0x144/0x574 [nouveau] nouveau_drm_device_init+0x68/0x2b0 [nouveau] nouveau_platform_device_create+0x98/0xd0 [nouveau] nouveau_platform_probe+0x34/0x90 [nouveau] platform_probe+0x70/0xd0 really_probe+0xc8/0x3e4 __driver_probe_device+0x84/0x190 driver_probe_device+0x44/0x11c __driver_attach+0xf8/0x200 bus_for_each_dev+0x6c/0xac driver_attach+0x2c/0x40 bus_add_driver+0x188/0x250 driver_register+0x80/0x13c __platform_driver_register+0x30/0x3c nouveau_drm_init+0x9c/0x1000 [nouveau] do_one_initcall+0x4c/0x2a0 do_init_module+0x50/0x200 load_module+0x9b0/0xb10 __do_sys_finit_module+0x98/0x100 __arm64_sys_finit_module+0x28/0x34 invoke_syscall+0x78/0x100 el0_svc_common.constprop.0+0x4c/0xf4 do_el0_svc+0x34/0x4c el0_svc+0x34/0x10c el0t_64_sync_handler+0x114/0x120 el0t_64_sync+0x194/0x198 Code: bad PC value ---[ end trace 0000000000000000 ]---