On 1/8/2025 2:26 PM, Jiang Liu wrote:If some GPU device failed to probe, `rmmod amdgpu` will trigger a use after free bug related to amdgpu_driver_release_kms() as: 2024-12-26 16:17:45 [16002.085540] BUG: kernel NULL pointer dereference, address: 0000000000000000 2024-12-26 16:17:45 [16002.093792] #PF: supervisor read access in kernel mode 2024-12-26 16:17:45 [16002.099993] #PF: error_code(0x0000) - not-present page 2024-12-26 16:17:45 [16002.106188] PGD 0 P4D 0 2024-12-26 16:17:45 [16002.109464] Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI 2024-12-26 16:17:45 [16002.115372] CPU: 2 PID: 14375 Comm: rmmod Kdump: loaded Tainted: G W E 6.10.0+ #2 2024-12-26 16:17:45 [16002.125577] Hardware name: Alibaba Alibaba Cloud ECS/Alibaba Cloud ECS, BIOS 3.0.ES.AL.P.087.05 04/07/2024 2024-12-26 16:17:45 [16002.136858] RIP: 0010:drm_sched_fini+0x3f/0xe0 [gpu_sched] 2024-12-26 16:17:45 [16002.143463] Code: 60 c6 87 be 00 00 00 01 e8 ce e0 90 d8 48 8d bb 80 00 00 00 e8 c2 e0 90 d8 8b 43 20 85 c0 74 51 45 31 e4 48 8b 43 28 4d 63 ec <4a> 8b 2c e8 48 89 ef e8 f5 0e 59 d9 48 8b 45 10 48 8d 55 10 48 39 2024-12-26 16:17:45 [16002.164992] RSP: 0018:ffffb570dbbb7da8 EFLAGS: 00010246 2024-12-26 16:17:45 [16002.171316] RAX: 0000000000000000 RBX: ffff96b0fdadc878 RCX: 0000000000000000 2024-12-26 16:17:46 [16002.179784] RDX: 000fffffffe00000 RSI: 0000000000000000 RDI: ffff96b0fdadc8f8 2024-12-26 16:17:46 [16002.188252] RBP: ffff96b0fdadc800 R08: ffff97abbd035040 R09: ffffffff9ac52540 2024-12-26 16:17:46 [16002.196722] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 2024-12-26 16:17:46 [16002.205179] R13: 0000000000000000 R14: ffff96b0fdadfc00 R15: 0000000000000000 2024-12-26 16:17:46 [16002.213648] FS: 00007f2737000740(0000) GS:ffff97abbd100000(0000) knlGS:0000000000000000 2024-12-26 16:17:46 [16002.223189] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 2024-12-26 16:17:46 [16002.230103] CR2: 0000000000000000 CR3: 000000011be3a005 CR4: 0000000000f70ef0 2024-12-26 16:17:46 [16002.238581] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 2024-12-26 16:17:46 [16002.247053] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 e024se+0x3c/0x90 [amdxcp] 2024-12-26 16:17:46 [16002.337645] __do_sys_delete_module.constprop.0+0x176/0x310 2024-12-26 16:17:46 [16002.344324] do_syscall_64+0x5d/0x170 2024-12-26 16:17:46 [16002.348858] entry_SYSCALL_64_after_hwframe+0x76/0x7e 2024-12-26 16:17:46 [16002.354956] RIP: 0033:0x7f2736a620cb-12-26
Fix it by removing xcp drm devices when failed to probe GPU devices.
Signed-off-by: Jiang Liu <gerry@xxxxxxxxxxxxxxxxx> Tested-by: Shuo Liu <shuox.liu@xxxxxxxxxxxxxxxxx> Reviewed-by: Lijo Lazar <lijo.lazar@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 47 +++++++++++++++++++--- drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h | 4 +- 5 files changed, 47 insertions(+), 9 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 5ff53a3b9851..510074a9074e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6682,7 +6682,7 @@ void amdgpu_device_halt(struct amdgpu_device *adev) struct pci_dev *pdev = adev->pdev; struct drm_device *ddev = adev_to_drm(adev);
- amdgpu_xcp_dev_unplug(adev); + amdgpu_xcp_dev_deregister(adev); drm_dev_unplug(ddev);
amdgpu_irq_disable_all(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 62de668e9ff8..41d1b06be600 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2435,7 +2435,7 @@ amdgpu_pci_remove(struct pci_dev *pdev) struct drm_device *dev = pci_get_drvdata(pdev); struct amdgpu_device *adev = drm_to_adev(dev);
- amdgpu_xcp_dev_unplug(adev); + amdgpu_xcp_dev_deregister(adev); amdgpu_gmc_prepare_nps_mode_change(adev); drm_dev_unplug(dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index d2a046736edd..be9147eb8308 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -1508,6 +1508,7 @@ void amdgpu_driver_release_kms(struct drm_device *dev) struct amdgpu_device *adev = drm_to_adev(dev);
amdgpu_device_fini_sw(adev); + amdgpu_xcp_mgr_fini(adev);
Suggest to move this inside fini_sw() pci_set_drvdata(adev->pdev, NULL); }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c index e209b5e101df..62dd5287808b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c @@ -283,6 +283,33 @@ static int amdgpu_xcp_dev_alloc(struct amdgpu_device *adev) return 0; }
+static void amdgpu_xcp_dev_free(struct amdgpu_device *adev) +{ + struct drm_device *p_ddev; + int i; + + if (!adev->xcp_mgr) + return; + + for (i = 1; i < MAX_XCP; i++) { + if (!adev->xcp_mgr->xcp[i].ddev) + break; + + // Restore and free the original drm_device. + p_ddev = adev->xcp_mgr->xcp[i].ddev; + p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev; + p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev; + p_ddev->driver = adev->xcp_mgr->xcp[i].driver; + p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager;
Now that there are more calls, this doesn't make sense here. What aboutmoving the redirection along with register() (I guess it matters fromthat point onwards) and undoing it (restore back saved values) alongwith deregister()? With that, there won't be a need to have registeredflag. You may only need to check if xcp rdev/pdev is not NULL.
Good point, this makes code more clear.
+ amdgpu_xcp_drm_dev_free(p_ddev); + + adev->xcp_mgr->xcp[i].ddev = NULL; + } + + adev->xcp_mgr->xcp->ddev = NULL; +} + + int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode, int init_num_xcps, struct amdgpu_xcp_mgr_funcs *xcp_funcs) @@ -310,6 +337,13 @@ int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode, return amdgpu_xcp_dev_alloc(adev); }
+void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev) +{ + amdgpu_xcp_dev_free(adev); + kfree(adev->xcp_mgr); + adev->xcp_mgr = NULL;
Thanks for adding this.Thanks,Lijo+} + int amdgpu_xcp_get_partition(struct amdgpu_xcp_mgr *xcp_mgr, enum AMDGPU_XCP_IP_BLOCK ip, int instance) { @@ -359,12 +393,14 @@ int amdgpu_xcp_dev_register(struct amdgpu_device *adev, ret = drm_dev_register(adev->xcp_mgr->xcp[i].ddev, ent->driver_data); if (ret) return ret; + + adev->xcp_mgr->xcp[i].registered = true; }
return 0; }
-void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev) +void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev) { struct drm_device *p_ddev; int i; @@ -377,11 +413,10 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev) break;
p_ddev = adev->xcp_mgr->xcp[i].ddev; - drm_dev_unplug(p_ddev); - p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev; - p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev; - p_ddev->driver = adev->xcp_mgr->xcp[i].driver; - p_ddev->vma_offset_manager = adev->xcp_mgr->xcp[i].vma_offset_manager; + if (adev->xcp_mgr->xcp[i].registered) { + drm_dev_unplug(p_ddev); + adev->xcp_mgr->xcp[i].registered = false; + } } }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h index b63f53242c57..be22d4398463 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h @@ -101,6 +101,7 @@ struct amdgpu_xcp { uint8_t id; uint8_t mem_id; bool valid; + bool registered; atomic_t ref_cnt; struct drm_device *ddev; struct drm_device *rdev; @@ -155,6 +156,7 @@ int amdgpu_xcp_resume(struct amdgpu_xcp_mgr *xcp_mgr, int xcp_id);
int amdgpu_xcp_mgr_init(struct amdgpu_device *adev, int init_mode, int init_xcps, struct amdgpu_xcp_mgr_funcs *xcp_funcs); +void amdgpu_xcp_mgr_fini(struct amdgpu_device *adev); int amdgpu_xcp_init(struct amdgpu_xcp_mgr *xcp_mgr, int num_xcps, int mode); int amdgpu_xcp_query_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, u32 flags); int amdgpu_xcp_switch_partition_mode(struct amdgpu_xcp_mgr *xcp_mgr, int mode); @@ -168,7 +170,7 @@ int amdgpu_xcp_get_inst_details(struct amdgpu_xcp *xcp,
int amdgpu_xcp_dev_register(struct amdgpu_device *adev, const struct pci_device_id *ent); -void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev); +void amdgpu_xcp_dev_deregister(struct amdgpu_device *adev); int amdgpu_xcp_open_device(struct amdgpu_device *adev, struct amdgpu_fpriv *fpriv, struct drm_file *file_priv);
|