[ 264.483189] ====================================================== [ 264.483487] WARNING: possible circular locking dependency detected [ 264.483781] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G OE [ 264.484076] ------------------------------------------------------ [ 264.484370] kworker/39:1/567 is trying to acquire lock: [ 264.484663] ffffffffc15df4b0 (mgpu_info.mutex){+.+.}, at: amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu] [ 264.485081] but task is already holding lock: [ 264.485670] ffff965fd31647a0 (&adev->reset_sem){++++}, at: amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu] [ 264.486074] which lock already depends on the new lock. [ 264.487043] the existing dependency chain (in reverse order) is: [ 264.487710] -> #3 (&adev->reset_sem){++++}: [ 264.488400] down_write+0x49/0x120 [ 264.488783] amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu] [ 264.489179] amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [ 264.489544] process_one_work+0x29e/0x630 [ 264.489910] worker_thread+0x3c/0x3f0 [ 264.490279] kthread+0x12f/0x150 [ 264.490649] ret_from_fork+0x3a/0x50 [ 264.491020] -> #2 (&tmp->hive_lock){+.+.}: [ 264.491764] __mutex_lock+0x95/0xa20 [ 264.492137] mutex_lock_nested+0x1b/0x20 [ 264.492553] amdgpu_get_xgmi_hive+0x352/0x400 [amdgpu] [ 264.492972] amdgpu_xgmi_add_device+0xb8/0x460 [amdgpu] [ 264.493387] amdgpu_device_init+0x12fb/0x1e10 [amdgpu] [ 264.493807] amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu] [ 264.494226] amdgpu_pci_probe+0x11d/0x200 [amdgpu] [ 264.494617] local_pci_probe+0x47/0xa0 [ 264.494998] work_for_cpu_fn+0x1a/0x30 [ 264.495369] process_one_work+0x29e/0x630 [ 264.495746] worker_thread+0x22b/0x3f0 [ 264.496124] kthread+0x12f/0x150 [ 264.496504] ret_from_fork+0x3a/0x50 [ 264.496876] -> #1 (xgmi_mutex){+.+.}: [ 264.497596] __mutex_lock+0x95/0xa20 [ 264.497954] mutex_lock_nested+0x1b/0x20 [ 264.498346] amdgpu_get_xgmi_hive+0x38/0x400 [amdgpu] [ 264.498741] amdgpu_xgmi_set_pstate+0x10/0x20 [amdgpu] [ 264.499126] amdgpu_device_ip_late_init+0x219/0x230 [amdgpu] [ 264.499506] amdgpu_device_init+0x1401/0x1e10 [amdgpu] [ 264.499886] amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu] [ 264.500264] amdgpu_pci_probe+0x11d/0x200 [amdgpu] [ 264.500608] local_pci_probe+0x47/0xa0 [ 264.500945] work_for_cpu_fn+0x1a/0x30 [ 264.501276] process_one_work+0x29e/0x630 [ 264.501603] worker_thread+0x22b/0x3f0 [ 264.501927] kthread+0x12f/0x150 [ 264.502239] ret_from_fork+0x3a/0x50 [ 264.502541] -> #0 (mgpu_info.mutex){+.+.}: [ 264.503126] __lock_acquire+0x13ec/0x16e0 [ 264.503411] lock_acquire+0xb8/0x1c0 [ 264.503693] __mutex_lock+0x95/0xa20 [ 264.504019] mutex_lock_nested+0x1b/0x20 [ 264.504354] amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu] [ 264.504691] amdgpu_device_gpu_recover+0x360/0x1030 [amdgpu] [ 264.505029] amdgpu_ras_do_recovery+0x159/0x190 [amdgpu] [ 264.505334] process_one_work+0x29e/0x630 [ 264.505617] worker_thread+0x3c/0x3f0 [ 264.505897] kthread+0x12f/0x150 [ 264.506176] ret_from_fork+0x3a/0x50 [ 264.506453] other info that might help us debug this: [ 264.507267] Chain exists of: mgpu_info.mutex --> &tmp->hive_lock --> &adev->reset_sem [ 264.508102] Possible unsafe locking scenario: [ 264.508664] CPU0 CPU1 [ 264.508945] ---- ---- [ 264.509221] lock(&adev->reset_sem); [ 264.509524] lock(&tmp->hive_lock); [ 264.509818] lock(&adev->reset_sem); [ 264.510111] lock(mgpu_info.mutex); [ 264.510401] *** DEADLOCK *** [ 264.511224] 4 locks held by kworker/39:1/567: [ 264.511499] #0: ffff961ff5c1d348 ((wq_completion)events){+.+.}, at: process_one_work+0x21f/0x630 [ 264.511793] #1: ffffafa90e233e58 ((work_completion)(&con->recovery_work)){+.+.}, at: process_one_work+0x21f/0x630 [ 264.512100] #2: ffffffffc16245d0 (&tmp->hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xb0/0x1030 [amdgpu] [ 264.512450] #3: ffff965fd31647a0 (&adev->reset_sem){++++}, at: amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu] Remove the lock(&hive->hive_lock) out of amdgpu_get_xgmi_hive, to disable its locking dependency on xgmi_mutex. Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx> Change-Id: I2d9d80ee23f9f9ac6ce9e1b9e5e1b2b3530f5bdd diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 62ecac97fbd2..6c572db42d92 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2840,7 +2840,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work) { struct amdgpu_device *adev = container_of(__work, struct amdgpu_device, xgmi_reset_work); - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* It's a bug to not have a hive within this function */ if (WARN_ON(!hive)) @@ -4283,7 +4283,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, * We always reset all schedulers for device and all devices for XGMI * hive so that should take care of them too. */ - hive = amdgpu_get_xgmi_hive(adev, false); + hive = amdgpu_get_xgmi_hive(adev); if (hive) { if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5680f7eafcb1..5cd42740461c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1514,7 +1514,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c index 67a756f4337b..19fd5ce3e857 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c @@ -336,7 +336,7 @@ static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev, -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock) +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) { int i; struct amdgpu_hive_info *tmp; @@ -349,8 +349,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo for (i = 0 ; i < hive_count; ++i) { tmp = &xgmi_hives[i]; if (tmp->hive_id == adev->gmc.xgmi.hive_id) { - if (lock) - mutex_lock(&tmp->hive_lock); mutex_unlock(&xgmi_mutex); return tmp; } @@ -374,9 +372,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo mutex_init(&tmp->hive_lock); atomic_set(&tmp->in_reset, 0); task_barrier_init(&tmp->tb); - - if (lock) - mutex_lock(&tmp->hive_lock); tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN; tmp->hi_req_gpu = NULL; /* @@ -392,7 +387,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate) { int ret = 0; - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0); + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); struct amdgpu_device *request_adev = hive->hi_req_gpu ? hive->hi_req_gpu : adev; bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20; @@ -515,7 +510,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16; } - hive = amdgpu_get_xgmi_hive(adev, 1); + hive = amdgpu_get_xgmi_hive(adev); if (!hive) { ret = -EINVAL; dev_err(adev->dev, @@ -524,6 +519,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev) goto exit; } + mutex_lock(&hive->hive_lock); + top_info = &adev->psp.xgmi_context.top_info; list_add_tail(&adev->gmc.xgmi.head, &hive->device_list); @@ -587,10 +584,11 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev) if (!adev->gmc.xgmi.supported) return -EINVAL; - hive = amdgpu_get_xgmi_hive(adev, 1); + hive = amdgpu_get_xgmi_hive(adev); if (!hive) return -EINVAL; + mutex_lock(&hive->hive_lock); task_barrier_rem_task(&hive->tb); amdgpu_xgmi_sysfs_rem_dev_info(adev, hive); mutex_unlock(&hive->hive_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 61720cd4a1ee..ae3249c1aafe 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -51,7 +51,7 @@ struct amdgpu_pcs_ras_field { uint32_t pcs_err_shift; }; -struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock); +struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev); int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev); int amdgpu_xgmi_add_device(struct amdgpu_device *adev); int amdgpu_xgmi_remove_device(struct amdgpu_device *adev); -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx