[PATCH] drm/amdgpu: annotate a false positive locking dependency

Dennis Li <Dennis.Li@xxxxxxx> · Thu, 6 Aug 2020 09:24:04 +0800

[  264.483189] ======================================================
[  264.483487] WARNING: possible circular locking dependency detected
[  264.483781] 5.6.0-deli-v5.6-2848-g3f3109b0e75f #1 Tainted: G           OE
[  264.484076] ------------------------------------------------------
[  264.484370] kworker/39:1/567 is trying to acquire lock:
[  264.484663] ffffffffc15df4b0 (mgpu_info.mutex){+.+.}, at: amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu]
[  264.485081]
               but task is already holding lock:
[  264.485670] ffff965fd31647a0 (&adev->reset_sem){++++}, at: amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
[  264.486074]
               which lock already depends on the new lock.

[  264.487043]
               the existing dependency chain (in reverse order) is:
[  264.487710]
               -> #3 (&adev->reset_sem){++++}:
[  264.488400]        down_write+0x49/0x120
[  264.488783]        amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]
[  264.489179]        amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[  264.489544]        process_one_work+0x29e/0x630
[  264.489910]        worker_thread+0x3c/0x3f0
[  264.490279]        kthread+0x12f/0x150
[  264.490649]        ret_from_fork+0x3a/0x50
[  264.491020]
               -> #2 (&tmp->hive_lock){+.+.}:
[  264.491764]        __mutex_lock+0x95/0xa20
[  264.492137]        mutex_lock_nested+0x1b/0x20
[  264.492553]        amdgpu_get_xgmi_hive+0x352/0x400 [amdgpu]
[  264.492972]        amdgpu_xgmi_add_device+0xb8/0x460 [amdgpu]
[  264.493387]        amdgpu_device_init+0x12fb/0x1e10 [amdgpu]
[  264.493807]        amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
[  264.494226]        amdgpu_pci_probe+0x11d/0x200 [amdgpu]
[  264.494617]        local_pci_probe+0x47/0xa0
[  264.494998]        work_for_cpu_fn+0x1a/0x30
[  264.495369]        process_one_work+0x29e/0x630
[  264.495746]        worker_thread+0x22b/0x3f0
[  264.496124]        kthread+0x12f/0x150
[  264.496504]        ret_from_fork+0x3a/0x50
[  264.496876]
               -> #1 (xgmi_mutex){+.+.}:
[  264.497596]        __mutex_lock+0x95/0xa20
[  264.497954]        mutex_lock_nested+0x1b/0x20
[  264.498346]        amdgpu_get_xgmi_hive+0x38/0x400 [amdgpu]
[  264.498741]        amdgpu_xgmi_set_pstate+0x10/0x20 [amdgpu]
[  264.499126]        amdgpu_device_ip_late_init+0x219/0x230 [amdgpu]
[  264.499506]        amdgpu_device_init+0x1401/0x1e10 [amdgpu]
[  264.499886]        amdgpu_driver_load_kms+0x5c/0x2c0 [amdgpu]
[  264.500264]        amdgpu_pci_probe+0x11d/0x200 [amdgpu]
[  264.500608]        local_pci_probe+0x47/0xa0
[  264.500945]        work_for_cpu_fn+0x1a/0x30
[  264.501276]        process_one_work+0x29e/0x630
[  264.501603]        worker_thread+0x22b/0x3f0
[  264.501927]        kthread+0x12f/0x150
[  264.502239]        ret_from_fork+0x3a/0x50
[  264.502541]
               -> #0 (mgpu_info.mutex){+.+.}:
[  264.503126]        __lock_acquire+0x13ec/0x16e0
[  264.503411]        lock_acquire+0xb8/0x1c0
[  264.503693]        __mutex_lock+0x95/0xa20
[  264.504019]        mutex_lock_nested+0x1b/0x20
[  264.504354]        amdgpu_unregister_gpu_instance+0x1d/0xc0 [amdgpu]
[  264.504691]        amdgpu_device_gpu_recover+0x360/0x1030 [amdgpu]
[  264.505029]        amdgpu_ras_do_recovery+0x159/0x190 [amdgpu]
[  264.505334]        process_one_work+0x29e/0x630
[  264.505617]        worker_thread+0x3c/0x3f0
[  264.505897]        kthread+0x12f/0x150
[  264.506176]        ret_from_fork+0x3a/0x50
[  264.506453]
               other info that might help us debug this:

[  264.507267] Chain exists of:
                 mgpu_info.mutex --> &tmp->hive_lock --> &adev->reset_sem

[  264.508102]  Possible unsafe locking scenario:

[  264.508664]        CPU0                    CPU1
[  264.508945]        ----                    ----
[  264.509221]   lock(&adev->reset_sem);
[  264.509524]                                lock(&tmp->hive_lock);
[  264.509818]                                lock(&adev->reset_sem);
[  264.510111]   lock(mgpu_info.mutex);
[  264.510401]
                *** DEADLOCK ***

[  264.511224] 4 locks held by kworker/39:1/567:
[  264.511499]  #0: ffff961ff5c1d348 ((wq_completion)events){+.+.}, at: process_one_work+0x21f/0x630
[  264.511793]  #1: ffffafa90e233e58 ((work_completion)(&con->recovery_work)){+.+.}, at: process_one_work+0x21f/0x630
[  264.512100]  #2: ffffffffc16245d0 (&tmp->hive_lock){+.+.}, at: amdgpu_device_gpu_recover+0xb0/0x1030 [amdgpu]
[  264.512450]  #3: ffff965fd31647a0 (&adev->reset_sem){++++}, at: amdgpu_device_gpu_recover+0x264/0x1030 [amdgpu]

Remove the lock(&hive->hive_lock) out of amdgpu_get_xgmi_hive,
to disable its locking dependency on xgmi_mutex.

Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx>
Change-Id: I2d9d80ee23f9f9ac6ce9e1b9e5e1b2b3530f5bdd

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62ecac97fbd2..6c572db42d92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2840,7 +2840,7 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
 {
 	struct amdgpu_device *adev =
 		container_of(__work, struct amdgpu_device, xgmi_reset_work);
-	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
 	/* It's a bug to not have a hive within this function */
 	if (WARN_ON(!hive))
@@ -4283,7 +4283,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * We always reset all schedulers for device and all devices for XGMI
 	 * hive so that should take care of them too.
 	 */
-	hive = amdgpu_get_xgmi_hive(adev, false);
+	hive = amdgpu_get_xgmi_hive(adev);
 	if (hive) {
 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 5680f7eafcb1..5cd42740461c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1514,7 +1514,7 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	struct amdgpu_device *remote_adev = NULL;
 	struct amdgpu_device *adev = ras->adev;
 	struct list_head device_list, *device_list_handle =  NULL;
-	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 
 	/* Build list of devices to query RAS related errors */
 	if  (hive && adev->gmc.xgmi.num_physical_nodes > 1)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 67a756f4337b..19fd5ce3e857 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -336,7 +336,7 @@ static void amdgpu_xgmi_sysfs_rem_dev_info(struct amdgpu_device *adev,
 
 
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock)
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
 {
 	int i;
 	struct amdgpu_hive_info *tmp;
@@ -349,8 +349,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	for (i = 0 ; i < hive_count; ++i) {
 		tmp = &xgmi_hives[i];
 		if (tmp->hive_id == adev->gmc.xgmi.hive_id) {
-			if (lock)
-				mutex_lock(&tmp->hive_lock);
 			mutex_unlock(&xgmi_mutex);
 			return tmp;
 		}
@@ -374,9 +372,6 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 	mutex_init(&tmp->hive_lock);
 	atomic_set(&tmp->in_reset, 0);
 	task_barrier_init(&tmp->tb);
-
-	if (lock)
-		mutex_lock(&tmp->hive_lock);
 	tmp->pstate = AMDGPU_XGMI_PSTATE_UNKNOWN;
 	tmp->hi_req_gpu = NULL;
 	/*
@@ -392,7 +387,7 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lo
 int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate)
 {
 	int ret = 0;
-	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
+	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
 	struct amdgpu_device *request_adev = hive->hi_req_gpu ?
 						hive->hi_req_gpu : adev;
 	bool is_hi_req = pstate == AMDGPU_XGMI_PSTATE_MAX_VEGA20;
@@ -515,7 +510,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		adev->gmc.xgmi.node_id = adev->gmc.xgmi.physical_node_id + 16;
 	}
 
-	hive = amdgpu_get_xgmi_hive(adev, 1);
+	hive = amdgpu_get_xgmi_hive(adev);
 	if (!hive) {
 		ret = -EINVAL;
 		dev_err(adev->dev,
@@ -524,6 +519,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
 		goto exit;
 	}
 
+	mutex_lock(&hive->hive_lock);
+
 	top_info = &adev->psp.xgmi_context.top_info;
 
 	list_add_tail(&adev->gmc.xgmi.head, &hive->device_list);
@@ -587,10 +584,11 @@ int amdgpu_xgmi_remove_device(struct amdgpu_device *adev)
 	if (!adev->gmc.xgmi.supported)
 		return -EINVAL;
 
-	hive = amdgpu_get_xgmi_hive(adev, 1);
+	hive = amdgpu_get_xgmi_hive(adev);
 	if (!hive)
 		return -EINVAL;
 
+	mutex_lock(&hive->hive_lock);
 	task_barrier_rem_task(&hive->tb);
 	amdgpu_xgmi_sysfs_rem_dev_info(adev, hive);
 	mutex_unlock(&hive->hive_lock);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
index 61720cd4a1ee..ae3249c1aafe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
@@ -51,7 +51,7 @@ struct amdgpu_pcs_ras_field {
 	uint32_t pcs_err_shift;
 };
 
-struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev, int lock);
+struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
 int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
 int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx