Re: [PATCH] drm/amdgpu: race issue when jobs on 2 ring timeout

Andrey Grodzovsky <Andrey.Grodzovsky@xxxxxxx> · Wed, 20 Jan 2021 11:39:54 -0500

On 1/20/21 9:12 AM, Horace Chen wrote:
Fix a racing issue when jobs on 2 rings timeout simultaneously.

If 2 rings timed out at the same time, the
amdgpu_device_gpu_recover will be reentered. Then the
adev->gmc.xgmi.head will be grabbed by 2 local linked list,
which may cause wild pointer issue in iterating.

lock the device earily to prevent the node be added to 2
different lists.

also increase karma for the skipped job since the job is also
timed out and should be guilty.

Signed-off-by: Horace Chen <horace.chen@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 70 +++++++++++++++++++---
  1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 4d434803fb49..d59d3182ac2d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4460,6 +4460,46 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
  	up_write(&adev->reset_sem);
  }
  
+/*
+ * to lockup a list of amdgpu devices in a hive safely, if not a hive
+ * with multiple nodes, it will be same as amdgpu_device_lock_adev.
+ *
+ * unlock won't require roll back.
+ */
+static bool amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
+{
+	struct amdgpu_device *tmp_adev = NULL;
+
+	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+		if (!hive) {
+			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
+			return false;
+		}
+		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+			if (!amdgpu_device_lock_adev(tmp_adev, hive))
+				goto roll_back;
+		}
+		return true;
+	} else {
+		return amdgpu_device_lock_adev(adev, hive);
+	}
+roll_back:
+	if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
+		/*
+		 * if the lockup iteration break in the middle of a hive,
+		 * it may means there may has a race issue,
+		 * or a hive device locked up independently.
+		 * we may be in trouble and may not,
+		 * so will try to roll back the lock and give out a warnning.
+		 */
+		dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
+		list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
+			amdgpu_device_unlock_adev(tmp_adev);
+		}
+	}
+	return false;
+}
+
  static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
  {
  	struct pci_dev *p = NULL;
@@ -4573,11 +4613,32 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
  				job ? job->base.id : -1, hive->hive_id);
  			amdgpu_put_xgmi_hive(hive);
+			if (job)
+				drm_sched_increase_karma(&job->base);
  			return 0;
  		}
  		mutex_lock(&hive->hive_lock);
  	}
  
+	/*
+	 * lock the device before we try to operate the linked list
+	 * if didn't get the device lock, don't touch the linked list since
+	 * others may iterating it.
+	 */
+	if (!amdgpu_device_lock_hive_adev(adev, hive)) {
+		dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
+					job ? job->base.id : -1);
+
+		if (adev->gmc.xgmi.num_physical_nodes > 1 && !hive)
+			r = -ENODEV;
+		else
+			r = 0;


You can just change amdgpu_device_lock_hive_adev return type to int instead
of code duplication, maybe returning EAGAIN for actual locking failure.

Andrey


+		/* even we skipped this reset, still need to set the job to guilty */
+		if (job)
+			drm_sched_increase_karma(&job->base);
+		goto skip_recovery;
+	}
+
  	/*
  	 * Build list of devices to reset.
  	 * In case we are in XGMI hive mode, resort the device list
@@ -4585,8 +4646,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  	 */
  	INIT_LIST_HEAD(&device_list);
  	if (adev->gmc.xgmi.num_physical_nodes > 1) {
-		if (!hive)
-			return -ENODEV;
  		if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
  			list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
  		device_list_handle = &hive->device_list;
@@ -4597,13 +4656,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  
  	/* block all schedulers and reset given job's ring */
  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-		if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
-			dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
-				  job ? job->base.id : -1);
-			r = 0;
-			goto skip_recovery;
-		}
-
  		/*
  		 * Try to put the audio codec into suspend state
  		 * before gpu reset started.
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx