Re: [PATCH] drm/amdgpu: Do a basic health check before reset

Felix Kuehling <felix.kuehling@xxxxxxx> · Wed, 13 Mar 2024 15:49:15 -0400

On 2024-03-13 5:41, Lijo Lazar wrote:
Check if the device is present on the bus before trying to recover. It
could be that the device itself is lost from the bus in some hang
situations.

Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++
  1 file changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1e9454e6e4cb..b37113b79483 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5536,6 +5536,23 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
  
  }
  
+static int amdgpu_device_health_check(struct list_head *device_list_handle)
+{
+	struct amdgpu_device *tmp_adev;
+	int ret = 0;
+	u32 status;
+
+	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+		pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status);
+		if (PCI_POSSIBLE_ERROR(status)) {
+			dev_err(tmp_adev->dev, "device lost from bus!");
+			ret = -ENODEV;

You could just return here. What's the point of looking for other 
devices if you're going to return an error anyway?

Regards,
  Felix


+		}
+	}
+
+	return ret;
+}
+
  /**
   * amdgpu_device_gpu_recover - reset the asic and recover scheduler
   *
@@ -5607,6 +5624,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  		device_list_handle = &device_list;
  	}
  
+	if (!amdgpu_sriov_vf(adev)) {
+		r = amdgpu_device_health_check(device_list_handle);
+		if (r)
+			goto end_reset;
+	}
+
  	/* We need to lock reset domain only once both for XGMI and single device */
  	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
  				    reset_list);
@@ -5772,6 +5795,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
  					    reset_list);
  	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
  
+end_reset:
  	if (hive) {
  		mutex_unlock(&hive->hive_lock);
  		amdgpu_put_xgmi_hive(hive);