On 3/14/2024 1:19 AM, Felix Kuehling wrote: > > On 2024-03-13 5:41, Lijo Lazar wrote: >> Check if the device is present in the bus before trying to recover. It >> could be that device itself is lost from the bus in some hang >> situations. >> >> Signed-off-by: Lijo Lazar <lijo.lazar@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 24 ++++++++++++++++++++++ >> 1 file changed, 24 insertions(+) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 1e9454e6e4cb..b37113b79483 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -5536,6 +5536,23 @@ static inline void >> amdgpu_device_stop_pending_resets(struct amdgpu_device *adev) >> } >> +static int amdgpu_device_health_check(struct list_head >> *device_list_handle) >> +{ >> + struct amdgpu_device *tmp_adev; >> + int ret = 0; >> + u32 status; >> + >> + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { >> + pci_read_config_dword(tmp_adev->pdev, PCI_COMMAND, &status); >> + if (PCI_POSSIBLE_ERROR(status)) { >> + dev_err(tmp_adev->dev, "device lost from bus!"); >> + ret = -ENODEV; > > You could just return here. What's the point of looking for other > devices if you're going to return an error anyway? > This is for the XGMI case; the error is primarily for informational purposes, so that we know which devices in the hive got into a bad state. 
Thanks, Lijo > Regards, > Felix > > >> + } >> + } >> + >> + return ret; >> +} >> + >> /** >> * amdgpu_device_gpu_recover - reset the asic and recover scheduler >> * >> @@ -5607,6 +5624,12 @@ int amdgpu_device_gpu_recover(struct >> amdgpu_device *adev, >> device_list_handle = &device_list; >> } >> + if (!amdgpu_sriov_vf(adev)) { >> + r = amdgpu_device_health_check(device_list_handle); >> + if (r) >> + goto end_reset; >> + } >> + >> /* We need to lock reset domain only once both for XGMI and >> single device */ >> tmp_adev = list_first_entry(device_list_handle, struct >> amdgpu_device, >> reset_list); >> @@ -5772,6 +5795,7 @@ int amdgpu_device_gpu_recover(struct >> amdgpu_device *adev, >> reset_list); >> amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); >> +end_reset: >> if (hive) { >> mutex_unlock(&hive->hive_lock); >> amdgpu_put_xgmi_hive(hive);