Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe

Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> · Sat, 6 Mar 2021 01:09:08 -0500

Thanks for explaining this, one thing I still don't understand is why 
you schedule the reset work explicilty in the begining of 
amdgpu_drv_delayed_reset_work_handler and then also call 
amdgpu_do_asic_reset which will do the same thing too. It looks like the 
physical reset will execute twice for each device.
Another thing is, more like improvement suggestion  - currently you 
schedule delayed_reset_work using AMDGPU_RESUME_MS - so i guesss this 
should give enough time for SMU to start ? Is there maybe a way to 
instead poll for SMU start completion and then execute this - some SMU 
status registers maybe ? Just to avoid relying on this arbitrary value.

Andrey

On 2021-03-05 8:37 p.m., Liu, Shaoyun wrote:
[AMD Official Use Only - Internal Distribution Only]

Hi,  Andrey
The existing reset function (amdgpu_device_gpu_recover or amd do_asic _reset) assumed driver already have  the correct hive info . But in my case, it's  not true . The gpus are in a bad state and the XGMI TA  might not functional properly , so driver can  not  get the hive and node info when probe the device .  It means driver even don't know  the device belongs to which hive on a system with multiple hive configuration (ex, 8 gpus in  two hive). The only solution I can think of is let driver trigger the reset on all gpus at the same time after driver do the minimum initialization on the HW ( bring up the  SMU IP)  no matter they belongs to the same hive or not and call amdgpu_xgmi_add_device for each device after re-init .
The 100 ms delay added after the baco reset . I think they can be removed . let me verify it.

Regards
Shaoyun.liu



-----Original Message-----
From: Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>
Sent: Friday, March 5, 2021 2:27 PM
To: Liu, Shaoyun <Shaoyun.Liu@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Subject: Re: [PATCH 5/5] drm/amdgpu: Reset the devices in the XGMI hive duirng probe



On 2021-03-05 12:52 p.m., shaoyunl wrote:
In passthrough configuration, hypervisior will trigger the
SBR(Secondary bus reset) to the devices without sync to each other.
This could cause device hang since for XGMI configuration, all the
devices within the hive need to be reset at a limit time slot. This
serial of patches try to solve this issue by co-operate with new SMU
which will only do minimum house keeping to response the SBR request
but don't do the real reset job and leave it to driver. Driver need to
do the whole sw init and minimum HW init to bring up the SMU and
trigger the reset(possibly BACO) on all the ASICs at the same time

Signed-off-by: shaoyunl <shaoyun.liu@xxxxxxx>
Change-Id: I34e838e611b7623c7ad824704c7ce350808014fc
---
   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  13 +++
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 102 +++++++++++++++------
   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c    |  71 ++++++++++++++
   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h    |   1 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   |   8 +-
   5 files changed, 165 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d46d3794699e..5602c6edee97 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -125,6 +125,10 @@ struct amdgpu_mgpu_info
   	uint32_t			num_gpu;
   	uint32_t			num_dgpu;
   	uint32_t			num_apu;
+
+	/* delayed reset_func for XGMI configuration if necessary */
+	struct delayed_work		delayed_reset_work;
+	bool				pending_reset;
   };
   
   #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH	256
@@ -1124,6 +1128,15 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
   bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
   bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
   
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+				  struct amdgpu_job *job,
+				  bool *need_full_reset_arg);
+
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+			  struct list_head *device_list_handle,
+			  bool *need_full_reset_arg,
+			  bool skip_hw_reset);
+
   int emu_soc_asic_init(struct amdgpu_device *adev);
   
   /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3c35b0c1e710..5b520f70e660 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1220,6 +1220,10 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev)
   		}
   	}
   
+	/* Don't post if we need to reset whole hive on init */
+	if (adev->gmc.xgmi.pending_reset)
+		return false;
+
   	if (adev->has_hw_reset) {
   		adev->has_hw_reset = false;
   		return true;
@@ -2149,6 +2153,9 @@ static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
   			if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
   				continue;
   
+			if (!adev->ip_blocks[i].status.sw)
+				continue;
+
   			/* no need to do the fw loading again if already done*/
   			if (adev->ip_blocks[i].status.hw == true)
   				break;
@@ -2289,7 +2296,10 @@ static int amdgpu_device_ip_init(struct
amdgpu_device *adev)
   
   	if (adev->gmc.xgmi.num_physical_nodes > 1)
   		amdgpu_xgmi_add_device(adev);
-	amdgpu_amdkfd_device_init(adev);
+
+	/* Don't init kfd if whole hive need to be reset during init */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_amdkfd_device_init(adev);
   
   	amdgpu_fru_get_product_info(adev);
   
@@ -2734,6 +2744,16 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
   			adev->ip_blocks[i].status.hw = false;
   			continue;
   		}
+
+		/* skip unnecessary suspend if we do not initialize them yet */
+		if (adev->gmc.xgmi.pending_reset &&
+		    !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+		      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
+			adev->ip_blocks[i].status.hw = false;
+			continue;
+		}
   		/* XXX handle errors */
   		r = adev->ip_blocks[i].version->funcs->suspend(adev);
   		/* XXX handle errors */
@@ -3407,10 +3427,28 @@ int amdgpu_device_init(struct amdgpu_device *adev,
   	 *  E.g., driver was not cleanly unloaded previously, etc.
   	 */
   	if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
-		r = amdgpu_asic_reset(adev);
-		if (r) {
-			dev_err(adev->dev, "asic reset on init failed\n");
-			goto failed;
+		if (adev->gmc.xgmi.num_physical_nodes) {
+			dev_info(adev->dev, "Pending hive reset.\n");
+			adev->gmc.xgmi.pending_reset = true;
+			/* Only need to init necessary block for SMU to handle the reset */
+			for (i = 0; i < adev->num_ip_blocks; i++) {
+				if (!adev->ip_blocks[i].status.valid)
+					continue;
+				if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
+				      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
+					DRM_DEBUG("IP %s disabed for hw_init.\n",
+						adev->ip_blocks[i].version->funcs->name);
+					adev->ip_blocks[i].status.hw = true;
+				}
+			}
+		} else {
+			r = amdgpu_asic_reset(adev);
+			if (r) {
+				dev_err(adev->dev, "asic reset on init failed\n");
+				goto failed;
+			}
   		}
   	}
   
@@ -3541,19 +3579,19 @@ int amdgpu_device_init(struct amdgpu_device *adev,
   	/* enable clockgating, etc. after ib tests, etc. since some blocks require
   	 * explicit gating rather than handling it automatically.
   	 */
-	r = amdgpu_device_ip_late_init(adev);
-	if (r) {
-		dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
-		amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
-		goto failed;
+	if (!adev->gmc.xgmi.pending_reset) {
+		r = amdgpu_device_ip_late_init(adev);
+		if (r) {
+			dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
+			amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
+			goto failed;
+		}
+		/* must succeed. */
+		amdgpu_ras_resume(adev);
+		queue_delayed_work(system_wq, &adev->delayed_init_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
   	}
   
-	/* must succeed. */
-	amdgpu_ras_resume(adev);
-
-	queue_delayed_work(system_wq, &adev->delayed_init_work,
-			   msecs_to_jiffies(AMDGPU_RESUME_MS));
-
   	if (amdgpu_sriov_vf(adev))
   		flush_delayed_work(&adev->delayed_init_work);
   
@@ -3570,6 +3608,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
   	if (amdgpu_device_cache_pci_state(adev->pdev))
   		pci_restore_state(pdev);
   
+	if (adev->gmc.xgmi.pending_reset)
+		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
+				   msecs_to_jiffies(AMDGPU_RESUME_MS));
+
   	return 0;
   
   failed:
@@ -4240,14 +4282,16 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
   }
   
   
-static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
-					struct amdgpu_job *job,
-					bool *need_full_reset_arg)
+int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
+				  struct amdgpu_job *job,
+				  bool *need_full_reset_arg)
   {
   	int i, r = 0;
   	bool need_full_reset  = *need_full_reset_arg;
   
-	amdgpu_debugfs_wait_dump(adev);
+	/* no need to dump if device is not in good state during probe period */
+	if (!adev->gmc.xgmi.pending_reset)
+		amdgpu_debugfs_wait_dump(adev);
   
   	if (amdgpu_sriov_vf(adev)) {
   		/* stop the data exchange thread */ @@ -4293,10 +4337,10 @@ static
int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
   	return r;
   }
   
-static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
-			       struct list_head *device_list_handle,
-			       bool *need_full_reset_arg,
-			       bool skip_hw_reset)
+int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
+			  struct list_head *device_list_handle,
+			  bool *need_full_reset_arg,
+			  bool skip_hw_reset)
   {
   	struct amdgpu_device *tmp_adev = NULL;
   	bool need_full_reset = *need_full_reset_arg, vram_lost = false; @@
-4310,6 +4354,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
   		list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
   			/* For XGMI run all resets in parallel to speed up the process */
   			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
+				tmp_adev->gmc.xgmi.pending_reset = false;
   				if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
   					r = -EALREADY;
   			} else
@@ -4348,10 +4393,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
   	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
   		if (need_full_reset) {
   			/* post card */
-			if (amdgpu_device_asic_init(tmp_adev))
+			r = amdgpu_device_asic_init(tmp_adev);
+			if (r) {
   				dev_warn(tmp_adev->dev, "asic atom init failed!");
-
-			if (!r) {
+			} else {
   				dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
   				r = amdgpu_device_ip_resume_phase1(tmp_adev);
   				if (r)
@@ -4384,6 +4429,9 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
   				 */
   				amdgpu_register_gpu_instance(tmp_adev);
   
+				if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
+					amdgpu_xgmi_add_device(tmp_adev);
+
   				r = amdgpu_device_ip_late_init(tmp_adev);
   				if (r)
   					goto out;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 253c59e0a100..aebe4bc561ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -44,6 +44,7 @@
   #include "amdgpu_amdkfd.h"
   
   #include "amdgpu_ras.h"
+#include "amdgpu_xgmi.h"
   
   /*
    * KMS wrapper.
@@ -167,8 +168,13 @@ uint amdgpu_freesync_vid_mode;
   int amdgpu_reset_method = -1; /* auto */
   int amdgpu_num_kcq = -1;
   
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct
+*work);
+
   struct amdgpu_mgpu_info mgpu_info = {
   	.mutex = __MUTEX_INITIALIZER(mgpu_info.mutex),
+	.delayed_reset_work = __DELAYED_WORK_INITIALIZER(
+			mgpu_info.delayed_reset_work,
+			amdgpu_drv_delayed_reset_work_handler, 0),
   };
   int amdgpu_ras_enable = -1;
   uint amdgpu_ras_mask = 0xffffffff;
@@ -1297,6 +1303,71 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
   	adev->mp1_state = PP_MP1_STATE_NONE;
   }
   
+/**
+ * amdgpu_drv_delayed_reset_work_handler - work handler for reset
+ *
+ * @work: work_struct.
+ */
+static void amdgpu_drv_delayed_reset_work_handler(struct work_struct
+*work) {
+	struct list_head device_list;
+	struct amdgpu_device *adev;
+	int i, r;
+	bool need_full_reset = true;
+
+	mutex_lock(&mgpu_info.mutex);
+	if (mgpu_info.pending_reset == true) {
+		mutex_unlock(&mgpu_info.mutex);
+		return;
+	}
+	mgpu_info.pending_reset = true;
+	mutex_unlock(&mgpu_info.mutex);
+
+	for (i = 0; i < mgpu_info.num_dgpu; i++) {
+		adev = mgpu_info.gpu_ins[i].adev;
+		r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);

Why amdgpu_device_pre_asic_reset is needed ?

+		if (r) {
+			dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
+				r, adev_to_drm(adev)->unique);
+		}
+		if (!queue_work(system_unbound_wq, &adev->xgmi_reset_work))
+			r = -EALREADY;

amdgpu_do_asic_reset bellow will already schedule xgmi_reset_work for this device, what you could do instead is call amdgpu_do_asic_reset for each member of the hive and because there is a task barrier in amdgpu_device_xgmi_reset_func, it will synchronize all the resets to same point in time already.

+	}
+	msleep(100);

What's the 100ms is wiating for ?

+	for (i = 0; i < mgpu_info.num_dgpu; i++) {
+		adev = mgpu_info.gpu_ins[i].adev;
+		adev->gmc.xgmi.pending_reset = false;
+		flush_work(&adev->xgmi_reset_work);
+	}
+
+	msleep(100);

Same as above

+	/* reset function will rebuild the xgmi hive info , clear it now */
+	for (i = 0; i < mgpu_info.num_dgpu; i++)
+		amdgpu_xgmi_remove_device(mgpu_info.gpu_ins[i].adev);
+
+	INIT_LIST_HEAD(&device_list);
+
+	for (i = 0; i < mgpu_info.num_dgpu; i++)
+		list_add_tail(&mgpu_info.gpu_ins[i].adev->reset_list,
+&device_list);
+
+	/* unregister the GPU first, reset function will add them back */
+	list_for_each_entry(adev, &device_list, reset_list)
+		amdgpu_unregister_gpu_instance(adev);
+
+	r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
+	if (r) {
+		DRM_ERROR("reinit gpus failure");
+		return;
+	}
+	for (i = 0; i < mgpu_info.num_dgpu; i++) {
+		adev = mgpu_info.gpu_ins[i].adev;
+		if (!adev->kfd.init_complete)
+			amdgpu_amdkfd_device_init(adev);
+		amdgpu_ttm_set_buffer_funcs_status(adev, true);
+	}
+	return;
+}
+
   static int amdgpu_pmops_suspend(struct device *dev)
   {
   	struct drm_device *drm_dev = dev_get_drvdata(dev); diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index aa0c83776ce0..8c71d84a2fbe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -149,6 +149,7 @@ struct amdgpu_xgmi {
   	struct list_head head;
   	bool supported;
   	struct ras_common_if *ras_if;
+	bool pending_reset;
   };
   
   struct amdgpu_gmc {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 659b385b27b5..b459ef755ea9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -492,7 +492,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
   	if (!adev->gmc.xgmi.supported)
   		return 0;
   
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
   		ret = psp_xgmi_initialize(&adev->psp);
   		if (ret) {
   			dev_err(adev->dev,
@@ -538,7 +539,8 @@ int amdgpu_xgmi_add_device(struct amdgpu_device
*adev)
   
   	task_barrier_add_task(&hive->tb);
   
-	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
+	if (!adev->gmc.xgmi.pending_reset &&
+	    amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_PSP)) {
   		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
   			/* update node list for other device in the hive */
   			if (tmp_adev != adev) {
@@ -567,7 +569,7 @@ int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
   		}
   	}
   
-	if (!ret)
+	if (!ret && !adev->gmc.xgmi.pending_reset)
   		ret = amdgpu_xgmi_sysfs_add_dev_info(adev, hive);
   
   exit_unlock:

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx