Re: [PATCH 07/10] drm/amdgpu: add concurrent baco reset support for XGMI

Andrey Grodzovsky <Andrey.Grodzovsky@xxxxxxx> · Wed, 27 Nov 2019 10:46:20 -0500

On 11/27/19 4:15 AM, Le Ma wrote:
Currently each XGMI node reset wq does not run in parallel because the same work
item bound to the same cpu runs in sequence. So change to bind the xgmi_reset_work
item to different cpus.

It's not the same work item; see more below.



XGMI requires all nodes to enter baco within very close proximity before
any node exits baco. So schedule the xgmi_reset_work wq twice, for baco enter
and baco exit respectively.

The default reset code path and methods do not change for vega20 production:
   - baco reset without xgmi/ras
   - psp reset with xgmi/ras

To enable baco for the XGMI/RAS case, both conditions below are needed:
   - amdgpu_ras_enable=2
   - baco-supported smu firmware

The case where PSP reset and baco reset coexist within an XGMI hive is not
taken into consideration.

Change-Id: I9c08cf90134f940b42e20d2129ff87fba761c532
Signed-off-by: Le Ma <le.ma@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  2 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 78 ++++++++++++++++++++++++++----
  2 files changed, 70 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index d120fe5..08929e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -998,6 +998,8 @@ struct amdgpu_device {
  	int				pstate;
  	/* enable runtime pm on the device */
  	bool                            runpm;
+
+	bool				in_baco;
  };
  
  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bd387bb..71abfe9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2654,7 +2654,13 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
  	struct amdgpu_device *adev =
  		container_of(__work, struct amdgpu_device, xgmi_reset_work);
  
-	adev->asic_reset_res =  amdgpu_asic_reset(adev);
+	if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)
+		adev->asic_reset_res = (adev->in_baco == false) ?
+				amdgpu_device_baco_enter(adev->ddev) :
+				amdgpu_device_baco_exit(adev->ddev);
+	else
+		adev->asic_reset_res = amdgpu_asic_reset(adev);
+
  	if (adev->asic_reset_res)
  		DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
  			 adev->asic_reset_res, adev->ddev->unique);
@@ -3796,6 +3802,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
  	struct amdgpu_device *tmp_adev = NULL;
  	bool need_full_reset = *need_full_reset_arg, vram_lost = false;
  	int r = 0;
+	int cpu = smp_processor_id();
  
  	/*
  	 * ASIC reset has to be done on all HGMI hive nodes ASAP
@@ -3803,21 +3810,24 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
  	 */
  	if (need_full_reset) {
  		list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-			/* For XGMI run all resets in parallel to speed up the process */
+			/*
+			 * For XGMI run all resets in parallel to speed up the
+			 * process by scheduling the highpri wq on different
+			 * cpus. For XGMI with baco reset, all nodes must enter
+			 * baco within close proximity before anyone exit.
+			 */
  			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
-				if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))


Note that tmp_adev->xgmi_reset_work (the work item) is per device in the
XGMI hive, so these are not the same work item. I don't see why you need to
explicitly queue them on different CPUs; they should already run in
parallel.

Andrey



+				if (!queue_work_on(cpu, system_highpri_wq,
+						   &tmp_adev->xgmi_reset_work))
  					r = -EALREADY;
+				cpu = cpumask_next(cpu, cpu_online_mask);
  			} else
  				r = amdgpu_asic_reset(tmp_adev);
-
-			if (r) {
-				DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
-					 r, tmp_adev->ddev->unique);
+			if (r)
  				break;
-			}
  		}
  
-		/* For XGMI wait for all PSP resets to complete before proceed */
+		/* For XGMI wait for all work to complete before proceed */
  		if (!r) {
  			list_for_each_entry(tmp_adev, device_list_handle,
  					    gmc.xgmi.head) {
@@ -3826,11 +3836,59 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
  					r = tmp_adev->asic_reset_res;
  					if (r)
  						break;
+					if(AMD_RESET_METHOD_BACO ==
+					   amdgpu_asic_reset_method(tmp_adev))
+						tmp_adev->in_baco = true;
  				}
  			}
  		}
-	}
  
+		/*
+		 * For XGMI with baco reset, need exit baco phase by scheduling
+		 * xgmi_reset_work one more time. PSP reset skips this phase.
+		 * Not assume the situation that PSP reset and baco reset
+		 * coexist within an XGMI hive.
+		 */
+
+		if (!r) {
+			cpu = smp_processor_id();
+			list_for_each_entry(tmp_adev, device_list_handle,
+					    gmc.xgmi.head) {
+				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
+				    && AMD_RESET_METHOD_BACO ==
+				    amdgpu_asic_reset_method(tmp_adev)) {
+					if (!queue_work_on(cpu,
+						system_highpri_wq,
+						&tmp_adev->xgmi_reset_work))
+						r = -EALREADY;
+					if (r)
+						break;
+					cpu = cpumask_next(cpu, cpu_online_mask);
+				}
+			}
+		}
+
+		if (!r) {
+			list_for_each_entry(tmp_adev, device_list_handle,
+					    gmc.xgmi.head) {
+				if (tmp_adev->gmc.xgmi.num_physical_nodes > 1
+				    && AMD_RESET_METHOD_BACO ==
+				    amdgpu_asic_reset_method(tmp_adev)) {
+					flush_work(&tmp_adev->xgmi_reset_work);
+					r = tmp_adev->asic_reset_res;
+					if (r)
+						break;
+					tmp_adev->in_baco = false;
+				}
+			}
+		}
+
+		if (r) {
+			DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
+				 r, tmp_adev->ddev->unique);
+			goto end;
+		}
+	}
  
  	list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
  		if (need_full_reset) {
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx