Re: [PATCH Review 1/1] drm/amdgpu: Support setting recover method

Christian König <ckoenig.leichtzumerken@xxxxxxxxx> · Thu, 11 Apr 2024 13:17:09 +0200

Am 11.04.24 um 13:11 schrieb Stanley.Yang:
Leave the amdgpu GPU recover get operation unchanged and
add an amdgpu GPU recover set operation that selects the
reset method; only mode1 and mode2 are currently supported.

Well, I don't think setting this from userspace is valid.

The reset method to use is determined by the hardware and environment
(e.g. SRIOV, passthrough, whatever) and can't simply be chosen.

Regards,
Christian.


Signed-off-by: Stanley.Yang <Stanley.Yang@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c  | 37 +++++++++++++++++++---
  3 files changed, 37 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9c62552bec34..c82976b2b977 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1151,6 +1151,9 @@ struct amdgpu_device {
  	bool                            debug_largebar;
  	bool                            debug_disable_soft_recovery;
  	bool                            debug_use_vram_fw_buf;
+
+	/* Used to set gpu reset method */
+	int                             recover_method;
  };
  
  static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3204b8f6edeb..8411a793be18 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3908,6 +3908,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
  	else
  		adev->asic_type = flags & AMD_ASIC_MASK;
  
+	adev->recover_method = AMD_RESET_METHOD_NONE;
  	adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
  	if (amdgpu_emu_mode == 1)
  		adev->usec_timeout *= 10;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 10832b470448..e388a50d11d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -965,9 +965,37 @@ static int gpu_recover_get(void *data, u64 *val)
  	return 0;
  }
  
+static int gpu_recover_set(void *data, u64 val)
+{
+	struct amdgpu_device *adev = (struct amdgpu_device *)data;
+	struct drm_device *dev = adev_to_drm(adev);
+	int r;
+
+	/* Only mode1 and mode2 are accepted for now; TODO: support other methods */
+	if (val == AMD_RESET_METHOD_MODE1 ||
+		val == AMD_RESET_METHOD_MODE2)
+		adev->recover_method = val;
+	else
+		adev->recover_method = AMD_RESET_METHOD_NONE;
+
+	r = pm_runtime_get_sync(dev->dev);
+	if (r < 0) {
+		pm_runtime_put_autosuspend(dev->dev);
+		return 0;
+	}
+
+	if (amdgpu_reset_domain_schedule(adev->reset_domain, &adev->reset_work))
+		flush_work(&adev->reset_work);
+
+	pm_runtime_mark_last_busy(dev->dev);
+	pm_runtime_put_autosuspend(dev->dev);
+
+	return 0;
+}
+
  DEFINE_SHOW_ATTRIBUTE(amdgpu_debugfs_fence_info);
-DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, gpu_recover_get, NULL,
-			 "%lld\n");
+DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gpu_recover_fops, gpu_recover_get,
+			 gpu_recover_set, "%lld\n");
  
  static void amdgpu_debugfs_reset_work(struct work_struct *work)
  {
@@ -978,9 +1006,10 @@ static void amdgpu_debugfs_reset_work(struct work_struct *work)
  
  	memset(&reset_context, 0, sizeof(reset_context));
  
-	reset_context.method = AMD_RESET_METHOD_NONE;
+	reset_context.method = adev->recover_method;
  	reset_context.reset_req_dev = adev;
  	set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+	adev->recover_method = AMD_RESET_METHOD_NONE;
  
  	amdgpu_device_gpu_recover(adev, NULL, &reset_context);
  }
@@ -999,7 +1028,7 @@ void amdgpu_debugfs_fence_init(struct amdgpu_device *adev)
  	if (!amdgpu_sriov_vf(adev)) {
  
  		INIT_WORK(&adev->reset_work, amdgpu_debugfs_reset_work);
-		debugfs_create_file("amdgpu_gpu_recover", 0444, root, adev,
+		debugfs_create_file("amdgpu_gpu_recover", 0666, root, adev,
  				    &amdgpu_debugfs_gpu_recover_fops);
  	}
  #endif