Re: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

Christian König <christian.koenig@xxxxxxx> · Tue, 29 Mar 2022 15:20:55 +0200

Well that is a really bad idea.

Tools should only use sysfs functionality, not debugfs since that isn't 
stable.

And I totally agree to Hawking here. As I wrote on the other mail this 
patch is a no-go.

Regards,
Christian.

Am 29.03.22 um 15:18 schrieb Chai, Thomas:
[AMD Official Use Only]

Sorry for the confusing commit message。
This interface is only for amdgpu ras tool new function. There is no impact on currently existing tools and scripts。

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
Sent: Tuesday, March 29, 2022 4:33 PM
To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Clements, John <John.Clements@xxxxxxx>
Subject: RE: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

[AMD Official Use Only]

I'm not sure I understand the fix correctly - It seems to me it is trying to stop user/test cases that initiate error injection request back-to-back? But anyway, we shouldn't make the change or leverage debugfs for that purpose, and there is no guarantee test scripts/applications will follow the rule as well.

I guess we need to identify the root cause case by case and stop the invalid request in kernel driver.

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@xxxxxxx>
Sent: Tuesday, March 29, 2022 15:39
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Clements, John <John.Clements@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
Subject: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

Some AMDGPU RAS debugfs operations like UE injection can cause gpu reset. Before doing the next debugfs operation, the application should call poll to check if the gpu has finished recovering.

Signed-off-by: yipechai <YiPeng.Chai@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 ++++++++++++++++++++++++-  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 ++++
  2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bbed76b79c8..337e3e247a45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -452,6 +452,12 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
  
  		/* data.inject.address is offset instead of absolute gpu address */
  		ret = amdgpu_ras_error_inject(adev, &data.inject);
+
+		if (!ret && (data.head.type == AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)) {
+			struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+			con->ras_ue_injected = 1;
+		}
  		break;
  	default:
  		ret = -EINVAL;
@@ -464,6 +470,30 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
  	return size;
  }
  
+/**
+ * DOC: Support AMDGPU RAS debugfs poll interface
+ *
+ * Some AMDGPU RAS debugfs operations like UE injection
+ * can cause gpu reset. Before doing the next debugfs
+ * operation, the application should call poll to check
+ * if gpu is in recovering status.
+ */
+static __poll_t amdgpu_ras_debugfs_ctrl_poll(struct file *f, struct
+poll_table_struct *wait) {
+	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	__poll_t mask = 0;
+
+	/* For UE injection, wait for gpu to finish recovery */
+	if (con->ras_ue_injected)
+		poll_wait(f, &con->gpu_ready_wait_wq, wait);
+
+	if (!atomic_read(&con->in_recovery))
+		mask = EPOLLIN | EPOLLRDNORM;
+
+	return mask;
+}
+
  /**
   * DOC: AMDGPU RAS debugfs EEPROM table reset interface
   *
@@ -503,6 +533,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
  
  static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
  	.owner = THIS_MODULE,
+	.poll = amdgpu_ras_debugfs_ctrl_poll,
  	.read = NULL,
  	.write = amdgpu_ras_debugfs_ctrl_write,
  	.llseek = default_llseek
@@ -1837,6 +1868,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
  	if (amdgpu_device_should_recover_gpu(ras->adev))
  		amdgpu_device_gpu_recover(ras->adev, NULL);
  	atomic_set(&ras->in_recovery, 0);
+
+	if (ras->ras_ue_injected) {
+		ras->ras_ue_injected = 0;
+		wake_up_all(&ras->gpu_ready_wait_wq);
+	}
  }
  
  /* alloc/realloc bps array */
@@ -2279,7 +2315,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
  	INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
  	atomic_set(&con->ras_ce_count, 0);
  	atomic_set(&con->ras_ue_count, 0);
-
+	init_waitqueue_head(&con->gpu_ready_wait_wq);
  	con->objs = (struct ras_manager *)(con + 1);
  
  	amdgpu_ras_set_context(adev, con);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 606df8869b89..aea6bbb71501 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -379,6 +379,12 @@ struct amdgpu_ras {
  
  	/* Indicates smu whether need update bad channel info */
  	bool update_channel_flag;
+
+	/* UE injection flag */
+	uint32_t  ras_ue_injected;
+
+	/* Waiting for gpu ready work queue */
+	wait_queue_head_t gpu_ready_wait_wq;
  };
  
  struct ras_fs_data {
--
2.25.1