Re: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case

Christian König <ckoenig.leichtzumerken@xxxxxxxxx> · Mon, 21 Oct 2019 11:11:31 +0200

Am 21.10.19 um 11:08 schrieb Chen, Guchun:
The reboot operation for RAS recovery is a common debugfs setting,
so it should not live in the ras_ctrl node or require the user to
name an IP block when entering it. Add a new auto_reboot node in
the RAS debugfs directory to achieve this.

We need some justification why this can't be a module parameter instead.

For example, write something like "we want to control reboot behavior on a
per-device basis".

Apart from that, this looks like a nice cleanup to me.

Regards,
Christian.


Signed-off-by: Guchun Chen <guchun.chen@xxxxxxx>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 ++++++++++++++++++++++---
  1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6220394521e4..3adcd29feb5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
  		op = 1;
  	else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
  		op = 2;
-	else if (sscanf(str, "reboot %32s", block_name) == 1)
-		op = 3;
  	else if (str[0] && str[1] && str[2] && str[3])
  		/* ascii string, but commands are not matched. */
  		return -EINVAL;
@@ -223,7 +221,6 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
   * - 0: disable RAS on the block. Take ::head as its data.
   * - 1: enable RAS on the block. Take ::head as its data.
   * - 2: inject errors on the block. Take ::inject as its data.
- * - 3: reboot on unrecoverable error
   *
   * How to use the interface?
   * programs:
@@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
  		/* data.inject.address is offset instead of absolute gpu address */
  		ret = amdgpu_ras_error_inject(adev, &data.inject);
  		break;
-	case 3:
-		amdgpu_ras_get_context(adev)->reboot = true;
-		break;
  	default:
  		ret = -EINVAL;
  		break;
@@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user
  	return ret == 1 ? size : -EIO;
  }
  
+/**
+ * DOC: AMDGPU RAS debugfs auto reboot interface
+ *
+ * After an uncorrectable error happens, GPU recovery will be scheduled.
+ * Because GPU recovery is known to sometimes fail to bring the GPU back,
+ * this interface gives the user a direct way to have the system reboot
+ * automatically when ERREVENT_ATHUB_INTERRUPT is generated. In that case
+ * the normal GPU recovery routine will never be called.
+ *
+ * Enable auto_reboot:
+ *
+ *	echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ * Revert auto_reboot:
+ *
+ *	echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot
+ *
+ * Any other value is rejected with -EINVAL.
+ */
+static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f,
+	const char __user *buf, size_t size, loff_t *pos)
+{
+	struct amdgpu_device *adev =
+		(struct amdgpu_device *)file_inode(f)->i_private;
+	char tmp[8] = {0};
+	int value = -1;
+
+	/* Keep the last byte free so tmp stays NUL-terminated. */
+	if (size != simple_write_to_buffer(tmp, sizeof(tmp) - 1, pos,
+					   buf, size))
+		return -EINVAL;
+
+	/* Only 0 and 1 are valid; do not report success for anything else. */
+	if (kstrtoint(tmp, 10, &value) || (value != 0 && value != 1))
+		return -EINVAL;
+
+	amdgpu_ras_get_context(adev)->reboot = (value == 1);
+	return size;
+}
+
  static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
  	.owner = THIS_MODULE,
  	.read = NULL,
@@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
  	.llseek = default_llseek
  };
  
+static const struct file_operations amdgpu_ras_debugfs_reboot_ops = {
+	.owner = THIS_MODULE,	/* fops for the ras/auto_reboot debugfs node */
+	.read = NULL,		/* no read handler is provided */
+	.write = amdgpu_ras_debugfs_reboot_write,	/* parses the 0/1 input */
+	.llseek = default_llseek
+};
+
  /**
   * DOC: AMDGPU RAS sysfs Error Count Interface
   *
@@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
  				adev, &amdgpu_ras_debugfs_ctrl_ops);
  	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
  				adev, &amdgpu_ras_debugfs_eeprom_ops);
+	debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
+				adev, &amdgpu_ras_debugfs_reboot_ops);
  }
  
  void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx