The reboot operation for RAS recovery is a common debugfs setting, so it should not live in the ras_ctrl node, where the user has to supply an IP block name that is irrelevant to it. Add a new auto_reboot node in the RAS debugfs directory instead. Signed-off-by: Guchun Chen <guchun.chen@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 55 ++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6220394521e4..3adcd29feb5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -153,8 +153,6 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; - else if (sscanf(str, "reboot %32s", block_name) == 1) - op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -223,7 +221,6 @@ static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev, * - 0: disable RAS on the block. Take ::head as its data. * - 1: enable RAS on the block. Take ::head as its data. * - 2: inject errors on the block. Take ::inject as its data. - * - 3: reboot on unrecoverable error * * How to use the interface? * programs: @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; - case 3: - amdgpu_ras_get_context(adev)->reboot = true; - break; default: ret = -EINVAL; break; @@ -346,6 +340,46 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user return ret == 1 ? size : -EIO; } +/** + * DOC: AMDGPU RAS debugfs auto reboot interface + * + * After one uncorrectable error happens, GPU recovery will be scheduled. 
+ * Due to the known problem in GPU recovery failing to bring GPU back, this + * interface provides one direct way to user to reboot system automatically + * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal GPU recovery + * routine will never be called. + * + * Enable auto_reboot: + * + * echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot + * + * Revert auto_reboot: + * + * echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot + * + */ +static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f, + const char __user *buf, size_t size, loff_t *pos) +{ + struct amdgpu_device *adev = + (struct amdgpu_device *)file_inode(f)->i_private; + char tmp[8] = {0}; + int value = -1; + + if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size)) + return -EINVAL; + + if (kstrtoint(tmp, 10, &value)) + return -EINVAL; + + if (value == 1) + amdgpu_ras_get_context(adev)->reboot = true; + else if (value == 0) + amdgpu_ras_get_context(adev)->reboot = false; + + return size; +} + static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { .owner = THIS_MODULE, .read = NULL, @@ -360,6 +394,13 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { .llseek = default_llseek }; +static const struct file_operations amdgpu_ras_debugfs_reboot_ops = { + .owner = THIS_MODULE, + .read = NULL, + .write = amdgpu_ras_debugfs_reboot_write, + .llseek = default_llseek +}; + /** * DOC: AMDGPU RAS sysfs Error Count Interface * @@ -1037,6 +1078,8 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) adev, &amdgpu_ras_debugfs_ctrl_ops); debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir, adev, &amdgpu_ras_debugfs_eeprom_ops); + debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir, + adev, &amdgpu_ras_debugfs_reboot_ops); } void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, -- 2.17.1 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx 
https://lists.freedesktop.org/mailman/listinfo/amd-gfx