> -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of > Chen, Guchun > Sent: Monday, October 21, 2019 5:33 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Koenig, Christian > <Christian.Koenig@xxxxxxx>; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Li, Dennis <Dennis.Li@xxxxxxx>; > Grodzovsky, Andrey <Andrey.Grodzovsky@xxxxxxx>; Zhou1, Tao > <Tao.Zhou1@xxxxxxx> > Cc: Li, Candice <Candice.Li@xxxxxxx>; Chen, Guchun > <Guchun.Chen@xxxxxxx> > Subject: [PATCH] drm/amdgpu: refine reboot debugfs operation in ras case > > Ras reboot debugfs node allows user one easy control to avoid gpu recovery > hang problem and directly reboot system per card basis, after ras > uncorrectable error happens. However, it is one common entry, which > should get rid of ras_ctrl node and remove ip dependence when inputting by > user. So add one new auto_reboot node in ras debugfs dir to achieve this. > > v2: in commit mssage, add justification why ras reboot debugfs node is > needed. > > Signed-off-by: Guchun Chen <guchun.chen@xxxxxxx> Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 57 > ++++++++++++++++++++++--- > 1 file changed, 50 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 6220394521e4..6450065b88de 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -153,8 +153,6 @@ static int > amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, > op = 1; > else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) > op = 2; > - else if (sscanf(str, "reboot %32s", block_name) == 1) > - op = 3; > else if (str[0] && str[1] && str[2] && str[3]) > /* ascii string, but commands are not matched. */ > return -EINVAL; > @@ -218,12 +216,11 @@ static struct ras_manager > *amdgpu_ras_find_obj(struct amdgpu_device *adev, > * value to the address. > * > * Second member: struct ras_debug_if::op. > - * It has four kinds of operations. > + * It has three kinds of operations. > * > * - 0: disable RAS on the block. Take ::head as its data. > * - 1: enable RAS on the block. Take ::head as its data. > * - 2: inject errors on the block. Take ::inject as its data. > - * - 3: reboot on unrecoverable error > * > * How to use the interface? > * programs: > @@ -305,9 +302,6 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct > file *f, const char __user * > /* data.inject.address is offset instead of absolute gpu > address */ > ret = amdgpu_ras_error_inject(adev, &data.inject); > break; > - case 3: > - amdgpu_ras_get_context(adev)->reboot = true; > - break; > default: > ret = -EINVAL; > break; > @@ -346,6 +340,46 @@ static ssize_t > amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user > return ret == 1 ? size : -EIO; > } > > +/** > + * DOC: AMDGPU RAS debugfs auto reboot interface > + * > + * After one uncorrectable error happens, GPU recovery will be scheduled. > + * Due to the known problem in GPU recovery failing to bring GPU back, > +this > + * interface provides one direct way to user to reboot system > +automatically > + * in such case within ERREVENT_ATHUB_INTERRUPT generated. Normal > GPU > +recovery > + * routine will never be called. > + * > + * Enable auto_reboot: > + * > + * echo 1 > /sys/kernel/debug/dri/x/ras/auto_reboot > + * > + * Revert auto_reboot: > + * > + * echo 0 > /sys/kernel/debug/dri/x/ras/auto_reboot > + * > + */ > +static ssize_t amdgpu_ras_debugfs_reboot_write(struct file *f, > + const char __user *buf, size_t size, loff_t *pos) { > + struct amdgpu_device *adev = > + (struct amdgpu_device *)file_inode(f)->i_private; > + char tmp[8] = {0}; > + int value = -1; > + > + if (size != simple_write_to_buffer(tmp, sizeof(tmp), pos, buf, size)) > + return -EINVAL; > + > + if (kstrtoint(tmp, 10, &value)) > + return -EINVAL; > + > + if (value == 1) > + amdgpu_ras_get_context(adev)->reboot = true; > + else if (value == 0) > + amdgpu_ras_get_context(adev)->reboot = false; > + > + return size; > +} > + > static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = { > .owner = THIS_MODULE, > .read = NULL, > @@ -360,6 +394,13 @@ static const struct file_operations > amdgpu_ras_debugfs_eeprom_ops = { > .llseek = default_llseek > }; > > +static const struct file_operations amdgpu_ras_debugfs_reboot_ops = { > + .owner = THIS_MODULE, > + .read = NULL, > + .write = amdgpu_ras_debugfs_reboot_write, > + .llseek = default_llseek > +}; > + > /** > * DOC: AMDGPU RAS sysfs Error Count Interface > * > @@ -1037,6 +1078,8 @@ static void > amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev) > adev, &amdgpu_ras_debugfs_ctrl_ops); > debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, > con->dir, > adev, &amdgpu_ras_debugfs_eeprom_ops); > + debugfs_create_file("auto_reboot", S_IWUGO | S_IRUGO, con->dir, > + adev, &amdgpu_ras_debugfs_reboot_ops); > } > > void amdgpu_ras_debugfs_create(struct amdgpu_device *adev, > -- > 2.17.1 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx