In case of RAS error allow user configure auto system reboot through ras_ctrl. This is also part of the temproray work around for the RAS hang problem. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 ++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 10 +++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 3 files changed, 28 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c9825ae..e26f2e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3760,6 +3760,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, int i, r = 0; bool in_ras_intr = amdgpu_ras_intr_triggered(); + /* + * Flush RAM to disk so that after reboot + * the user can read log and see why the system rebooted. + * + * Using user mode app call instead of kernel APIs such as + * ksys_sync_helper for backward comparability with earlier + * kernels into which this is also intended. + */ + if (in_ras_intr && amdgpu_ras_get_context(adev)->reboot) { + char *envp[] = { "HOME=/", NULL }; + char *argv[] = { "/bin/sync", NULL }; + + DRM_WARN("Emergency reboot."); + + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + emergency_restart(); + } + need_full_reset = job_signaled = false; INIT_LIST_HEAD(&device_list); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 1cc34de..bbcfb4f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -30,6 +30,7 @@ #include "amdgpu_ras.h" #include "amdgpu_atomfirmware.h" #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h" +#include <linux/kmod.h> const char *ras_error_string[] = { "none", @@ -154,6 +155,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; + else if (sscanf(str, "reboot %32s", block_name) == 1) + op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; @@ -287,6 +290,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * /* data.inject.address is offset instead of absolute gpu address */ ret = amdgpu_ras_error_inject(adev, &data.inject); break; + case 3: + amdgpu_ras_get_context(adev)->reboot = true; + break; default: ret = -EINVAL; break; @@ -1733,6 +1739,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev) void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev) { if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) { - DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n"); + DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected!\n"); + + amdgpu_ras_reset_gpu(adev, false); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 3ec2a87..a83ec99 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -333,6 +333,7 @@ struct amdgpu_ras { struct mutex recovery_lock; uint32_t flags; + bool reboot; }; struct ras_fs_data { -- 2.7.4 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx