+
/**
* amdgpu_device_gpu_recover - reset the asic and recover scheduler
*
@@ -3765,11 +3803,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
struct amdgpu_hive_info *hive = NULL;
struct amdgpu_device *tmp_adev = NULL;
int i, r = 0;
+ bool in_ras_intr = amdgpu_ras_intr_triggered();
need_full_reset = job_signaled = false;
INIT_LIST_HEAD(&device_list);
- dev_info(adev->dev, "GPU reset begin!\n");
+ dev_info(adev->dev, "GPU %s begin!\n", in_ras_intr ? "jobs stop":"reset");
cancel_delayed_work_sync(&adev->delayed_init_work);
@@ -3799,7 +3838,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* Build list of devices to reset */
if (adev->gmc.xgmi.num_physical_nodes > 1) {
if (!hive) {
- amdgpu_device_unlock_adev(adev);
+ amdgpu_device_unlock_adev(adev, false);
return -ENODEV;
}
@@ -3824,7 +3863,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
/* disable ras on ALL IPs */
- if (amdgpu_device_ip_need_full_reset(tmp_adev))
+ if (!in_ras_intr && amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev);
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
@@ -3834,10 +3873,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
continue;
drm_sched_stop(&ring->sched, job ? &job->base : NULL);
+
+ if (in_ras_intr)
+ amdgpu_stop_all_jobs_on_sched(&ring->sched);
}
}
+ if (in_ras_intr)
+ goto skip_hw_reset;
+
/*
* Must check guilty signal here since after this point all old
* HW fences are force signaled.
@@ -3902,34 +3947,37 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* Post ASIC reset for all devs .*/
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
- for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
- struct amdgpu_ring *ring = tmp_adev->rings[i];
- if (!ring || !ring->sched.thread)
- continue;
+ if (!in_ras_intr) {
+ for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+ struct amdgpu_ring *ring = tmp_adev->rings[i];
- /* No point to resubmit jobs if we didn't HW reset*/
- if (!tmp_adev->asic_reset_res && !job_signaled)
- drm_sched_resubmit_jobs(&ring->sched);
+ if (!ring || !ring->sched.thread)
+ continue;
- drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
- }
+ /* No point to resubmit jobs if we didn't HW reset*/
+ if (!tmp_adev->asic_reset_res && !job_signaled)
+ drm_sched_resubmit_jobs(&ring->sched);
- if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
- drm_helper_resume_force_mode(tmp_adev->ddev);
- }
+ drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
+ }
- tmp_adev->asic_reset_res = 0;
+ if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
+ drm_helper_resume_force_mode(tmp_adev->ddev);
+ }
- if (r) {
- /* bad news, how to tell it to userspace ? */
- dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
- amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
- } else {
- dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
+ tmp_adev->asic_reset_res = 0;
+
+ if (r) {
+ /* bad news, how to tell it to userspace ? */
+ dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
+ amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
+ } else {
+ dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
+ }
}
- amdgpu_device_unlock_adev(tmp_adev);
+ amdgpu_device_unlock_adev(tmp_adev, in_ras_intr);
}
if (hive)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 151d7f2..757fd6d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -40,6 +40,8 @@
#include "amdgpu_amdkfd.h"
+#include "amdgpu_ras.h"
+
/*
* KMS wrapper.
* - 3.0.0 - initial driver
@@ -1179,6 +1181,9 @@ amdgpu_pci_shutdown(struct pci_dev *pdev)
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = dev->dev_private;
+ if (amdgpu_ras_intr_triggered())
+ return;
+
/* if we are running in a VM, make sure the device
* torn down properly on reboot/shutdown.
* unfortunately we can't detect certain
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index da2143d..ced766c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1046,6 +1046,12 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
/* Ensure IB tests are run on ring */
flush_delayed_work(&adev->delayed_init_work);
+
+ if (amdgpu_ras_intr_triggered()) {
+ DRM_ERROR("RAS Intr triggered, device disabled!!");
+ return -EHWPOISON;
+ }
+
file_priv->driver_priv = NULL;
r = pm_runtime_get_sync(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2d5897a..086e6df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -24,6 +24,8 @@
#include <linux/debugfs.h>
#include <linux/list.h>
#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/syscalls.h>
#include "amdgpu.h"
#include "amdgpu_ras.h"
#include "amdgpu_atomfirmware.h"
@@ -64,6 +66,9 @@ const char *ras_block_string[] = {
/* inject address is 52 bits */
#define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52)
+
+atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
+
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
uint64_t offset, uint64_t size,
struct amdgpu_bo **bo_ptr);
@@ -80,7 +85,7 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
ssize_t s;
char val[128];
- if (amdgpu_ras_error_query(obj->adev, &info))
+ if (amdgpu_ras_error_query(obj->adev, &info, false))
return -EINVAL;
s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
@@ -188,6 +193,10 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
return 0;
}
+
+static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+ struct ras_common_if *head);
+
/**
* DOC: AMDGPU RAS debugfs control interface
*
@@ -304,7 +313,7 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
.head = obj->head,
};
- if (amdgpu_ras_error_query(obj->adev, &info))
+ if (amdgpu_ras_error_query(obj->adev, &info, false))
return -EINVAL;
return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
@@ -591,7 +600,7 @@ static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
/* query/inject/cure begin */
int amdgpu_ras_error_query(struct amdgpu_device *adev,
- struct ras_query_if *info)
+ struct ras_query_if *info, bool print)
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
struct ras_err_data err_data = {0, 0, 0, NULL};
@@ -627,12 +636,14 @@ int amdgpu_ras_error_query(struct amdgpu_device *adev,
info->ue_count = obj->err_data.ue_count;
info->ce_count = obj->err_data.ce_count;
- if (err_data.ce_count)
+ if (err_data.ce_count || print) {
dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
obj->err_data.ce_count, ras_block_str(info->head.block));
- if (err_data.ue_count)
+ }
+ if (err_data.ue_count || print) {
dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
obj->err_data.ue_count, ras_block_str(info->head.block));
+ }
return 0;
}
@@ -702,7 +713,7 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
.head = obj->head,
};
- if (amdgpu_ras_error_query(adev, &info))
+ if (amdgpu_ras_error_query(adev, &info, true))
return -EINVAL;
data.ce_count += info.ce_count;
@@ -1718,3 +1729,10 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
return 0;
}
+
+void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
+{
+ if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
+ DRM_WARN("RAS event of type ERREVENT_ATHUB_INTERRUPT detected! Stopping all GPU jobs.\n");
+ }
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 5a0df73..c0e22af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -587,7 +587,7 @@ void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
struct ras_common_if *head);
int amdgpu_ras_error_query(struct amdgpu_device *adev,
- struct ras_query_if *info);
+ struct ras_query_if *info, bool print);
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
struct ras_inject_if *info);
@@ -600,4 +600,14 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
struct ras_dispatch_if *info);
+
+extern atomic_t amdgpu_ras_in_intr;
+
+static inline bool amdgpu_ras_intr_triggered(void)
+{
+ return !!atomic_read(&amdgpu_ras_in_intr);
+}
+
+void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
+
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index b2c86a0..e7a83f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -5669,10 +5669,12 @@ static int gfx_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry)
{
/* TODO ue will trigger an interrupt. */
- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
- if (adev->gfx.funcs->query_ras_error_count)
- adev->gfx.funcs->query_ras_error_count(adev, err_data);
- amdgpu_ras_reset_gpu(adev, 0);
+ if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+ if (adev->gfx.funcs->query_ras_error_count)
+ adev->gfx.funcs->query_ras_error_count(adev, err_data);
+ amdgpu_ras_reset_gpu(adev, 0);
+ }
return AMDGPU_RAS_SUCCESS;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 43b4fbc..87a66c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -243,18 +243,20 @@ static int gmc_v9_0_process_ras_data_cb(struct amdgpu_device *adev,
struct ras_err_data *err_data,
struct amdgpu_iv_entry *entry)
{
- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
- if (adev->umc.funcs->query_ras_error_count)
- adev->umc.funcs->query_ras_error_count(adev, err_data);
- /* umc query_ras_error_address is also responsible for clearing
- * error status
- */
- if (adev->umc.funcs->query_ras_error_address)
- adev->umc.funcs->query_ras_error_address(adev, err_data);
+ if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+ if (adev->umc.funcs->query_ras_error_count)
+ adev->umc.funcs->query_ras_error_count(adev, err_data);
+ /* umc query_ras_error_address is also responsible for clearing
+ * error status
+ */
+ if (adev->umc.funcs->query_ras_error_address)
+ adev->umc.funcs->query_ras_error_address(adev, err_data);
- /* only uncorrectable error needs gpu reset */
- if (err_data->ue_count)
- amdgpu_ras_reset_gpu(adev, 0);
+ /* only uncorrectable error needs gpu reset */
+ if (err_data->ue_count)
+ amdgpu_ras_reset_gpu(adev, 0);
+ }
return AMDGPU_RAS_SUCCESS;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 367f9d6..545990c 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -30,6 +30,7 @@
#include "nbio/nbio_7_4_0_smn.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include <uapi/linux/kfd_ioctl.h>
+#include "amdgpu_ras.h"
#define smnNBIF_MGCG_CTRL_LCLK 0x1013a21c
@@ -329,6 +330,8 @@ static void nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
BIF_DOORBELL_INT_CNTL,
RAS_CNTLR_INTERRUPT_CLEAR, 1);
WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+ amdgpu_ras_global_ras_isr(adev);
}
}
@@ -344,6 +347,8 @@ static void nbio_v7_4_handle_ras_err_event_athub_intr_no_bifring(struct amdgpu_d
BIF_DOORBELL_INT_CNTL,
RAS_ATHUB_ERR_EVENT_INTERRUPT_CLEAR, 1);
WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
+
+ amdgpu_ras_global_ras_isr(adev);
}
}
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 956432f..438e504 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1972,24 +1972,26 @@ static int sdma_v4_0_process_ras_data_cb(struct amdgpu_device *adev,
uint32_t err_source;
int instance;
- instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
- if (instance < 0)
- return 0;
+ if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
+ instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
+ if (instance < 0)
+ return 0;
- switch (entry->src_id) {
- case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
- err_source = 0;
- break;
- case SDMA0_4_0__SRCID__SDMA_ECC:
- err_source = 1;
- break;
- default:
- return 0;
- }
+ switch (entry->src_id) {
+ case SDMA0_4_0__SRCID__SDMA_SRAM_ECC:
+ err_source = 0;
+ break;
+ case SDMA0_4_0__SRCID__SDMA_ECC:
+ err_source = 1;
+ break;
+ default:
+ return 0;
+ }
- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
- amdgpu_ras_reset_gpu(adev, 0);
+ amdgpu_ras_reset_gpu(adev, 0);
+ }
return AMDGPU_RAS_SUCCESS;
}