Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx>
Cc: John Clements <john.clements@xxxxxxx>
Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx>
Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx>
Reviewed-by: John Clements <john.clements@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 36 +++++++-------
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 66 ++++++++++++++++++++++---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 8 +--
5 files changed, 93 insertions(+), 28 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 3147c1c935c8..a269f778194f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -197,6 +197,7 @@ extern struct amdgpu_mgpu_info mgpu_info;
extern int amdgpu_ras_enable;
extern uint amdgpu_ras_mask;
extern int amdgpu_bad_page_threshold;
+extern uint amdgpu_ras_thread_poll_ms;
extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer;
extern int amdgpu_async_gfx_ring;
extern int amdgpu_mcbp;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index d481a33f4eaf..558e887e99b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -332,12 +332,12 @@ static int amdgpu_ctx_query(struct amdgpu_device *adev,
}
static int amdgpu_ctx_query2(struct amdgpu_device *adev,
- struct amdgpu_fpriv *fpriv, uint32_t id,
- union drm_amdgpu_ctx_out *out)
+ struct amdgpu_fpriv *fpriv, uint32_t id,
+ union drm_amdgpu_ctx_out *out)
{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct amdgpu_ctx *ctx;
struct amdgpu_ctx_mgr *mgr;
- unsigned long ras_counter;
if (!fpriv)
return -EINVAL;
@@ -362,20 +362,22 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
if (atomic_read(&ctx->guilty))
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_GUILTY;
- /*query ue count*/
- /* ras_counter = amdgpu_ras_query_error_count(adev, false); */
- /* /\*ras counter is monotonic increasing*\/ */
- /* if (ras_counter != ctx->ras_counter_ue) { */
- /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE; */
- /* ctx->ras_counter_ue = ras_counter; */
- /* } */
-
- /* /\*query ce count*\/ */
- /* ras_counter = amdgpu_ras_query_error_count(adev, true); */
- /* if (ras_counter != ctx->ras_counter_ce) { */
- /* out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE; */
- /* ctx->ras_counter_ce = ras_counter; */
- /* } */
+ if (con) {
+ int ce_count, ue_count;
+
+ ce_count = atomic_read(&con->ras_ce_count);
+ ue_count = atomic_read(&con->ras_ue_count);
+
+ if (ce_count != ctx->ras_counter_ce) {
+ out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_CE;
+ ctx->ras_counter_ce = ce_count;
+ }
+
+ if (ue_count != ctx->ras_counter_ue) {
+ out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RAS_UE;
+ ctx->ras_counter_ue = ue_count;
+ }
+ }
mutex_unlock(&mgr->lock);
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 877469d288f8..641c374b8525 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -183,6 +183,7 @@ struct amdgpu_mgpu_info mgpu_info = {
int amdgpu_ras_enable = -1;
uint amdgpu_ras_mask = 0xffffffff;
int amdgpu_bad_page_threshold = -1;
+uint amdgpu_ras_thread_poll_ms = 3000;
struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
.timeout_fatal_disable = false,
.period = 0x0, /* default to 0x0 (timeout disable) */
@@ -534,6 +535,15 @@ module_param_named(emu_mode, amdgpu_emu_mode, int, 0444);
MODULE_PARM_DESC(ras_enable, "Enable RAS features on the GPU (0 = disable, 1 = enable, -1 = auto (default))");
module_param_named(ras_enable, amdgpu_ras_enable, int, 0444);
+/**
+ * DOC: ras_thread_poll (uint)
+ * Number of milliseconds between RAS poll for errors.
+ * Valid range 0 and [500,10000]. Set to 0 to disable the thread.
+ * Default: 3000.
+ */
+MODULE_PARM_DESC(ras_thread_poll, "Number of milliseconds between RAS poll for errors. Valid range 0 and [500,10000]. Set to 0 to disable the thread. Default: 3000.");
+module_param_named(ras_thread_poll, amdgpu_ras_thread_poll_ms, uint, 0444);
+
/**
* DOC: ras_mask (uint)
* Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b1c57a5b6e89..30bec289e9ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1043,15 +1043,17 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
}
/* get the total error counts on all IPs */
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
- bool is_ce)
+static void amdgpu_ras_query_error_count(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_manager *obj;
- struct ras_err_data data = {0, 0};
+ int ce_count, ue_count;
if (!adev->ras_enabled || !con)
- return 0;
+ return;
+
+ ce_count = 0;
+ ue_count = 0;
list_for_each_entry(obj, &con->head, node) {
struct ras_query_if info = {
@@ -1059,13 +1061,14 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
};
if (amdgpu_ras_query_error_status(adev, &info))
- return 0;
+ return;
- data.ce_count += info.ce_count;
- data.ue_count += info.ue_count;
+ ce_count += info.ce_count;
+ ue_count += info.ue_count;
}
- return is_ce ? data.ce_count : data.ue_count;
+ atomic_set(&con->ras_ce_count, ce_count);
+ atomic_set(&con->ras_ue_count, ue_count);
}
/* query/inject/cure end */
@@ -2109,6 +2112,49 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
adev->ras_hw_enabled & amdgpu_ras_mask;
}
+static int amdgpu_ras_thread(void *data)
+{
+ struct amdgpu_device *adev = data;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ con->ras_thread_poll_ms = amdgpu_ras_thread_poll_ms;
+ if (con->ras_thread_poll_ms == 0) {
+ atomic_set(&con->ras_ce_count, 0);
+ atomic_set(&con->ras_ue_count, 0);
+ return 0;
+ } else if (con->ras_thread_poll_ms < 500 ||
+ con->ras_thread_poll_ms > 10000) {
+ con->ras_thread_poll_ms = 3000;
+ }
+
+ while (1) {
+ if (kthread_should_stop())
+ break;
+ if (kthread_should_park())
+ kthread_parkme();
+
+ amdgpu_ras_query_error_count(adev);
+ msleep_interruptible(con->ras_thread_poll_ms);
+ }
+
+ return 0;
+}
+
+static int amdgpu_ras_thread_start(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+ struct task_struct *kt;
+
+ kt = kthread_run(amdgpu_ras_thread, adev,
+ "amdras:%s", pci_name(adev->pdev));
+ if (IS_ERR(kt))
+ return PTR_ERR(kt);
+
+ con->ras_thread = kt;
+
+ return 0;
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
@@ -2182,6 +2228,10 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
goto release_con;
}
+ r = amdgpu_ras_thread_start(adev);
+ if (r)
+ goto release_con;
+
dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
"hardware ability[%x] ras_mask[%x]\n",
adev->ras_hw_enabled, adev->ras_enabled);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 201fbdee1d09..fb9e4c7ab054 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -340,6 +340,11 @@ struct amdgpu_ras {
/* disable ras error count harvest in recovery */
bool disable_ras_err_cnt_harvest;
+
+ struct task_struct *ras_thread;
+ unsigned int ras_thread_poll_ms;
+ atomic_t ras_ue_count;
+ atomic_t ras_ce_count;
};
struct ras_fs_data {
@@ -485,9 +490,6 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
- bool is_ce);
-
/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages);