[AMD Official Use Only - General]
-----------------
Best Regards,
Thomas
_____________________________________________
From: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
Sent: Wednesday, January 17, 2024 7:54 PM
To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>
Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning
[AMD Official Use Only - General]
Please check my comments inline.
Regards,
Hawking
-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@xxxxxxx>
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Chai, Thomas <YiPeng.Chai@xxxxxxx>; Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning
Use asynchronous polling to handle umc_v12_0 poisoning.
Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++++++++++++++++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h | 3 +
3 files changed, 120 insertions(+), 31 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 856206e95842..44929281840e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL)
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
+
enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_read(&con->page_retirement_req_cnt));
atomic_dec(&con->page_retirement_req_cnt);
+
+ amdgpu_umc_poison_retire_page_polling_timeout(adev,
+ false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9d1cf41cf483..2dde29cb807d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -23,6 +23,7 @@
#include "amdgpu.h"
#include "umc_v6_7.h"
+#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t err_addr,
@@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
return ret;
}
-static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
- void *ras_error_status,
- struct amdgpu_iv_entry *entry,
- bool reset)
+static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+ void *ras_error_status)
{
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
-
- kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+ mutex_lock(&con->page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops &&
@@ -163,19 +161,86 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
con->update_channel_flag = false;
}
}
-
- if (reset) {
- /* use mode-2 reset for poison consumption */
- if (!entry)
- con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
- amdgpu_ras_reset_gpu(adev);
- }
}
kfree(err_data->err_addr);
+
+ mutex_unlock(&con->page_retirement_lock);
+}
+
+static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+ void *ras_error_status,
+ struct amdgpu_iv_entry *entry,
+ bool reset)
+{
+ struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+ amdgpu_umc_handle_bad_pages(adev, ras_error_status);
+
+ if (err_data->ue_count && reset) {
+ /* use mode-2 reset for poison consumption */
+ if (!entry)
+ con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
[Hawking]: Shall we also check the con->poison_supported flag to decide whether to issue a mode-2 or a mode-1 reset?
[thomas] This "gpu reset" code does not belong to the page retirement scope; it just reuses the old code. How to reset the GPU can be addressed in a separate patch in the future.
+ amdgpu_ras_reset_gpu(adev);
+ }
+
return AMDGPU_RAS_SUCCESS;
}
+int amdgpu_umc_poison_retire_page_polling_timeout(struct amdgpu_device *adev,
+ bool reset, uint32_t timeout_ms)
[Hawking] int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev, bool reset, uint32_t timeout_ms)
[thomas] OK
+{
+ struct ras_err_data err_data;
+ struct ras_common_if head = {
+ .block = AMDGPU_RAS_BLOCK__UMC,
+ };
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+ uint32_t timeout = timeout_ms;
+
+ memset(&err_data, 0, sizeof(err_data));
+ amdgpu_ras_error_data_init(&err_data);
+
+ do {
+
+ amdgpu_umc_handle_bad_pages(adev, &err_data);
+
+ if (timeout && !err_data.de_count) {
+ msleep(1);
+ timeout--;
+ }
+
+ } while (timeout && !err_data.de_count);
+
+ if (!timeout)
+ dev_warn(adev->dev, "Page retirment executed, but did not find bad
+pages\n");
[Hawking] dev_warn(adev->dev, "can't find bad pages\n");
[thomas] OK
+
+ if (err_data.de_count)
+ dev_info(adev->dev, "Page retirment: ue:%ld, ce:%ld, de:%ld\n",
+ err_data.ue_count, err_data.ce_count, err_data.de_count);
[Hawking] dev_info(adev->dev, "%ld new deferred hardware errors detected\n", err_data.de_count);
[thomas] OK
+
+ if (obj) {
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
+ obj->err_data.de_count += err_data.de_count;
+ }
+
+ amdgpu_ras_error_data_fini(&err_data);
+
+ kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+
+ if (reset) {
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ /* use mode-2 reset for poison consumption */
+ con->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
[Hawking]: Shall we also check the con->poison_supported flag to decide whether to issue a mode-2 or a mode-1 reset?
[thomas] This "gpu reset" code does not belong to the page retirement scope; it just reuses the old code. How to reset the GPU can be addressed in a separate patch in the future.
+ amdgpu_ras_reset_gpu(adev);
+ }
+
+ return 0;
+}
+
int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset) {
int ret = AMDGPU_RAS_SUCCESS;
@@ -193,25 +258,41 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev, bool reset)
}
if (!amdgpu_sriov_vf(adev)) {
- struct ras_err_data err_data;
- struct ras_common_if head = {
- .block = AMDGPU_RAS_BLOCK__UMC,
- };
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
-
- ret = amdgpu_ras_error_data_init(&err_data);
- if (ret)
- return ret;
-
- ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
-
- if (ret == AMDGPU_RAS_SUCCESS && obj) {
- obj->err_data.ue_count += err_data.ue_count;
- obj->err_data.ce_count += err_data.ce_count;
- obj->err_data.de_count += err_data.de_count;
- }
+ if (amdgpu_ip_version(adev, UMC_HWIP, 0) != IP_VERSION(12, 0, 0)) {
[Hawking]: if (amdgpu_ip_version(adev, UMC_HWIP, 0) < IP_VERSION(12, 0, 0)) {
[thomas] OK
+ struct ras_err_data err_data;
+ struct ras_common_if head = {
+ .block = AMDGPU_RAS_BLOCK__UMC,
+ };
+ struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
+
+ ret = amdgpu_ras_error_data_init(&err_data);
+ if (ret)
+ return ret;
+
+ ret = amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
- amdgpu_ras_error_data_fini(&err_data);
+ if (ret == AMDGPU_RAS_SUCCESS && obj) {
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
+ obj->err_data.de_count += err_data.de_count;
+ }
+
+ amdgpu_ras_error_data_fini(&err_data);
+ } else {
+ if (reset) {
+ amdgpu_umc_poison_retire_page_polling_timeout(adev,
+ reset, MAX_UMC_POISON_POLLING_TIME_SYNC);
+ } else {
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ dev_info(adev->dev, "Page retirement pending request count: %d\n",
+ atomic_read(&con->page_retirement_req_cnt));
[Hawking]: We might remove the printed message here.
[thomas] OK
+
+ atomic_inc(&con->page_retirement_req_cnt);
+
+ wake_up(&con->page_retirement_wq);
+ }
+ }
} else {
if (adev->virt.ops && adev->virt.ops->ras_poison_handler)
adev->virt.ops->ras_poison_handler(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 417a6726c71b..ee487adce7e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -118,4 +118,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
umc_func func, void *data);
+
+int amdgpu_umc_poison_retire_page_polling_timeout(struct amdgpu_device *adev,
+ bool reset, uint32_t timeout_ms);
#endif
--
2.34.1