RE: [PATCH V2 1/4] drm/amdgpu: add variable to record the deferred error number read by driver

"Chai, Thomas" <YiPeng.Chai@xxxxxxx> · Fri, 21 Jun 2024 08:52:33 +0000

[AMD Official Use Only - AMD Internal Distribution Only]

prev_de_queried_count and de_queried_count are used to accurately count the number of DEs (deferred errors) lost after the driver receives a large number of poison creation interrupts.

Since amdgpu_ras_query_error_status can be called by page_retirement_thread, xxx_err_count sysfs and gpu recovery,
using a local variable to save the old de_queried_count before calling amdgpu_ras_query_error_status in page_retirement_thread would be inaccurate.


-----------------
Best Regards,
Thomas

-----Original Message-----
From: Zhang, Hawking <Hawking.Zhang@xxxxxxx>
Sent: Friday, June 21, 2024 2:37 PM
To: Chai, Thomas <YiPeng.Chai@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>
Subject: RE: [PATCH V2 1/4] drm/amdgpu: add variable to record the deferred error number read by driver

[AMD Official Use Only - AMD Internal Distribution Only]

Shall we make prev_de_queried_count a local variable? The others look good to me.

Regards,
Hawking

-----Original Message-----
From: Chai, Thomas <YiPeng.Chai@xxxxxxx>
Sent: Thursday, June 20, 2024 13:40
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Zhou1, Tao <Tao.Zhou1@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx>; Wang, Yang(Kevin) <KevinYang.Wang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, Thomas <YiPeng.Chai@xxxxxxx>
Subject: [PATCH V2 1/4] drm/amdgpu: add variable to record the deferred error number read by driver

Add variable to record the deferred error number read by driver.

Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 62 ++++++++++++++++++-------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 +-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  |  4 +-
 3 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 86cb97d2155b..f674e34037b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -120,7 +120,7 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)

-#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  300  //ms

 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms

@@ -2804,7 +2804,8 @@ static void amdgpu_ras_ecc_log_init(struct ras_ecc_log_info *ecc_log)
        memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));

        INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
-       ecc_log->de_updated = false;
+       ecc_log->de_queried_count = 0;
+       ecc_log->prev_de_queried_count = 0;
 }

 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -2823,7 +2824,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
        mutex_unlock(&ecc_log->lock);

        mutex_destroy(&ecc_log->lock);
-       ecc_log->de_updated = false;
+       ecc_log->de_queried_count = 0;
+       ecc_log->prev_de_queried_count = 0;
 }
 #endif

@@ -2856,40 +2858,64 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
        mutex_unlock(&con->umc_ecc_log.lock);
 }

-static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
-                               uint32_t timeout_ms)
+static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+                               uint32_t poison_creation_count)
 {
        int ret = 0;
        struct ras_ecc_log_info *ecc_log;
        struct ras_query_if info;
-       uint32_t timeout = timeout_ms;
+       uint32_t timeout = 0;
        struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+       uint64_t de_queried_count;
+       uint32_t new_detect_count, total_detect_count;
+       uint32_t need_query_count = poison_creation_count;
+       bool query_data_timeout = false;

        memset(&info, 0, sizeof(info));
        info.head.block = AMDGPU_RAS_BLOCK__UMC;

        ecc_log = &ras->umc_ecc_log;
-       ecc_log->de_updated = false;
+       total_detect_count = 0;
        do {
                ret = amdgpu_ras_query_error_status(adev, &info);
-               if (ret) {
-                       dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
-                       return;
+               if (ret)
+                       return ret;
+
+               de_queried_count = ecc_log->de_queried_count;
+               if (de_queried_count > ecc_log->prev_de_queried_count) {
+                       new_detect_count = de_queried_count - ecc_log->prev_de_queried_count;
+                       ecc_log->prev_de_queried_count = de_queried_count;
+                       timeout = 0;
+               } else {
+                       new_detect_count = 0;
                }

-               if (timeout && !ecc_log->de_updated) {
-                       msleep(1);
-                       timeout--;
+               if (new_detect_count) {
+                       total_detect_count += new_detect_count;
+               } else {
+                       if (!timeout && need_query_count)
+                               timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC;
+
+                       if (timeout) {
+                               if (!--timeout) {
+                                       query_data_timeout = true;
+                                       break;
+                               }
+                               msleep(1);
+                       }
                }
-       } while (timeout && !ecc_log->de_updated);
+       } while (total_detect_count < need_query_count);

-       if (timeout_ms && !timeout) {
-               dev_warn(adev->dev, "Can't find deferred error\n");
-               return;
+       if (query_data_timeout) {
+               dev_warn(adev->dev, "Can't find deferred error! count: %u\n",
+                       (need_query_count - total_detect_count));
+               return -ENOENT;
        }

-       if (!ret)
+       if (total_detect_count)
                schedule_delayed_work(&ras->page_retirement_dwork, 0);
+
+       return 0;
 }

 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 83437fef9df5..748bbac666e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -469,7 +469,8 @@ struct ras_ecc_log_info {
        struct mutex lock;
        siphash_key_t ecc_key;
        struct radix_tree_root de_page_tree;
-       bool    de_updated;
+       uint64_t        de_queried_count;
+       uint64_t        prev_de_queried_count;
 };

 struct amdgpu_ras {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6d6350f220b0..0faa21d8a7b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -557,7 +557,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
        ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
        if (ret) {
                if (ret == -EEXIST)
-                       con->umc_ecc_log.de_updated = true;
+                       con->umc_ecc_log.de_queried_count++;
                else
                        dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);

@@ -566,7 +566,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
                return ret;
        }

-       con->umc_ecc_log.de_updated = true;
+       con->umc_ecc_log.de_queried_count++;

        return 0;
 }
--
2.34.1