[PATCH V2 1/2] drm/amdgpu: flush all cached ras bad pages to eeprom

YiPeng Chai <YiPeng.Chai@xxxxxxx> · Tue, 9 Jul 2024 13:56:05 +0800

Before uninstalling gpu driver, flush all cached ras
bad pages to eeprom.

v2:
  Put the same code into a function and reuse the function.

Signed-off-by: YiPeng Chai <YiPeng.Chai@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 35 ++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 0e748cd55d43..d923151af752 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 
 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
 
+#define MAX_FLUSH_RETIRE_DWORK_TIMES  100
+
 enum amdgpu_ras_retire_page_reservation {
 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 	AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2837,6 +2839,23 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
 }
 #endif
 
+static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
+				uint32_t delayed_ms)
+{
+	int ret;
+
+	mutex_lock(&con->umc_ecc_log.lock);
+	ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
+			UMC_ECC_NEW_DETECTED_TAG);
+	mutex_unlock(&con->umc_ecc_log.lock);
+
+	if (ret)
+		schedule_delayed_work(&con->page_retirement_dwork,
+			msecs_to_jiffies(delayed_ms));
+
+	return ret ? true : false;
+}
+
 static void amdgpu_ras_do_page_retirement(struct work_struct *work)
 {
 	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
@@ -2858,12 +2877,8 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
 	if (err_cnt && con->is_rma)
 		amdgpu_ras_reset_gpu(adev);
 
-	mutex_lock(&con->umc_ecc_log.lock);
-	if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
-				UMC_ECC_NEW_DETECTED_TAG))
-		schedule_delayed_work(&con->page_retirement_dwork,
-			msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
-	mutex_unlock(&con->umc_ecc_log.lock);
+	amdgpu_ras_schedule_retirement_dwork(con,
+			AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
 }
 
 static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
@@ -3179,11 +3194,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data = con->eh_data;
+	int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES;
+	bool ret;
 
 	/* recovery_init failed to init it, fini is useless */
 	if (!data)
 		return 0;
 
+	/* Save all cached bad pages to eeprom */
+	do {
+		flush_delayed_work(&con->page_retirement_dwork);
+		ret = amdgpu_ras_schedule_retirement_dwork(con, 0);
+	} while (ret && max_flush_timeout--);
+
 	if (con->page_retirement_thread)
 		kthread_stop(con->page_retirement_thread);
 
-- 
2.34.1