Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index b7ee587484b2..ff4387bbfb1e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -170,7 +170,7 @@ struct amdgpu_mgpu_info mgpu_info = {
};
int amdgpu_ras_enable = -1;
uint amdgpu_ras_mask = 0xffffffff;
-int amdgpu_bad_page_threshold = -1;
+int amdgpu_bad_page_threshold = 100;
/**
* DOC: vramlimit (int)
@@ -804,7 +804,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444);
* faulty pages by ECC exceed threshold value and leave it for user's further
* check.
*/
-MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
+MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value)");
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 93699ea4860c..fb1c3f6cef29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1779,7 +1779,7 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
tmp_threshold = max_length;
if (tmp_threshold == -1) {
- val = adev->gmc.mc_vram_size;
+ val = adev->gmc.real_vram_size;
do_div(val, RAS_BAD_PAGE_RATE);
con->bad_page_cnt_threshold = min(lower_32_bits(val),
max_length);
@@ -1812,8 +1812,16 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(&con->in_recovery, 0);
con->adev = adev;
- max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
- amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+ if (!con->bad_page_cnt_threshold) {
+ max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+ amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+
+ ret = amdgpu_vram_mgr_reserve_backup_pages(
+ ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+ con->bad_page_cnt_threshold);
+ if (ret)
+ goto out;
+ }
ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 69ba8dd4f3ee..927d33d75c22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -52,6 +52,8 @@ struct amdgpu_vram_mgr {
spinlock_t lock;
struct list_head reservations_pending;
struct list_head reserved_pages;
+ struct list_head backup_pages;
+ uint32_t num_backup_pages;
atomic64_t usage;
atomic64_t vis_usage;
};
@@ -127,6 +129,8 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man);
uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man);
int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
uint64_t start, uint64_t size);
+int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
+ uint32_t num_pages);
int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man,
uint64_t start);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 21d18efca277..b325b067926b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -28,6 +28,9 @@
#include "amdgpu_atomfirmware.h"
#include "atom.h"
+static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
+ uint32_t num_pages);
+
static inline struct amdgpu_vram_mgr *to_vram_mgr(struct ttm_resource_manager *man)
{
return container_of(man, struct amdgpu_vram_mgr, manager);
@@ -189,6 +192,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
spin_lock_init(&mgr->lock);
INIT_LIST_HEAD(&mgr->reservations_pending);
INIT_LIST_HEAD(&mgr->reserved_pages);
+ INIT_LIST_HEAD(&mgr->backup_pages);
/* Add the two VRAM-related sysfs files */
ret = sysfs_create_files(&adev->dev->kobj, amdgpu_vram_mgr_attributes);
@@ -229,6 +233,11 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
drm_mm_remove_node(&rsv->mm_node);
kfree(rsv);
}
+
+ list_for_each_entry_safe(rsv, temp, &mgr->backup_pages, node) {
+ drm_mm_remove_node(&rsv->mm_node);
+ kfree(rsv);
+ }
drm_mm_takedown(&mgr->mm);
spin_unlock(&mgr->lock);
@@ -300,12 +309,14 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man)
continue;
dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
- rsv->mm_node.start, rsv->mm_node.size);
+ rsv->mm_node.start << PAGE_SHIFT, rsv->mm_node.size);
vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
atomic64_add(vis_usage, &mgr->vis_usage);
atomic64_add(rsv->mm_node.size << PAGE_SHIFT, &mgr->usage);
list_move(&rsv->node, &mgr->reserved_pages);
+
+ amdgpu_vram_mgr_free_backup_pages(mgr, rsv->mm_node.size);
}
}
@@ -322,6 +333,7 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
uint64_t start, uint64_t size)
{
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+ struct amdgpu_device *adev = to_amdgpu_device(mgr);
struct amdgpu_vram_reservation *rsv;
rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
@@ -332,14 +344,94 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
rsv->mm_node.start = start >> PAGE_SHIFT;
rsv->mm_node.size = size >> PAGE_SHIFT;
+ dev_dbg(adev->dev, "Pending Reservation: 0x%llx\n", start);
+
spin_lock(&mgr->lock);
- list_add_tail(&mgr->reservations_pending, &rsv->node);
+ list_add_tail(&rsv->node, &mgr->reservations_pending);
amdgpu_vram_mgr_do_reserve(man);
spin_unlock(&mgr->lock);
return 0;
}
+/**
+ * amdgpu_vram_mgr_free_backup_pages - release entries from the backup pool
+ *
+ * @mgr: VRAM manager that owns the backup pool
+ * @num_pages: number of backup reservations to release
+ *
+ * Takes @num_pages reservations from the head of mgr->backup_pages, removes
+ * their drm_mm nodes and subtracts their size from the usage and visible
+ * usage counters. Nothing is released when the pool holds fewer than
+ * @num_pages reservations.
+ *
+ * The caller must hold mgr->lock: this function modifies mgr->mm, the
+ * backup list and mgr->num_backup_pages.
+ *
+ * Returns:
+ * 0 on success, -EINVAL if the pool holds fewer than @num_pages entries.
+ */
+static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
+					     uint32_t num_pages)
+{
+	struct amdgpu_device *adev = to_amdgpu_device(mgr);
+	struct amdgpu_vram_reservation *rsv;
+	uint32_t i;
+	uint64_t vis_usage = 0, total_usage = 0;
+
+	/* Make the locking contract checkable instead of implicit. */
+	lockdep_assert_held(&mgr->lock);
+
+	if (num_pages > mgr->num_backup_pages) {
+		dev_warn(adev->dev, "Not enough backup pages\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * The check above guarantees the list holds at least num_pages
+	 * entries, so list_first_entry() never sees an empty list here.
+	 */
+	for (i = 0; i < num_pages; i++) {
+		rsv = list_first_entry(&mgr->backup_pages,
+				       struct amdgpu_vram_reservation, node);
+		/* Sample the accounting before the node leaves the drm_mm. */
+		vis_usage += amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
+		total_usage += (rsv->mm_node.size << PAGE_SHIFT);
+		drm_mm_remove_node(&rsv->mm_node);
+		list_del(&rsv->node);
+		kfree(rsv);
+		mgr->num_backup_pages--;
+	}
+
+	atomic64_sub(total_usage, &mgr->usage);
+	atomic64_sub(vis_usage, &mgr->vis_usage);
+
+	return 0;
+}
+
+/**
+ * amdgpu_vram_mgr_reserve_backup_pages - set aside spare VRAM pages
+ *
+ * @man: TTM memory type manager
+ * @num_pages: number of single-page reservations to create
+ *
+ * Allocates @num_pages one-page nodes from the VRAM drm_mm, queues them on
+ * mgr->backup_pages and accounts them in the usage counters. If any
+ * allocation or insertion fails, every backup page currently in the pool is
+ * released again before returning.
+ *
+ * Takes mgr->lock internally; the caller must not hold it.
+ *
+ * Returns:
+ * 0 on success, -ENOMEM if a reservation could not be allocated, or the
+ * error returned by drm_mm_insert_node().
+ */
+int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
+					 uint32_t num_pages)
+{
+	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+	struct amdgpu_device *adev = to_amdgpu_device(mgr);
+	struct amdgpu_vram_reservation *rsv;
+	struct drm_mm *mm = &mgr->mm;
+	uint32_t i;
+	int ret = 0;
+	uint64_t vis_usage, total_usage;
+
+	for (i = 0; i < num_pages; i++) {
+		/* GFP_KERNEL may sleep, so allocate before taking the lock. */
+		rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
+		if (!rsv) {
+			ret = -ENOMEM;
+			goto pro_end;
+		}
+
+		INIT_LIST_HEAD(&rsv->node);
+
+		/*
+		 * drm_mm does no locking of its own; mgr->lock serializes
+		 * this insertion against the node removal done under the
+		 * same lock when backup pages are released.
+		 */
+		spin_lock(&mgr->lock);
+		ret = drm_mm_insert_node(mm, &rsv->mm_node, 1);
+		if (ret) {
+			spin_unlock(&mgr->lock);
+			dev_err(adev->dev, "failed to reserve backup page %u, ret %d\n", i, ret);
+			kfree(rsv);
+			goto pro_end;
+		}
+
+		vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
+		total_usage = (rsv->mm_node.size << PAGE_SHIFT);
+
+		atomic64_add(vis_usage, &mgr->vis_usage);
+		atomic64_add(total_usage, &mgr->usage);
+		list_add_tail(&rsv->node, &mgr->backup_pages);
+		mgr->num_backup_pages++;
+		spin_unlock(&mgr->lock);
+	}
+
+pro_end:
+	if (ret) {
+		/* Roll back: drop every backup page held in the pool. */
+		spin_lock(&mgr->lock);
+		amdgpu_vram_mgr_free_backup_pages(mgr, mgr->num_backup_pages);
+		spin_unlock(&mgr->lock);
+	}
+
+	return ret;
+}
+
/**
* amdgpu_vram_mgr_query_page_status - query the reservation status
*