RAS_MAX_RECORD_NUM may mean the maximum record number, as in the maximum house number on your street, or it may mean the maximum number of records, as in the count of records, which is also a number. To make this distinction whether the number is ordinal (index) or cardinal (count), rename this macro to RAS_MAX_RECORD_COUNT. This makes it easy to understand what it refers to, especially when we compute quantities such as, how many records do we have left in the table, especially when there are so many other numbers, quantities and numerical macros around. Also rename the long, amdgpu_ras_eeprom_get_record_max_length() to the more succinct and clear, amdgpu_ras_eeprom_max_record_count(). When computing the threshold, which also deals with counts, i.e. "how many", use cardinal "max_eeprom_records_count", than the quantitative "max_eeprom_records_len". Simplify the logic here and there, as well. Cc: Guchun Chen <guchun.chen@xxxxxxx> Cc: John Clements <john.clements@xxxxxxx> Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx> Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx> Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> Acked-by: Alexander Deucher <Alexander.Deucher@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 50 ++++++++----------- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 8 +-- .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 +- 4 files changed, 30 insertions(+), 39 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 80dab0c6e19ebb..406043c9906425 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -868,11 +868,10 @@ MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legac module_param_named(reset_method, amdgpu_reset_method, int, 0444); /** - * DOC: bad_page_threshold (int) - * Bad page threshold is to specify the threshold value of faulty pages - * detected by RAS ECC, that may result in GPU entering bad status if total - * faulty pages by ECC exceed threshold value and leave it for user's further - * check. + * DOC: bad_page_threshold (int) Bad page threshold is specifies the + * threshold value of faulty pages detected by RAS ECC, which may + * result in the GPU entering bad status when the number of total + * faulty pages by ECC exceeds the threshold value. */ MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)"); module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 64054a825b7421..c7ad90483f21d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -71,8 +71,8 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) -/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ -#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL) +/* typical ECC bad page rate is 1 bad page per 100MB VRAM */ +#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, @@ -1841,27 +1841,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) { struct amdgpu_ras_eeprom_control *control = - &adev->psp.ras.ras->eeprom_control; - struct eeprom_table_record *bps = NULL; - int ret = 0; + &adev->psp.ras.ras->eeprom_control; + struct eeprom_table_record *bps; + int ret; /* no bad page record, skip eeprom access */ - if (!control->num_recs || (amdgpu_bad_page_threshold == 0)) - return ret; + if (control->num_recs == 0 || amdgpu_bad_page_threshold == 0) + return 0; bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); if (!bps) return -ENOMEM; - if (amdgpu_ras_eeprom_read(control, bps, control->num_recs)) { + ret = amdgpu_ras_eeprom_read(control, bps, control->num_recs); + if (ret) dev_err(adev->dev, "Failed to load EEPROM table records!"); - ret = -EIO; - goto out; - } - - ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); + else + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); -out: kfree(bps); return ret; } @@ -1901,11 +1898,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, } static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, - uint32_t max_length) + uint32_t max_count) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); - int tmp_threshold = amdgpu_bad_page_threshold; - u64 val; /* * Justification of value bad_page_cnt_threshold in ras structure @@ -1926,18 +1921,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, * take no effect. */ - if (tmp_threshold < -1) - tmp_threshold = -1; - else if (tmp_threshold > max_length) - tmp_threshold = max_length; + if (amdgpu_bad_page_threshold < 0) { + u64 val = adev->gmc.mc_vram_size; - if (tmp_threshold == -1) { - val = adev->gmc.mc_vram_size; - do_div(val, RAS_BAD_PAGE_RATE); + do_div(val, RAS_BAD_PAGE_COVER); con->bad_page_cnt_threshold = min(lower_32_bits(val), - max_length); + max_count); } else { - con->bad_page_cnt_threshold = tmp_threshold; + con->bad_page_cnt_threshold = min_t(int, max_count, + amdgpu_bad_page_threshold); } } @@ -1945,7 +1937,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; - uint32_t max_eeprom_records_len = 0; + u32 max_eeprom_records_count = 0; bool exc_err_limit = false; int ret; @@ -1965,8 +1957,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(&con->in_recovery, 0); con->adev = adev; - max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); - amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); + amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); /* Todo: During test the SMU might fail to read the eeprom through I2C * when the GPU is pending on XGMI reset during probe time diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index 54ef31594accd9..21e1e59e4857ff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -54,7 +54,7 @@ #define RAS_TBL_SIZE_BYTES (256 * 1024) #define RAS_HDR_START 0 #define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE) -#define RAS_MAX_RECORD_NUM ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ +#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ / RAS_TABLE_RECORD_SIZE) #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev @@ -532,7 +532,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, * TODO - Check the assumption is correct */ control->num_recs += num; - control->num_recs %= RAS_MAX_RECORD_NUM; + control->num_recs %= RAS_MAX_RECORD_COUNT; control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num; if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES) control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + @@ -568,9 +568,9 @@ int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, return amdgpu_ras_eeprom_xfer(control, records, num, true); } -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) +inline uint32_t amdgpu_ras_eeprom_max_record_count(void) { - return RAS_MAX_RECORD_NUM; + return RAS_MAX_RECORD_COUNT; } /* Used for testing if bugs encountered */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 4906ed9fb8cdd3..504729b8053759 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -88,7 +88,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, struct eeprom_table_record *records, const u32 num); -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void); +inline uint32_t amdgpu_ras_eeprom_max_record_count(void); void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control); -- 2.32.0 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx