On Tue, Jun 8, 2021 at 5:41 PM Luben Tuikov <luben.tuikov@xxxxxxx> wrote: > > RAS_MAX_RECORD_NUM may mean the maximum record > number, as in the maximum house number on your > street, or it may mean the maximum number of > records, as in the count of records, which is also > a number. To make this distinction whether the > number is ordinal (index) or cardinal (count), > rename this macro to RAS_MAX_RECORD_COUNT. > > This makes it easy to understand what it refers > to, especially when we compute quantities such as, > how many records do we have left in the table, > especially when there are so many other numbers, > quantities and numerical macros around. > > Also rename the long, > amdgpu_ras_eeprom_get_record_max_length() to the > more succinct and clear, > amdgpu_ras_eeprom_max_record_count(). > > When computing the threshold, which also deals > with counts, i.e. "how many", use cardinal > "max_eeprom_records_count", than the quantitative > "max_eeprom_records_len". > > Simplify the logic here and there, as well. > > Cc: Guchun Chen <guchun.chen@xxxxxxx> > Cc: John Clements <john.clements@xxxxxxx> > Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx> > Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx> > Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> Acked-by: Alex Deucher <alexander.deucher@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 ++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 50 ++++++++----------- > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 8 +-- > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 2 +- > 4 files changed, 30 insertions(+), 39 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 3de1accb060e37..0203f654576bcc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -853,11 +853,10 @@ MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legac > module_param_named(reset_method, amdgpu_reset_method, int, 0444); > > /** > - * DOC: bad_page_threshold (int) > - * Bad page threshold is to specify the threshold value of faulty pages > - * detected by RAS ECC, that may result in GPU entering bad status if total > - * faulty pages by ECC exceed threshold value and leave it for user's further > - * check. > + * DOC: bad_page_threshold (int) Bad page threshold is specifies the > + * threshold value of faulty pages detected by RAS ECC, which may > + * result in the GPU entering bad status when the number of total > + * faulty pages by ECC exceeds the threshold value. > */ > MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)"); > module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 66c96c65e7eeb9..95ab400b641af0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -71,8 +71,8 @@ const char *ras_block_string[] = { > /* inject address is 52 bits */ > #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) > > -/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ > -#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL) > +/* typical ECC bad page rate is 1 bad page per 100MB VRAM */ > +#define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) > > enum amdgpu_ras_retire_page_reservation { > AMDGPU_RAS_RETIRE_PAGE_RESERVED, > @@ -1841,27 +1841,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) > static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) > { > struct amdgpu_ras_eeprom_control *control = > - &adev->psp.ras.ras->eeprom_control; > - struct eeprom_table_record *bps = NULL; > - int ret = 0; > + &adev->psp.ras.ras->eeprom_control; > + struct eeprom_table_record *bps; > + int ret; > > /* no bad page record, skip eeprom access */ > - if (!control->num_recs || (amdgpu_bad_page_threshold == 0)) > - return ret; > + if (control->num_recs == 0 || amdgpu_bad_page_threshold == 0) > + return 0; > > bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL); > if (!bps) > return -ENOMEM; > > - if (amdgpu_ras_eeprom_read(control, bps, control->num_recs)) { > + ret = amdgpu_ras_eeprom_read(control, bps, control->num_recs); > + if (ret) > dev_err(adev->dev, "Failed to load EEPROM table records!"); > - ret = -EIO; > - goto out; > - } > - > - ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); > + else > + ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs); > > -out: > kfree(bps); > return ret; > } > @@ -1901,11 +1898,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, > } > > static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, > - uint32_t max_length) > + uint32_t max_count) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > - int tmp_threshold = amdgpu_bad_page_threshold; > - u64 val; > > /* > * Justification of value bad_page_cnt_threshold in ras structure > @@ -1926,18 +1921,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, > * take no effect. > */ > > - if (tmp_threshold < -1) > - tmp_threshold = -1; > - else if (tmp_threshold > max_length) > - tmp_threshold = max_length; > + if (amdgpu_bad_page_threshold < 0) { > + u64 val = adev->gmc.mc_vram_size; > > - if (tmp_threshold == -1) { > - val = adev->gmc.mc_vram_size; > - do_div(val, RAS_BAD_PAGE_RATE); > + do_div(val, RAS_BAD_PAGE_COVER); > con->bad_page_cnt_threshold = min(lower_32_bits(val), > - max_length); > + max_count); > } else { > - con->bad_page_cnt_threshold = tmp_threshold; > + con->bad_page_cnt_threshold = min_t(int, max_count, > + amdgpu_bad_page_threshold); > } > } > > @@ -1945,7 +1937,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_err_handler_data **data; > - uint32_t max_eeprom_records_len = 0; > + u32 max_eeprom_records_count = 0; > bool exc_err_limit = false; > int ret; > > @@ -1965,8 +1957,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) > atomic_set(&con->in_recovery, 0); > con->adev = adev; > > - max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); > - amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); > + max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count(); > + amdgpu_ras_validate_threshold(adev, max_eeprom_records_count); > > /* Todo: During test the SMU might fail to read the eeprom through I2C > * when the GPU is pending on XGMI reset during probe time > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > index 54ef31594accd9..21e1e59e4857ff 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > @@ -54,7 +54,7 @@ > #define RAS_TBL_SIZE_BYTES (256 * 1024) > #define RAS_HDR_START 0 > #define RAS_RECORD_START (RAS_HDR_START + RAS_TABLE_HEADER_SIZE) > -#define RAS_MAX_RECORD_NUM ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ > +#define RAS_MAX_RECORD_COUNT ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \ > / RAS_TABLE_RECORD_SIZE) > > #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev > @@ -532,7 +532,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control, > * TODO - Check the assumption is correct > */ > control->num_recs += num; > - control->num_recs %= RAS_MAX_RECORD_NUM; > + control->num_recs %= RAS_MAX_RECORD_COUNT; > control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num; > if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES) > control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + > @@ -568,9 +568,9 @@ int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, > return amdgpu_ras_eeprom_xfer(control, records, num, true); > } > > -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) > +inline uint32_t amdgpu_ras_eeprom_max_record_count(void) > { > - return RAS_MAX_RECORD_NUM; > + return RAS_MAX_RECORD_COUNT; > } > > /* Used for testing if bugs encountered */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > index 4906ed9fb8cdd3..504729b8053759 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > @@ -88,7 +88,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control, > int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control, > struct eeprom_table_record *records, const u32 num); > > -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void); > +inline uint32_t amdgpu_ras_eeprom_max_record_count(void); > > void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control); > > -- > 2.32.0 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx