Re: [PATCH 36/40] drm/amdgpu: Use explicit cardinality for clarity

Alex Deucher <alexdeucher@xxxxxxxxx> · Thu, 10 Jun 2021 17:17:40 -0400



On Tue, Jun 8, 2021 at 5:41 PM Luben Tuikov <luben.tuikov@xxxxxxx> wrote:
>
> RAS_MAX_RECORD_NUM may mean the maximum record
> number, as in the maximum house number on your
> street, or it may mean the maximum number of
> records, as in the count of records, which is also
> a number. To make this distinction whether the
> number is ordinal (index) or cardinal (count),
> rename this macro to RAS_MAX_RECORD_COUNT.
>
> This makes it easy to understand what it refers
> to, especially when we compute quantities such as,
> how many records do we have left in the table,
> especially when there are so many other numbers,
> quantities and numerical macros around.
>
> Also rename the long,
> amdgpu_ras_eeprom_get_record_max_length() to the
> more succinct and clear,
> amdgpu_ras_eeprom_max_record_count().
>
> When computing the threshold, which also deals
> with counts, i.e. "how many", use cardinal
> "max_eeprom_records_count", than the quantitative
> "max_eeprom_records_len".
>
> Simplify the logic here and there, as well.
>
> Cc: Guchun Chen <guchun.chen@xxxxxxx>
> Cc: John Clements <john.clements@xxxxxxx>
> Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx>
> Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx>
> Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx>

Acked-by: Alex Deucher <alexander.deucher@xxxxxxx>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       |  9 ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       | 50 ++++++++-----------
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    |  8 +--
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  2 +-
>  4 files changed, 30 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 3de1accb060e37..0203f654576bcc 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -853,11 +853,10 @@ MODULE_PARM_DESC(reset_method, "GPU reset method (-1 = auto (default), 0 = legac
>  module_param_named(reset_method, amdgpu_reset_method, int, 0444);
>
>  /**
> - * DOC: bad_page_threshold (int)
> - * Bad page threshold is to specify the threshold value of faulty pages
> - * detected by RAS ECC, that may result in GPU entering bad status if total
> - * faulty pages by ECC exceed threshold value and leave it for user's further
> - * check.
> + * DOC: bad_page_threshold (int) Bad page threshold is specifies the
> + * threshold value of faulty pages detected by RAS ECC, which may
> + * result in the GPU entering bad status when the number of total
> + * faulty pages by ECC exceeds the threshold value.
>   */
>  MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
>  module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 66c96c65e7eeb9..95ab400b641af0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -71,8 +71,8 @@ const char *ras_block_string[] = {
>  /* inject address is 52 bits */
>  #define        RAS_UMC_INJECT_ADDR_LIMIT       (0x1ULL << 52)
>
> -/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
> -#define RAS_BAD_PAGE_RATE              (100 * 1024 * 1024ULL)
> +/* typical ECC bad page rate is 1 bad page per 100MB VRAM */
> +#define RAS_BAD_PAGE_COVER              (100 * 1024 * 1024ULL)
>
>  enum amdgpu_ras_retire_page_reservation {
>         AMDGPU_RAS_RETIRE_PAGE_RESERVED,
> @@ -1841,27 +1841,24 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
>  static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
>  {
>         struct amdgpu_ras_eeprom_control *control =
> -                                       &adev->psp.ras.ras->eeprom_control;
> -       struct eeprom_table_record *bps = NULL;
> -       int ret = 0;
> +               &adev->psp.ras.ras->eeprom_control;
> +       struct eeprom_table_record *bps;
> +       int ret;
>
>         /* no bad page record, skip eeprom access */
> -       if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
> -               return ret;
> +       if (control->num_recs == 0 || amdgpu_bad_page_threshold == 0)
> +               return 0;
>
>         bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
>         if (!bps)
>                 return -ENOMEM;
>
> -       if (amdgpu_ras_eeprom_read(control, bps, control->num_recs)) {
> +       ret = amdgpu_ras_eeprom_read(control, bps, control->num_recs);
> +       if (ret)
>                 dev_err(adev->dev, "Failed to load EEPROM table records!");
> -               ret = -EIO;
> -               goto out;
> -       }
> -
> -       ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
> +       else
> +               ret = amdgpu_ras_add_bad_pages(adev, bps, control->num_recs);
>
> -out:
>         kfree(bps);
>         return ret;
>  }
> @@ -1901,11 +1898,9 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
>  }
>
>  static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
> -                                       uint32_t max_length)
> +                                         uint32_t max_count)
>  {
>         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> -       int tmp_threshold = amdgpu_bad_page_threshold;
> -       u64 val;
>
>         /*
>          * Justification of value bad_page_cnt_threshold in ras structure
> @@ -1926,18 +1921,15 @@ static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
>          *      take no effect.
>          */
>
> -       if (tmp_threshold < -1)
> -               tmp_threshold = -1;
> -       else if (tmp_threshold > max_length)
> -               tmp_threshold = max_length;
> +       if (amdgpu_bad_page_threshold < 0) {
> +               u64 val = adev->gmc.mc_vram_size;
>
> -       if (tmp_threshold == -1) {
> -               val = adev->gmc.mc_vram_size;
> -               do_div(val, RAS_BAD_PAGE_RATE);
> +               do_div(val, RAS_BAD_PAGE_COVER);
>                 con->bad_page_cnt_threshold = min(lower_32_bits(val),
> -                                               max_length);
> +                                                 max_count);
>         } else {
> -               con->bad_page_cnt_threshold = tmp_threshold;
> +               con->bad_page_cnt_threshold = min_t(int, max_count,
> +                                                   amdgpu_bad_page_threshold);
>         }
>  }
>
> @@ -1945,7 +1937,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>  {
>         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>         struct ras_err_handler_data **data;
> -       uint32_t max_eeprom_records_len = 0;
> +       u32  max_eeprom_records_count = 0;
>         bool exc_err_limit = false;
>         int ret;
>
> @@ -1965,8 +1957,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>         atomic_set(&con->in_recovery, 0);
>         con->adev = adev;
>
> -       max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
> -       amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
> +       max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
> +       amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
>
>         /* Todo: During test the SMU might fail to read the eeprom through I2C
>          * when the GPU is pending on XGMI reset during probe time
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 54ef31594accd9..21e1e59e4857ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -54,7 +54,7 @@
>  #define RAS_TBL_SIZE_BYTES      (256 * 1024)
>  #define RAS_HDR_START           0
>  #define RAS_RECORD_START        (RAS_HDR_START + RAS_TABLE_HEADER_SIZE)
> -#define RAS_MAX_RECORD_NUM      ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
> +#define RAS_MAX_RECORD_COUNT    ((RAS_TBL_SIZE_BYTES - RAS_TABLE_HEADER_SIZE) \
>                                  / RAS_TABLE_RECORD_SIZE)
>
>  #define to_amdgpu_device(x) (container_of(x, struct amdgpu_ras, eeprom_control))->adev
> @@ -532,7 +532,7 @@ static int amdgpu_ras_eeprom_xfer(struct amdgpu_ras_eeprom_control *control,
>                  * TODO - Check the assumption is correct
>                  */
>                 control->num_recs += num;
> -               control->num_recs %= RAS_MAX_RECORD_NUM;
> +               control->num_recs %= RAS_MAX_RECORD_COUNT;
>                 control->tbl_hdr.tbl_size += RAS_TABLE_RECORD_SIZE * num;
>                 if (control->tbl_hdr.tbl_size > RAS_TBL_SIZE_BYTES)
>                         control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
> @@ -568,9 +568,9 @@ int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
>         return amdgpu_ras_eeprom_xfer(control, records, num, true);
>  }
>
> -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
> +inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
>  {
> -       return RAS_MAX_RECORD_NUM;
> +       return RAS_MAX_RECORD_COUNT;
>  }
>
>  /* Used for testing if bugs encountered */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 4906ed9fb8cdd3..504729b8053759 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -88,7 +88,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
>  int amdgpu_ras_eeprom_write(struct amdgpu_ras_eeprom_control *control,
>                             struct eeprom_table_record *records, const u32 num);
>
> -inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void);
> +inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
>
>  void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control);
>
> --
> 2.32.0
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx