Re: [PATCH 38/40] drm/amdgpu: RAS EEPROM table is now in debugfs

Luben Tuikov <luben.tuikov@xxxxxxx> · Fri, 11 Jun 2021 13:30:45 -0400

On 2021-06-11 1:16 p.m., Alex Deucher wrote:
> On Tue, Jun 8, 2021 at 5:41 PM Luben Tuikov <luben.tuikov@xxxxxxx> wrote:
>> Add "ras_eeprom_size" file in debugfs, which
>> reports the maximum size allocated to the RAS
>> table in EEROM, as the number of bytes and the
>> number of records it could store. For instance,
>>
>> $cat /sys/kernel/debug/dri/0/ras/ras_eeprom_size
>> 262144 bytes or 10921 records
>> $_
>>
>> Add "ras_eeprom_table" file in debugfs, which
>> dumps the RAS table stored EEPROM, in a formatted
>> way. For instance,
>>
>> $cat ras_eeprom_table
>>  Signature    Version  FirstOffs       Size   Checksum
>> 0x414D4452 0x00010000 0x00000014 0x000000EC 0x000000DA
>> Index  Offset ErrType Bank/CU          TimeStamp      Offs/Addr MemChl MCUMCID    RetiredPage
>>     0 0x00014      ue    0x00 0x00000000607608DC 0x000000000000   0x00    0x00 0x000000000000
>>     1 0x0002C      ue    0x00 0x00000000607608DC 0x000000001000   0x00    0x00 0x000000000001
>>     2 0x00044      ue    0x00 0x00000000607608DC 0x000000002000   0x00    0x00 0x000000000002
>>     3 0x0005C      ue    0x00 0x00000000607608DC 0x000000003000   0x00    0x00 0x000000000003
>>     4 0x00074      ue    0x00 0x00000000607608DC 0x000000004000   0x00    0x00 0x000000000004
>>     5 0x0008C      ue    0x00 0x00000000607608DC 0x000000005000   0x00    0x00 0x000000000005
>>     6 0x000A4      ue    0x00 0x00000000607608DC 0x000000006000   0x00    0x00 0x000000000006
>>     7 0x000BC      ue    0x00 0x00000000607608DC 0x000000007000   0x00    0x00 0x000000000007
>>     8 0x000D4      ue    0x00 0x00000000607608DD 0x000000008000   0x00    0x00 0x000000000008
>> $_
>>
>> Cc: Alexander Deucher <Alexander.Deucher@xxxxxxx>
>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@xxxxxxx>
>> Cc: John Clements <john.clements@xxxxxxx>
>> Cc: Hawking Zhang <Hawking.Zhang@xxxxxxx>
>> Cc: Xinhui Pan <xinhui.pan@xxxxxxx>
>> Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx>
> Seems like a useful feature.  Just a few comments below.
>
> Alex
>
>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  12 +-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |   1 +
>>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 241 +++++++++++++++++-
>>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  10 +-
>>  4 files changed, 252 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index 1424f2cc2076c1..d791a360a92366 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -404,9 +404,9 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f,
>>                 /* umc ce/ue error injection for a bad page is not allowed */
>>                 if ((data.head.block == AMDGPU_RAS_BLOCK__UMC) &&
>>                     amdgpu_ras_check_bad_page(adev, data.inject.address)) {
>> -                       dev_warn(adev->dev, "RAS WARN: 0x%llx has been marked "
>> -                                       "as bad before error injection!\n",
>> -                                       data.inject.address);
>> +                       dev_warn(adev->dev, "RAS WARN: inject: 0x%llx has "
>> +                                "already been marked as bad!\n",
>> +                                data.inject.address);
> This seems unrelated to this patch.

It's just cosmetic fix, to correctly align, as it seems that the previous alignment was arbitrary.
Just pressing TAB in Emacs does wonders. :-)

I was in this file and decided to fix this. It's just cosmetic. No functional change.

>
>>                         break;
>>                 }
>>
>> @@ -1301,6 +1301,12 @@ static struct dentry *amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *
>>                            &con->bad_page_cnt_threshold);
>>         debugfs_create_x32("ras_hw_enabled", 0444, dir, &adev->ras_hw_enabled);
>>         debugfs_create_x32("ras_enabled", 0444, dir, &adev->ras_enabled);
>> +       debugfs_create_file("ras_eeprom_size", S_IRUGO, dir, adev,
>> +                           &amdgpu_ras_debugfs_eeprom_size_ops);
>> +       con->de_ras_eeprom_table = debugfs_create_file("ras_eeprom_table",
>> +                                                      S_IRUGO, dir, adev,
>> +                                                      &amdgpu_ras_debugfs_eeprom_table_ops);
>> +       amdgpu_ras_debugfs_set_ret_size(&con->eeprom_control);
>>
>>         /*
>>          * After one uncorrectable error happens, usually GPU recovery will
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> index 256cea5d34f2b6..283afd791db107 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
>> @@ -318,6 +318,7 @@ struct amdgpu_ras {
>>         /* sysfs */
>>         struct device_attribute features_attr;
>>         struct bin_attribute badpages_attr;
>> +       struct dentry *de_ras_eeprom_table;
>>         /* block array */
>>         struct ras_manager *objs;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
>> index dc4a845a32404c..677e379f5fb5e9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
>> @@ -27,6 +27,8 @@
>>  #include <linux/bits.h>
>>  #include "atom.h"
>>  #include "amdgpu_eeprom.h"
>> +#include <linux/debugfs.h>
>> +#include <linux/uaccess.h>
>>
>>  #define EEPROM_I2C_MADDR_VEGA20         0x0
>>  #define EEPROM_I2C_MADDR_ARCTURUS       0x40000
>> @@ -70,6 +72,13 @@
>>  #define RAS_OFFSET_TO_INDEX(_C, _O) (((_O) - \
>>                                       (_C)->ras_record_offset) / RAS_TABLE_RECORD_SIZE)
>>
>> +/* Given a 0-based relative record index, 0, 1, 2, ..., etc., off
>> + * of "fri", return the absolute record index off of the end of
>> + * the table header.
>> + */
>> +#define RAS_RI_TO_AI(_C, _I) (((_I) + (_C)->ras_fri) % \
>> +                             (_C)->ras_max_record_count)
>> +
>>  #define RAS_NUM_RECS(_tbl_hdr)  (((_tbl_hdr)->tbl_size - \
>>                                   RAS_TABLE_HEADER_SIZE) / RAS_TABLE_RECORD_SIZE)
>>
>> @@ -77,13 +86,10 @@
>>
>>  static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
>>  {
>> -       if ((adev->asic_type == CHIP_VEGA20) ||
>> -           (adev->asic_type == CHIP_ARCTURUS) ||
>> -           (adev->asic_type == CHIP_SIENNA_CICHLID) ||
>> -           (adev->asic_type == CHIP_ALDEBARAN))
>> -               return true;
>> -
>> -       return false;
>> +       return  adev->asic_type == CHIP_VEGA20 ||
>> +               adev->asic_type == CHIP_ARCTURUS ||
>> +               adev->asic_type == CHIP_SIENNA_CICHLID ||
>> +               adev->asic_type == CHIP_ALDEBARAN;
> Unrelated whitespace change.

It's more readable and succinct like this, no?

Do you want me to revert these? I mean, they're pleasing to have and change no functionality, and since I was in this file...

Regards,
Luben

>
>>  }
>>
>>  static bool __get_eeprom_i2c_addr_arct(struct amdgpu_device *adev,
>> @@ -258,6 +264,8 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
>>         control->ras_num_recs = 0;
>>         control->ras_fri = 0;
>>
>> +       amdgpu_ras_debugfs_set_ret_size(control);
>> +
>>         mutex_unlock(&control->ras_tbl_mutex);
>>
>>         return res;
>> @@ -591,6 +599,8 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
>>         res = amdgpu_ras_eeprom_append_table(control, record, num);
>>         if (!res)
>>                 res = amdgpu_ras_eeprom_update_header(control);
>> +       if (!res)
>> +               amdgpu_ras_debugfs_set_ret_size(control);
>>
>>         mutex_unlock(&control->ras_tbl_mutex);
>>         return res;
>> @@ -734,6 +744,223 @@ inline uint32_t amdgpu_ras_eeprom_max_record_count(void)
>>         return RAS_MAX_RECORD_COUNT;
>>  }
>>
>> +static ssize_t
>> +amdgpu_ras_debugfs_eeprom_size_read(struct file *f, char __user *buf,
>> +                                   size_t size, loff_t *pos)
>> +{
>> +       struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
>> +       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>> +       struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
>> +       u8 data[50];
>> +       int res;
>> +
>> +       if (!size)
>> +               return size;
>> +
>> +       if (!ras || !control) {
>> +               res = snprintf(data, sizeof(data), "Not supported\n");
>> +       } else {
>> +               res = snprintf(data, sizeof(data), "%d bytes or %d records\n",
>> +                              RAS_TBL_SIZE_BYTES, control->ras_max_record_count);
>> +       }
>> +
>> +       if (*pos >= res)
>> +               return 0;
>> +
>> +       res -= *pos;
>> +       res = min_t(size_t, res, size);
>> +
>> +       if (copy_to_user(buf, &data[*pos], res))
>> +               return -EINVAL;
>> +
>> +       *pos += res;
>> +
>> +       return res;
>> +}
>> +
>> +const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops = {
>> +       .owner = THIS_MODULE,
>> +       .read = amdgpu_ras_debugfs_eeprom_size_read,
>> +       .write = NULL,
>> +       .llseek = default_llseek,
>> +};
>> +
>> +static const char *tbl_hdr_str = " Signature    Version  FirstOffs       Size   Checksum\n";
>> +static const char *tbl_hdr_fmt = "0x%08X 0x%08X 0x%08X 0x%08X 0x%08X\n";
>> +#define tbl_hdr_fmt_size (5 * (2+8) + 4 + 1)
>> +static const char *rec_hdr_str = "Index  Offset ErrType Bank/CU          TimeStamp      Offs/Addr MemChl MCUMCID    RetiredPage\n";
>> +static const char *rec_hdr_fmt = "%5d 0x%05X %7s    0x%02X 0x%016llX 0x%012llX   0x%02X    0x%02X 0x%012llX\n";
>> +#define rec_hdr_fmt_size (5 + 1 + 7 + 1 + 7 + 1 + 7 + 1 + 18 + 1 + 14 + 1 + 6 + 1 + 7 + 1 + 14 + 1)
>> +
>> +static const char *record_err_type_str[AMDGPU_RAS_EEPROM_ERR_COUNT] = {
>> +       "ignore",
>> +       "re",
>> +       "ue",
>> +};
>> +
>> +static loff_t amdgpu_ras_debugfs_table_size(struct amdgpu_ras_eeprom_control *control)
>> +{
>> +       return strlen(tbl_hdr_str) + tbl_hdr_fmt_size +
>> +               strlen(rec_hdr_str) + rec_hdr_fmt_size * control->ras_num_recs;
>> +}
>> +
>> +void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control)
>> +{
>> +       struct amdgpu_ras *ras = container_of(control, struct amdgpu_ras,
>> +                                             eeprom_control);
>> +       struct dentry *de = ras->de_ras_eeprom_table;
>> +
>> +       if (de)
>> +               d_inode(de)->i_size = amdgpu_ras_debugfs_table_size(control);
>> +}
>> +
>> +static ssize_t amdgpu_ras_debugfs_table_read(struct file *f, char __user *buf,
>> +                                            size_t size, loff_t *pos)
>> +{
>> +       struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
>> +       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>> +       struct amdgpu_ras_eeprom_control *control = &ras->eeprom_control;
>> +       const size_t orig_size = size;
>> +       int res = -EINVAL;
>> +       size_t data_len;
>> +
>> +       mutex_lock(&control->ras_tbl_mutex);
>> +
>> +       /* We want *pos - data_len > 0, which means there's
>> +        * bytes to be printed from data.
>> +        */
>> +       data_len = strlen(tbl_hdr_str);
>> +       if (*pos < data_len) {
>> +               data_len -= *pos;
>> +               data_len = min_t(size_t, data_len, size);
>> +               if (copy_to_user(buf, &tbl_hdr_str[*pos], data_len))
>> +                       goto Out;
>> +               buf += data_len;
>> +               size -= data_len;
>> +               *pos += data_len;
>> +       }
>> +
>> +       data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size;
>> +       if (*pos < data_len && size > 0) {
>> +               u8 data[tbl_hdr_fmt_size + 1];
>> +               loff_t lpos;
>> +
>> +               snprintf(data, sizeof(data), tbl_hdr_fmt,
>> +                        control->tbl_hdr.header,
>> +                        control->tbl_hdr.version,
>> +                        control->tbl_hdr.first_rec_offset,
>> +                        control->tbl_hdr.tbl_size,
>> +                        control->tbl_hdr.checksum);
>> +
>> +               data_len -= *pos;
>> +               data_len = min_t(size_t, data_len, size);
>> +               lpos = *pos - strlen(tbl_hdr_str);
>> +               if (copy_to_user(buf, &data[lpos], data_len))
>> +                       goto Out;
>> +               buf += data_len;
>> +               size -= data_len;
>> +               *pos += data_len;
>> +       }
>> +
>> +       data_len = strlen(tbl_hdr_str) + tbl_hdr_fmt_size + strlen(rec_hdr_str);
>> +       if (*pos < data_len && size > 0) {
>> +               loff_t lpos;
>> +
>> +               data_len -= *pos;
>> +               data_len = min_t(size_t, data_len, size);
>> +               lpos = *pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size;
>> +               if (copy_to_user(buf, &rec_hdr_str[lpos], data_len))
>> +                       goto Out;
>> +               buf += data_len;
>> +               size -= data_len;
>> +               *pos += data_len;
>> +       }
>> +
>> +       data_len = amdgpu_ras_debugfs_table_size(control);
>> +       if (*pos < data_len && size > 0) {
>> +               u8 dare[RAS_TABLE_RECORD_SIZE];
>> +               u8 data[rec_hdr_fmt_size + 1];
>> +               /* Find the starting record index
>> +                */
>> +               int s = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
>> +                        strlen(rec_hdr_str)) / rec_hdr_fmt_size;
>> +               int r = (*pos - strlen(tbl_hdr_str) - tbl_hdr_fmt_size -
>> +                        strlen(rec_hdr_str)) % rec_hdr_fmt_size;
>> +               struct eeprom_table_record record;
>> +
>> +               for ( ; size > 0 && s < control->ras_num_recs; s++) {
>> +                       u32 ai = RAS_RI_TO_AI(control, s);
>> +                       /* Read a single record
>> +                        */
>> +                       res = __amdgpu_ras_eeprom_read(control, dare, ai, 1);
>> +                       if (res)
>> +                               goto Out;
>> +                       __decode_table_record_from_buf(control, &record, dare);
>> +                       snprintf(data, sizeof(data), rec_hdr_fmt,
>> +                                s,
>> +                                RAS_INDEX_TO_OFFSET(control, ai),
>> +                                record_err_type_str[record.err_type],
>> +                                record.bank,
>> +                                record.ts,
>> +                                record.offset,
>> +                                record.mem_channel,
>> +                                record.mcumc_id,
>> +                                record.retired_page);
>> +
>> +                       data_len = min_t(size_t, rec_hdr_fmt_size - r, size);
>> +                       if (copy_to_user(buf, &data[r], data_len))
>> +                               return -EINVAL;
>> +                       buf += data_len;
>> +                       size -= data_len;
>> +                       *pos += data_len;
>> +                       r = 0;
>> +               }
>> +       }
>> +       res = 0;
>> +Out:
>> +       mutex_unlock(&control->ras_tbl_mutex);
>> +       return res < 0 ? res : orig_size - size;
>> +}
>> +
>> +static ssize_t
>> +amdgpu_ras_debugfs_eeprom_table_read(struct file *f, char __user *buf,
>> +                                    size_t size, loff_t *pos)
>> +{
>> +       struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
>> +       struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
>> +       struct amdgpu_ras_eeprom_control *control = ras ? &ras->eeprom_control : NULL;
>> +       u8 data[81];
>> +       int res;
>> +
>> +       if (!size)
>> +               return size;
>> +
>> +       if (!ras || !control) {
>> +               res = snprintf(data, sizeof(data), "Not supported\n");
>> +               if (*pos >= res)
>> +                       return 0;
>> +
>> +               res -= *pos;
>> +               res = min_t(size_t, res, size);
>> +
>> +               if (copy_to_user(buf, &data[*pos], res))
>> +                       return -EINVAL;
>> +
>> +               *pos += res;
>> +
>> +               return res;
>> +       } else {
>> +               return amdgpu_ras_debugfs_table_read(f, buf, size, pos);
>> +       }
>> +}
>> +
>> +const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops = {
>> +       .owner = THIS_MODULE,
>> +       .read = amdgpu_ras_debugfs_eeprom_table_read,
>> +       .write = NULL,
>> +       .llseek = default_llseek,
>> +};
>> +
>>  /**
>>   * __verify_ras_table_checksum -- verify the RAS EEPROM table checksum
>>   * @control: pointer to control structure
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
>> index edb0195ea2eb8c..430e08ab3313a2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
>> @@ -29,9 +29,10 @@
>>  struct amdgpu_device;
>>
>>  enum amdgpu_ras_eeprom_err_type {
>> -       AMDGPU_RAS_EEPROM_ERR_PLACE_HOLDER,
>> +       AMDGPU_RAS_EEPROM_ERR_NA,
>>         AMDGPU_RAS_EEPROM_ERR_RECOVERABLE,
>> -       AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE
>> +       AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE,
>> +       AMDGPU_RAS_EEPROM_ERR_COUNT,
>>  };
>>
>>  struct amdgpu_ras_eeprom_table_header {
>> @@ -121,4 +122,9 @@ int amdgpu_ras_eeprom_append(struct amdgpu_ras_eeprom_control *control,
>>
>>  inline uint32_t amdgpu_ras_eeprom_max_record_count(void);
>>
>> +void amdgpu_ras_debugfs_set_ret_size(struct amdgpu_ras_eeprom_control *control);
>> +
>> +extern const struct file_operations amdgpu_ras_debugfs_eeprom_size_ops;
>> +extern const struct file_operations amdgpu_ras_debugfs_eeprom_table_ops;
>> +
>>  #endif // _AMDGPU_RAS_EEPROM_H
>> --
>> 2.32.0
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@xxxxxxxxxxxxxxxxxxxxx
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=04%7C01%7Cluben.tuikov%40amd.com%7C44ce5499d22045e9181108d92cfcab41%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637590285983174149%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&amp;sdata=x5YnmU8DjF1AwNdP6s04B0%2F47%2BxOxaZKou2cairZ3t0%3D&amp;reserved=0

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx