RE: [PATCH] drm/amdgpu: format old RAS eeprom data into V3

"Xie, Patrick" <Gangliang.Xie@xxxxxxx> · Thu, 13 Mar 2025 03:41:18 +0000

[AMD Official Use Only - AMD Internal Distribution Only]

Hi, Tao:
        I am worried about a host reboot or power-down during the EEPROM formatting, which would cause the bad page info to be lost.
        If this issue needs to be considered, I suggest saving the bad page info on the host disk before EEPROM formatting, and making a mark on the EEPROM.

-----Original Message-----
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx
Sent: Wednesday, March 12, 2025 6:09 PM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Subject: amd-gfx Digest, Vol 106, Issue 157

Send amd-gfx mailing list submissions to
        amd-gfx@xxxxxxxxxxxxxxxxxxxxx

To subscribe or unsubscribe via the World Wide Web, visit
        https://lists.freedesktop.org/mailman/listinfo/amd-gfx
or, via email, send a message with subject or body 'help' to
        amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx

You can reach the person managing the list at
        amd-gfx-owner@xxxxxxxxxxxxxxxxxxxxx

When replying, please edit your Subject line so it is more specific than "Re: Contents of amd-gfx digest..."


Today's Topics:

   1. [PATCH] drm/amdgpu: format old RAS eeprom data into V3
      version (Tao Zhou)
   2. Re: [PATCH 2/2] drm/amdgpu: Make use of drm_wedge_app_info
      (Raag Jadav)
   3. [PATCH] drm/amdgpu/pm: Handle SCLK offset correctly in
      overdrive for smu 14.0.2 (Tomasz Pakuła)


----------------------------------------------------------------------

Message: 1
Date: Wed, 12 Mar 2025 18:05:48 +0800
From: Tao Zhou <tao.zhou1@xxxxxxx>
To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx>
Cc: Tao Zhou <tao.zhou1@xxxxxxx>
Subject: [PATCH] drm/amdgpu: format old RAS eeprom data into V3
        version
Message-ID: <20250312100548.283389-1-tao.zhou1@xxxxxxx>
Content-Type: text/plain

Clear old data and save it in V3 format.

v2: only format eeprom data for new ASICs.

Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  7 +++++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 26 ++++++++++---------
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  1 +
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 837f33698b38..d3b9b4d9fb89 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3465,6 +3465,13 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
                                adev, control->bad_channel_bitmap);
                        con->update_channel_flag = false;
                }
+
+               /* The format action is only applied to new ASICs */
+               if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 &&
+                   control->tbl_hdr.version < RAS_TABLE_VER_V3)
+                       if (!amdgpu_ras_eeprom_reset_table(control))
+                               if (amdgpu_ras_save_bad_pages(adev, NULL))
+                                       dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
        }

        return ret;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 09a6f8bc1a5a..71dddb8983ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -413,9 +413,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct amdgpu_ras_eeprom_control

        switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
        case IP_VERSION(8, 10, 0):
-       case IP_VERSION(12, 0, 0):
                hdr->version = RAS_TABLE_VER_V2_1;
                return;
+       case IP_VERSION(12, 0, 0):
+               hdr->version = RAS_TABLE_VER_V3;
+               return;
        default:
                hdr->version = RAS_TABLE_VER_V1;
                return;
@@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
        hdr->header = RAS_TABLE_HDR_VAL;
        amdgpu_ras_set_eeprom_table_version(control);

-       if (hdr->version == RAS_TABLE_VER_V2_1) {
+       if (hdr->version >= RAS_TABLE_VER_V2_1) {
                hdr->first_rec_offset = RAS_RECORD_START_V2_1;
                hdr->tbl_size = RAS_TABLE_HEADER_SIZE +
                                RAS_TABLE_V2_1_INFO_SIZE;
@@ -461,7 +463,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
        }

        csum = __calc_hdr_byte_sum(control);
-       if (hdr->version == RAS_TABLE_VER_V2_1)
+       if (hdr->version >= RAS_TABLE_VER_V2_1)
                csum += __calc_ras_info_byte_sum(control);
        csum = -csum;
        hdr->checksum = csum;
@@ -752,7 +754,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
                        "Saved bad pages %d reaches threshold value %d\n",
                        control->ras_num_bad_pages, ras->bad_page_cnt_threshold);
                control->tbl_hdr.header = RAS_TABLE_HDR_BAD;
-               if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) {
+               if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) {
                        control->tbl_rai.rma_status = GPU_RETIRED__ECC_REACH_THRESHOLD;
                        control->tbl_rai.health_percent = 0;
                }
@@ -765,7 +767,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
                amdgpu_dpm_send_rma_reason(adev);
        }

-       if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+       if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE +
                                            RAS_TABLE_V2_1_INFO_SIZE +
                                            control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -805,7 +807,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
         * now calculate gpu health percent
         */
        if (amdgpu_bad_page_threshold != 0 &&
-           control->tbl_hdr.version == RAS_TABLE_VER_V2_1 &&
+           control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 &&
            control->ras_num_bad_pages <= ras->bad_page_cnt_threshold)
                control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold -
                                                   control->ras_num_bad_pages) * 100) /
@@ -818,7 +820,7 @@ amdgpu_ras_eeprom_update_header(struct amdgpu_ras_eeprom_control *control)
                csum += *pp;

        csum += __calc_hdr_byte_sum(control);
-       if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+       if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                csum += __calc_ras_info_byte_sum(control);
        /* avoid sign extension when assigning to "checksum" */
        csum = -csum;
@@ -1035,7 +1037,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct amdgpu_ras_eeprom_control *co
        /* get available eeprom table version first before eeprom table init */
        amdgpu_ras_set_eeprom_table_version(control);

-       if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+       if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                return RAS_MAX_RECORD_COUNT_V2_1;
        else
                return RAS_MAX_RECORD_COUNT;
@@ -1280,7 +1282,7 @@ static int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control
        int buf_size, res;
        u8  csum, *buf, *pp;

-       if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1)
+       if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1)
                buf_size = RAS_TABLE_HEADER_SIZE +
                           RAS_TABLE_V2_1_INFO_SIZE +
                           control->ras_num_recs * RAS_TABLE_RECORD_SIZE;
@@ -1383,7 +1385,7 @@ int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control)

        __decode_table_header_from_buf(hdr, buf);

-       if (hdr->version == RAS_TABLE_VER_V2_1) {
+       if (hdr->version >= RAS_TABLE_VER_V2_1) {
                control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
                control->ras_record_offset = RAS_RECORD_START_V2_1;
                control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
@@ -1423,7 +1425,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
                DRM_DEBUG_DRIVER("Found existing EEPROM table with %d records",
                                 control->ras_num_bad_pages);

-               if (hdr->version == RAS_TABLE_VER_V2_1) {
+               if (hdr->version >= RAS_TABLE_VER_V2_1) {
                        res = __read_table_ras_info(control);
                        if (res)
                                return res;
@@ -1443,7 +1445,7 @@ int amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control)
                                        ras->bad_page_cnt_threshold);
        } else if (hdr->header == RAS_TABLE_HDR_BAD &&
                   amdgpu_bad_page_threshold != 0) {
-               if (hdr->version == RAS_TABLE_VER_V2_1) {
+               if (hdr->version >= RAS_TABLE_VER_V2_1) {
                        res = __read_table_ras_info(control);
                        if (res)
                                return res;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 13f7eda9a696..ec6d7ea37ad0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -28,6 +28,7 @@

 #define RAS_TABLE_VER_V1           0x00010000
 #define RAS_TABLE_VER_V2_1         0x00021000
+#define RAS_TABLE_VER_V3           0x00030000

 struct amdgpu_device;

--
2.34.1



------------------------------

Message: 2
Date: Tue, 11 Mar 2025 19:13:15 +0200
From: Raag Jadav <raag.jadav@xxxxxxxxx>
To: Alex Deucher <alexdeucher@xxxxxxxxx>
Cc: André Almeida <andrealmeid@xxxxxxxxxx>,
        dri-devel@xxxxxxxxxxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx,
        kernel-dev@xxxxxxxxxx, amd-gfx@xxxxxxxxxxxxxxxxxxxxx,
        intel-xe@xxxxxxxxxxxxxxxxxxxxx, intel-gfx@xxxxxxxxxxxxxxxxxxxxx, Alex
        Deucher <alexander.deucher@xxxxxxx>, Christian König
        <christian.koenig@xxxxxxx>, siqueira@xxxxxxxxxx, airlied@xxxxxxxxx,
        simona@xxxxxxxx, rodrigo.vivi@xxxxxxxxx, jani.nikula@xxxxxxxxxxxxxxx,
        Xaver Hugl <xaver.hugl@xxxxxxx>
Subject: Re: [PATCH 2/2] drm/amdgpu: Make use of drm_wedge_app_info
Message-ID: <Z9BvK55_Nim54eOu@xxxxxxxxxxxxxxxxxx>
Content-Type: text/plain; charset=utf-8

On Mon, Mar 10, 2025 at 06:03:27PM -0400, Alex Deucher wrote:
> On Mon, Mar 10, 2025 at 5:54 PM André Almeida <andrealmeid@xxxxxxxxxx> wrote:
> >
> > Em 01/03/2025 03:04, Raag Jadav escreveu:
> > > On Fri, Feb 28, 2025 at 06:49:43PM -0300, André Almeida wrote:
> > >> Hi Raag,
> > >>
> > >> On 2/28/25 11:58, Raag Jadav wrote:
> > >>> On Fri, Feb 28, 2025 at 09:13:53AM -0300, André Almeida wrote:
> > >>>> To notify userspace about which app (if any) made the device get in a
> > >>>> wedge state, make use of drm_wedge_app_info parameter, filling it with
> > >>>> the app PID and name.
> > >>>>
> > >>>> Signed-off-by: André Almeida <andrealmeid@xxxxxxxxxx>
> > >>>> ---
> > >>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++--
> > >>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |  6 +++++-
> > >>>>    2 files changed, 22 insertions(+), 3 deletions(-)
> > >>>>
> > >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > >>>> index 00b9b87dafd8..e06adf6f34fd 100644
> > >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > >>>> @@ -6123,8 +6123,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> > >>>>            atomic_set(&adev->reset_domain->reset_res, r);
> > >>>> -  if (!r)
> > >>>> -          drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
> > >>>> +  if (!r) {
> > >>>> +          struct drm_wedge_app_info aux, *info = NULL;
> > >>>> +
> > >>>> +          if (job) {
> > >>>> +                  struct amdgpu_task_info *ti;
> > >>>> +
> > >>>> +                  ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid);
> > >>>> +                  if (ti) {
> > >>>> +                          aux.pid = ti->pid;
> > >>>> +                          aux.comm = ti->process_name;
> > >>>> +                          info = &aux;
> > >>>> +                          amdgpu_vm_put_task_info(ti);
> > >>>> +                  }
> > >>>> +          }
> > >>> Is this guaranteed to be guilty app and not some scheduled worker?
> > >>
> > >> This is how amdgpu decides which app is the guilty one earlier in the code
> > >> as in the print:
> > >>
> > >>      ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
> > >>
> > >>      "Process information: process %s pid %d thread %s pid %d\n"
> > >>
> > >> So I think it's consistent with what the driver thinks is the guilty
> > >> process.
> > >
> > > Sure, but with something like app_info we're kind of hinting to userspace
> > > that an application was _indeed_ involved with reset. Is that also guaranteed?
> > >
> > > Is it possible that an application needlessly suffers from a false positive
> > > scenario (reset due to other factors)?
> > >
> >
> > I asked Alex Deucher in IRC about that and yes, there's a chance that
> > this is a false positive. However, for the majority of cases this is the
> > right app that caused the hang. This is what amdgpu is doing for GL
> > robustness as well and devcoredump, so it's very consistent with how
> > amdgpu deals with this scenario even if the mechanism is still not perfect.
>
> It's usually the guilty one, but it's not guaranteed.  For example,
> say you have a ROCm user queue and a gfx job submitted to a kernel
> queue.  The actual guilty job may be the ROCm user queue, but the
> driver may not detect that the ROCm queue was hung until some other
> event (e.g., memory pressure).  However, the timer for the gfx job may
> timeout before that happens on the ROCm queue so in that case the gfx
> job would be incorrectly considered guilty.

So it boils down to what are the chances of that happening and whether
it's significant enough to open the door for API abuse.

Considering this is amd specific accuracy, it's still an open question
how other drivers are/will be managing it.

Raag


------------------------------

Message: 3
Date: Tue, 11 Mar 2025 22:38:33 +0100
From: Tomasz Pakuła <tomasz.pakula.oficjalny@xxxxxxxxx>
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: alexander.deucher@xxxxxxx
Subject: [PATCH] drm/amdgpu/pm: Handle SCLK offset correctly in
        overdrive for smu 14.0.2
Message-ID:
        <20250311213833.870840-1-tomasz.pakula.oficjalny@xxxxxxxxx>
Content-Type: text/plain; charset=UTF-8

Currently, it seems like the code was carried over from RDNA3 because
it assumes two possible values to set. RDNA4, instead of having:
0: min SCLK
1: max SCLK
only has
0: SCLK offset

This change makes it so it only reports current offset value instead of
showing possible min/max values and their indices. Moreover, it now only
accepts the offset as a value, without an index.

Additionally, the lower bound was printed as %u by mistake.

Old:
OD_SCLK_OFFSET:
0: -500Mhz
1: 1000Mhz
OD_MCLK:
0: 97Mhz
1: 1259MHz
OD_VDDGFX_OFFSET:
0mV
OD_RANGE:
SCLK_OFFSET:    -500Mhz       1000Mhz
MCLK:      97Mhz       1500Mhz
VDDGFX_OFFSET:    -200mv          0mv

New:
OD_SCLK_OFFSET:
0Mhz
OD_MCLK:
0: 97Mhz
1: 1259MHz
OD_VDDGFX_OFFSET:
0mV
OD_RANGE:
SCLK_OFFSET:    -500Mhz       1000Mhz
MCLK:      97Mhz       1500Mhz
VDDGFX_OFFSET:    -200mv          0mv

Setting this offset:
Old: "s 1 <offset>"
New: "s <offset>"

Signed-off-by: Tomasz Pakuła <tomasz.pakula.oficjalny@xxxxxxxxx>
---
 .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c  | 59 ++++++-------------
 1 file changed, 18 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
index 5cad09c5f2ff..62bd9647541a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
@@ -1193,16 +1193,9 @@ static int smu_v14_0_2_print_clk_levels(struct smu_context *smu,
                                                         PP_OD_FEATURE_GFXCLK_BIT))
                        break;

-               PPTable_t *pptable = smu->smu_table.driver_pptable;
-               const OverDriveLimits_t * const overdrive_upperlimits =
-                                       &pptable->SkuTable.OverDriveLimitsBasicMax;
-               const OverDriveLimits_t * const overdrive_lowerlimits =
-                                       &pptable->SkuTable.OverDriveLimitsBasicMin;
-
                size += sysfs_emit_at(buf, size, "OD_SCLK_OFFSET:\n");
-               size += sysfs_emit_at(buf, size, "0: %dMhz\n1: %uMhz\n",
-                                       overdrive_lowerlimits->GfxclkFoffset,
-                                       overdrive_upperlimits->GfxclkFoffset);
+               size += sysfs_emit_at(buf, size, "%dMhz\n",
+                                       od_table->OverDriveTable.GfxclkFoffset);
                break;

        case SMU_OD_MCLK:
@@ -1336,13 +1329,9 @@ static int smu_v14_0_2_print_clk_levels(struct smu_context *smu,
                size += sysfs_emit_at(buf, size, "%s:\n", "OD_RANGE");

                if (smu_v14_0_2_is_od_feature_supported(smu, PP_OD_FEATURE_GFXCLK_BIT)) {
-                       smu_v14_0_2_get_od_setting_limits(smu,
-                                                         PP_OD_FEATURE_GFXCLK_FMIN,
-                                                         &min_value,
-                                                         NULL);
                        smu_v14_0_2_get_od_setting_limits(smu,
                                                          PP_OD_FEATURE_GFXCLK_FMAX,
-                                                         NULL,
+                                                         &min_value,
                                                          &max_value);
                        size += sysfs_emit_at(buf, size, "SCLK_OFFSET: %7dMhz %10uMhz\n",
                                              min_value, max_value);
@@ -2417,36 +2406,24 @@ static int smu_v14_0_2_od_edit_dpm_table(struct smu_context *smu,
                        return -ENOTSUPP;
                }

-               for (i = 0; i < size; i += 2) {
-                       if (i + 2 > size) {
-                               dev_info(adev->dev, "invalid number of input parameters %d\n", size);
-                               return -EINVAL;
-                       }
-
-                       switch (input[i]) {
-                       case 1:
-                               smu_v14_0_2_get_od_setting_limits(smu,
-                                                                 PP_OD_FEATURE_GFXCLK_FMAX,
-                                                                 &minimum,
-                                                                 &maximum);
-                               if (input[i + 1] < minimum ||
-                                   input[i + 1] > maximum) {
-                                       dev_info(adev->dev, "GfxclkFmax (%ld) must be within [%u, %u]!\n",
-                                               input[i + 1], minimum, maximum);
-                                       return -EINVAL;
-                               }
-
-                               od_table->OverDriveTable.GfxclkFoffset = input[i + 1];
-                               od_table->OverDriveTable.FeatureCtrlMask |= 1U << PP_OD_FEATURE_GFXCLK_BIT;
-                               break;
+               if (size != 1) {
+                       dev_info(adev->dev, "invalid number of input parameters %d\n", size);
+                       return -EINVAL;
+               }

-                       default:
-                               dev_info(adev->dev, "Invalid SCLK_VDDC_TABLE index: %ld\n", input[i]);
-                               dev_info(adev->dev, "Supported indices: [0:min,1:max]\n");
-                               return -EINVAL;
-                       }
+               smu_v14_0_2_get_od_setting_limits(smu,
+                                                 PP_OD_FEATURE_GFXCLK_FMAX,
+                                                 &minimum,
+                                                 &maximum);
+               if (input[0] < minimum ||
+                   input[0] > maximum) {
+                       dev_info(adev->dev, "GfxclkFoffset must be within [%d, %u]!\n",
+                                minimum, maximum);
+                       return -EINVAL;
                }

+               od_table->OverDriveTable.GfxclkFoffset = input[0];
+               od_table->OverDriveTable.FeatureCtrlMask |= 1U << PP_OD_FEATURE_GFXCLK_BIT;
                break;

        case PP_OD_EDIT_MCLK_VDDC_TABLE:
--
2.48.1



------------------------------

Subject: Digest Footer

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


------------------------------

End of amd-gfx Digest, Vol 106, Issue 157
*****************************************