[AMD Official Use Only - AMD Internal Distribution Only] Hi Patrick, I know your concern, but I think the sudden power off is not a usual case in server platforms. Regards, Tao > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Xie, Patrick > Sent: Thursday, March 13, 2025 11:41 AM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Subject: RE: [PATCH] drm/amdgpu: format old RAS eeprom data into V3 > > [AMD Official Use Only - AMD Internal Distribution Only] > > [AMD Official Use Only - AMD Internal Distribution Only] > > Hi, Tao: > I am worried about host reboot or power down during the eeprom formatting, > which will cause the bad page info to be lost. > If the issue needs to be considered, I suggest saving bad page info on host disk > before eeprom formatting, and making a mark on eeprom > > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of amd-gfx- > request@xxxxxxxxxxxxxxxxxxxxx > Sent: Wednesday, March 12, 2025 6:09 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Subject: amd-gfx Digest, Vol 106, Issue 157 > > Send amd-gfx mailing list submissions to > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > > To subscribe or unsubscribe via the World Wide Web, visit > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > or, via email, send a message with subject or body 'help' to > amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx > > You can reach the person managing the list at > amd-gfx-owner@xxxxxxxxxxxxxxxxxxxxx > > When replying, please edit your Subject line so it is more specific than "Re: Contents > of amd-gfx digest..." > > > Today's Topics: > > 1. [PATCH] drm/amdgpu: format old RAS eeprom data into V3 > version (Tao Zhou) > 2. Re: [PATCH 2/2] drm/amdgpu: Make use of drm_wedge_app_info > (Raag Jadav) > 3. 
[PATCH] drm/amdgpu/pm: Handle SCLK offset correctly in > overdrive for smu 14.0.2 (Tomasz Paku?a) > > > ---------------------------------------------------------------------- > > Message: 1 > Date: Wed, 12 Mar 2025 18:05:48 +0800 > From: Tao Zhou <tao.zhou1@xxxxxxx> > To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> > Cc: Tao Zhou <tao.zhou1@xxxxxxx> > Subject: [PATCH] drm/amdgpu: format old RAS eeprom data into V3 > version > Message-ID: <20250312100548.283389-1-tao.zhou1@xxxxxxx> > Content-Type: text/plain > > Clear old data and save it in V3 format. > > v2: only format eeprom data for new ASICs. > > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 7 +++++ > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 26 ++++++++++--------- > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h | 1 + > 3 files changed, 22 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 837f33698b38..d3b9b4d9fb89 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3465,6 +3465,13 @@ int amdgpu_ras_init_badpage_info(struct > amdgpu_device *adev) > adev, control->bad_channel_bitmap); > con->update_channel_flag = false; > } > + > + /* The format action is only applied to new ASICs */ > + if (IP_VERSION_MAJ(amdgpu_ip_version(adev, UMC_HWIP, 0)) >= 12 > && > + control->tbl_hdr.version < RAS_TABLE_VER_V3) > + if (!amdgpu_ras_eeprom_reset_table(control)) > + if (amdgpu_ras_save_bad_pages(adev, NULL)) > + dev_warn(adev->dev, "Failed to > +format RAS EEPROM data in V3 version!\n"); > } > > return ret; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > index 09a6f8bc1a5a..71dddb8983ee 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > @@ -413,9 +413,11 @@ static void amdgpu_ras_set_eeprom_table_version(struct 
> amdgpu_ras_eeprom_control > > switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) { > case IP_VERSION(8, 10, 0): > - case IP_VERSION(12, 0, 0): > hdr->version = RAS_TABLE_VER_V2_1; > return; > + case IP_VERSION(12, 0, 0): > + hdr->version = RAS_TABLE_VER_V3; > + return; > default: > hdr->version = RAS_TABLE_VER_V1; > return; > @@ -443,7 +445,7 @@ int amdgpu_ras_eeprom_reset_table(struct > amdgpu_ras_eeprom_control *control) > hdr->header = RAS_TABLE_HDR_VAL; > amdgpu_ras_set_eeprom_table_version(control); > > - if (hdr->version == RAS_TABLE_VER_V2_1) { > + if (hdr->version >= RAS_TABLE_VER_V2_1) { > hdr->first_rec_offset = RAS_RECORD_START_V2_1; > hdr->tbl_size = RAS_TABLE_HEADER_SIZE + > RAS_TABLE_V2_1_INFO_SIZE; @@ -461,7 +463,7 @@ int > amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control) > } > > csum = __calc_hdr_byte_sum(control); > - if (hdr->version == RAS_TABLE_VER_V2_1) > + if (hdr->version >= RAS_TABLE_VER_V2_1) > csum += __calc_ras_info_byte_sum(control); > csum = -csum; > hdr->checksum = csum; > @@ -752,7 +754,7 @@ amdgpu_ras_eeprom_update_header(struct > amdgpu_ras_eeprom_control *control) > "Saved bad pages %d reaches threshold value %d\n", > control->ras_num_bad_pages, ras->bad_page_cnt_threshold); > control->tbl_hdr.header = RAS_TABLE_HDR_BAD; > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) { > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) { > control->tbl_rai.rma_status = > GPU_RETIRED__ECC_REACH_THRESHOLD; > control->tbl_rai.health_percent = 0; > } > @@ -765,7 +767,7 @@ amdgpu_ras_eeprom_update_header(struct > amdgpu_ras_eeprom_control *control) > amdgpu_dpm_send_rma_reason(adev); > } > > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) > control->tbl_hdr.tbl_size = RAS_TABLE_HEADER_SIZE + > RAS_TABLE_V2_1_INFO_SIZE + > control->ras_num_recs * RAS_TABLE_RECORD_SIZE; > @@ -805,7 +807,7 @@ amdgpu_ras_eeprom_update_header(struct > amdgpu_ras_eeprom_control 
*control) > * now calculate gpu health percent > */ > if (amdgpu_bad_page_threshold != 0 && > - control->tbl_hdr.version == RAS_TABLE_VER_V2_1 && > + control->tbl_hdr.version >= RAS_TABLE_VER_V2_1 && > control->ras_num_bad_pages <= ras->bad_page_cnt_threshold) > control->tbl_rai.health_percent = ((ras->bad_page_cnt_threshold - > control->ras_num_bad_pages) * 100) / @@ -818,7 > +820,7 @@ amdgpu_ras_eeprom_update_header(struct > amdgpu_ras_eeprom_control *control) > csum += *pp; > > csum += __calc_hdr_byte_sum(control); > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) > csum += __calc_ras_info_byte_sum(control); > /* avoid sign extension when assigning to "checksum" */ > csum = -csum; > @@ -1035,7 +1037,7 @@ uint32_t amdgpu_ras_eeprom_max_record_count(struct > amdgpu_ras_eeprom_control *co > /* get available eeprom table version first before eeprom table init */ > amdgpu_ras_set_eeprom_table_version(control); > > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) > return RAS_MAX_RECORD_COUNT_V2_1; > else > return RAS_MAX_RECORD_COUNT; @@ -1280,7 +1282,7 @@ static > int __verify_ras_table_checksum(struct amdgpu_ras_eeprom_control *control > int buf_size, res; > u8 csum, *buf, *pp; > > - if (control->tbl_hdr.version == RAS_TABLE_VER_V2_1) > + if (control->tbl_hdr.version >= RAS_TABLE_VER_V2_1) > buf_size = RAS_TABLE_HEADER_SIZE + > RAS_TABLE_V2_1_INFO_SIZE + > control->ras_num_recs * RAS_TABLE_RECORD_SIZE; @@ - > 1383,7 +1385,7 @@ int amdgpu_ras_eeprom_init(struct > amdgpu_ras_eeprom_control *control) > > __decode_table_header_from_buf(hdr, buf); > > - if (hdr->version == RAS_TABLE_VER_V2_1) { > + if (hdr->version >= RAS_TABLE_VER_V2_1) { > control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr); > control->ras_record_offset = RAS_RECORD_START_V2_1; > control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1; > @@ -1423,7 +1425,7 @@ int 
amdgpu_ras_eeprom_check(struct > amdgpu_ras_eeprom_control *control) > DRM_DEBUG_DRIVER("Found existing EEPROM table with %d > records", > control->ras_num_bad_pages); > > - if (hdr->version == RAS_TABLE_VER_V2_1) { > + if (hdr->version >= RAS_TABLE_VER_V2_1) { > res = __read_table_ras_info(control); > if (res) > return res; @@ -1443,7 +1445,7 @@ int > amdgpu_ras_eeprom_check(struct amdgpu_ras_eeprom_control *control) > ras->bad_page_cnt_threshold); > } else if (hdr->header == RAS_TABLE_HDR_BAD && > amdgpu_bad_page_threshold != 0) { > - if (hdr->version == RAS_TABLE_VER_V2_1) { > + if (hdr->version >= RAS_TABLE_VER_V2_1) { > res = __read_table_ras_info(control); > if (res) > return res; diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > index 13f7eda9a696..ec6d7ea37ad0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h > @@ -28,6 +28,7 @@ > > #define RAS_TABLE_VER_V1 0x00010000 > #define RAS_TABLE_VER_V2_1 0x00021000 > +#define RAS_TABLE_VER_V3 0x00030000 > > struct amdgpu_device; > > -- > 2.34.1 > > > > ------------------------------ > > Message: 2 > Date: Tue, 11 Mar 2025 19:13:15 +0200 > From: Raag Jadav <raag.jadav@xxxxxxxxx> > To: Alex Deucher <alexdeucher@xxxxxxxxx> > Cc: Andr? 
Almeida <andrealmeid@xxxxxxxxxx>, > dri-devel@xxxxxxxxxxxxxxxxxxxxx, linux-kernel@xxxxxxxxxxxxxxx, > kernel-dev@xxxxxxxxxx, amd-gfx@xxxxxxxxxxxxxxxxxxxxx, > intel-xe@xxxxxxxxxxxxxxxxxxxxx, intel-gfx@xxxxxxxxxxxxxxxxxxxxx, Alex > Deucher <alexander.deucher@xxxxxxx>, Christian K?nig > <christian.koenig@xxxxxxx>, siqueira@xxxxxxxxxx, airlied@xxxxxxxxx, > simona@xxxxxxxx, rodrigo.vivi@xxxxxxxxx, jani.nikula@xxxxxxxxxxxxxxx, > Xaver Hugl <xaver.hugl@xxxxxxx> > Subject: Re: [PATCH 2/2] drm/amdgpu: Make use of drm_wedge_app_info > Message-ID: <Z9BvK55_Nim54eOu@xxxxxxxxxxxxxxxxxx> > Content-Type: text/plain; charset=utf-8 > > On Mon, Mar 10, 2025 at 06:03:27PM -0400, Alex Deucher wrote: > > On Mon, Mar 10, 2025 at 5:54?PM Andr? Almeida <andrealmeid@xxxxxxxxxx> > wrote: > > > > > > Em 01/03/2025 03:04, Raag Jadav escreveu: > > > > On Fri, Feb 28, 2025 at 06:49:43PM -0300, Andr? Almeida wrote: > > > >> Hi Raag, > > > >> > > > >> On 2/28/25 11:58, Raag Jadav wrote: > > > >>> On Fri, Feb 28, 2025 at 09:13:53AM -0300, Andr? Almeida wrote: > > > >>>> To notify userspace about which app (if any) made the device > > > >>>> get in a wedge state, make use of drm_wedge_app_info parameter, > > > >>>> filling it with the app PID and name. > > > >>>> > > > >>>> Signed-off-by: Andr? 
Almeida <andrealmeid@xxxxxxxxxx> > > > >>>> --- > > > >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 > +++++++++++++++++-- > > > >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 +++++- > > > >>>> 2 files changed, 22 insertions(+), 3 deletions(-) > > > >>>> > > > >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > >>>> index 00b9b87dafd8..e06adf6f34fd 100644 > > > >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > >>>> @@ -6123,8 +6123,23 @@ int amdgpu_device_gpu_recover(struct > amdgpu_device *adev, > > > >>>> atomic_set(&adev->reset_domain->reset_res, r); > > > >>>> - if (!r) > > > >>>> - drm_dev_wedged_event(adev_to_drm(adev), > DRM_WEDGE_RECOVERY_NONE, NULL); > > > >>>> + if (!r) { > > > >>>> + struct drm_wedge_app_info aux, *info = NULL; > > > >>>> + > > > >>>> + if (job) { > > > >>>> + struct amdgpu_task_info *ti; > > > >>>> + > > > >>>> + ti = amdgpu_vm_get_task_info_pasid(adev, job->pasid); > > > >>>> + if (ti) { > > > >>>> + aux.pid = ti->pid; > > > >>>> + aux.comm = ti->process_name; > > > >>>> + info = &aux; > > > >>>> + amdgpu_vm_put_task_info(ti); > > > >>>> + } > > > >>>> + } > > > >>> Is this guaranteed to be guilty app and not some scheduled worker? > > > >> > > > >> This is how amdgpu decides which app is the guilty one earlier in > > > >> the code as in the print: > > > >> > > > >> ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid); > > > >> > > > >> "Process information: process %s pid %d thread %s pid %d\n" > > > >> > > > >> So I think it's consistent with what the driver thinks it's the > > > >> guilty process. > > > > > > > > Sure, but with something like app_info we're kind of hinting to > > > > userspace that an application was _indeed_ involved with reset. Is that also > guaranteed? 
> > > > > > > > Is it possible that an application needlessly suffers from a false > > > > positive scenario (reset due to other factors)? > > > > > > > > > > I asked Alex Deucher in IRC about that and yes, there's a chance > > > that this is a false positive. However, for the majority of cases > > > this is the right app that caused the hang. This is what amdgpu is > > > doing for GL robustness as well and devcoredump, so it's very > > > consistent with how amdgpu deals with this scenario even if the mechanism is > still not perfect. > > > > It's usually the guilty one, but it's not guaranteed. For example, > > say you have a ROCm user queue and a gfx job submitted to a kernel > > queue. The actual guilty job may be the ROCm user queue, but the > > driver may not detect that the ROCm queue was hung until some other > > event (e.g., memory pressure). However, the timer for the gfx job may > > timeout before that happens on the ROCm queue so in that case the gfx > > job would be incorrectly considered guilty. > > So it boils down to what are the chances of that happening and whether it's > significant enough to open the door for API abuse. > > Considering this is amd specific accuracy, it's still an open question how other > drivers are/will be managing it. > > Raag > > > ------------------------------ > > Message: 3 > Date: Tue, 11 Mar 2025 22:38:33 +0100 > From: Tomasz Paku?a <tomasz.pakula.oficjalny@xxxxxxxxx> > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: alexander.deucher@xxxxxxx > Subject: [PATCH] drm/amdgpu/pm: Handle SCLK offset correctly in > overdrive for smu 14.0.2 > Message-ID: > <20250311213833.870840-1-tomasz.pakula.oficjalny@xxxxxxxxx> > Content-Type: text/plain; charset=UTF-8 > > Currently, it seems like the code was carried over from RDNA3 because it assumes > two possible values to set. 
RDNA4, instead of having: > 0: min SCLK > 1: max SCLK > only has > 0: SCLK offset > > This change makes it so it only reports current offset value instead of showing > possible min/max values and their indices. Moreover, it now only accepts the offset > as a value, without the indice index. > > Additionally, the lower bound was printed as %u by mistake. > > Old: > OD_SCLK_OFFSET: > 0: -500Mhz > 1: 1000Mhz > OD_MCLK: > 0: 97Mhz > 1: 1259MHz > OD_VDDGFX_OFFSET: > 0mV > OD_RANGE: > SCLK_OFFSET: -500Mhz 1000Mhz > MCLK: 97Mhz 1500Mhz > VDDGFX_OFFSET: -200mv 0mv > > New: > OD_SCLK_OFFSET: > 0Mhz > OD_MCLK: > 0: 97Mhz > 1: 1259MHz > OD_VDDGFX_OFFSET: > 0mV > OD_RANGE: > SCLK_OFFSET: -500Mhz 1000Mhz > MCLK: 97Mhz 1500Mhz > VDDGFX_OFFSET: -200mv 0mv > > Setting this offset: > Old: "s 1 <offset>" > New: "s <offset>" > > Signed-off-by: Tomasz Paku?a <tomasz.pakula.oficjalny@xxxxxxxxx> > --- > .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 59 ++++++------------- > 1 file changed, 18 insertions(+), 41 deletions(-) > > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c > b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c > index 5cad09c5f2ff..62bd9647541a 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c > @@ -1193,16 +1193,9 @@ static int smu_v14_0_2_print_clk_levels(struct > smu_context *smu, > PP_OD_FEATURE_GFXCLK_BIT)) > break; > > - PPTable_t *pptable = smu->smu_table.driver_pptable; > - const OverDriveLimits_t * const overdrive_upperlimits = > - &pptable->SkuTable.OverDriveLimitsBasicMax; > - const OverDriveLimits_t * const overdrive_lowerlimits = > - &pptable->SkuTable.OverDriveLimitsBasicMin; > - > size += sysfs_emit_at(buf, size, "OD_SCLK_OFFSET:\n"); > - size += sysfs_emit_at(buf, size, "0: %dMhz\n1: %uMhz\n", > - overdrive_lowerlimits->GfxclkFoffset, > - overdrive_upperlimits->GfxclkFoffset); > + size += sysfs_emit_at(buf, size, "%dMhz\n", > + > + 
od_table->OverDriveTable.GfxclkFoffset); > break; > > case SMU_OD_MCLK: > @@ -1336,13 +1329,9 @@ static int smu_v14_0_2_print_clk_levels(struct > smu_context *smu, > size += sysfs_emit_at(buf, size, "%s:\n", "OD_RANGE"); > > if (smu_v14_0_2_is_od_feature_supported(smu, > PP_OD_FEATURE_GFXCLK_BIT)) { > - smu_v14_0_2_get_od_setting_limits(smu, > - PP_OD_FEATURE_GFXCLK_FMIN, > - &min_value, > - NULL); > smu_v14_0_2_get_od_setting_limits(smu, > PP_OD_FEATURE_GFXCLK_FMAX, > - NULL, > + &min_value, > &max_value); > size += sysfs_emit_at(buf, size, > "SCLK_OFFSET: %7dMhz %10uMhz\n", > min_value, max_value); @@ -2417,36 +2406,24 @@ > static int smu_v14_0_2_od_edit_dpm_table(struct smu_context *smu, > return -ENOTSUPP; > } > > - for (i = 0; i < size; i += 2) { > - if (i + 2 > size) { > - dev_info(adev->dev, "invalid number of input parameters %d\n", > size); > - return -EINVAL; > - } > - > - switch (input[i]) { > - case 1: > - smu_v14_0_2_get_od_setting_limits(smu, > - PP_OD_FEATURE_GFXCLK_FMAX, > - &minimum, > - &maximum); > - if (input[i + 1] < minimum || > - input[i + 1] > maximum) { > - dev_info(adev->dev, "GfxclkFmax (%ld) must be within > [%u, %u]!\n", > - input[i + 1], minimum, maximum); > - return -EINVAL; > - } > - > - od_table->OverDriveTable.GfxclkFoffset = input[i + 1]; > - od_table->OverDriveTable.FeatureCtrlMask |= 1U << > PP_OD_FEATURE_GFXCLK_BIT; > - break; > + if (size != 1) { > + dev_info(adev->dev, "invalid number of input parameters %d\n", > size); > + return -EINVAL; > + } > > - default: > - dev_info(adev->dev, "Invalid SCLK_VDDC_TABLE > index: %ld\n", input[i]); > - dev_info(adev->dev, "Supported indices: [0:min,1:max]\n"); > - return -EINVAL; > - } > + smu_v14_0_2_get_od_setting_limits(smu, > + PP_OD_FEATURE_GFXCLK_FMAX, > + &minimum, > + &maximum); > + if (input[0] < minimum || > + input[0] > maximum) { > + dev_info(adev->dev, "GfxclkFoffset must be within [%d, %u]!\n", > + minimum, maximum); > + return -EINVAL; > } > > + 
od_table->OverDriveTable.GfxclkFoffset = input[0]; > + od_table->OverDriveTable.FeatureCtrlMask |= 1U << > + PP_OD_FEATURE_GFXCLK_BIT; > break; > > case PP_OD_EDIT_MCLK_VDDC_TABLE: > -- > 2.48.1 > > > > ------------------------------ > > Subject: Digest Footer > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > > ------------------------------ > > End of amd-gfx Digest, Vol 106, Issue 157 > *****************************************