[AMD Official Use Only - General] > -----Original Message----- > From: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Sent: Tuesday, February 21, 2023 4:29 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhang, Hawking > <Hawking.Zhang@xxxxxxx>; Yang, Stanley <Stanley.Yang@xxxxxxx>; Chai, > Thomas <YiPeng.Chai@xxxxxxx>; Li, Candice <Candice.Li@xxxxxxx> > Cc: Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Subject: [PATCH 2/2] drm/amdgpu: add bad_page_threshold check in > ras_eeprom_check_err > > bad_page_threshold controls page retirement behavior and it should be also > checked. > > Signed-off-by: Tao Zhou <tao.zhou1@xxxxxxx> > --- > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 20 ++++++++++++++- > ---- > 1 file changed, 15 insertions(+), 5 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > index 9d370465b08d..c88123896fe8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > @@ -417,7 +417,8 @@ bool > amdgpu_ras_eeprom_check_err_threshold(struct amdgpu_device *adev) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > > - if (!__is_ras_eeprom_supported(adev)) > + if (!__is_ras_eeprom_supported(adev) || > + !amdgpu_bad_page_threshold) > return false; > > /* skip check eeprom table for VEGA20 Gaming */ @@ -428,10 > +429,19 @@ bool amdgpu_ras_eeprom_check_err_threshold(struct > amdgpu_device *adev) > return false; > > if (con->eeprom_control.tbl_hdr.header == RAS_TABLE_HDR_BAD) { > - dev_warn(adev->dev, "This GPU is in BAD status."); > - dev_warn(adev->dev, "Please retire it or set a larger " > - "threshold value when reloading driver.\n"); > - return true; > + if (amdgpu_bad_page_threshold == -1) { > + dev_warn(adev->dev, "RAS records:%d exceed > threshold:%d", > + con->eeprom_control.ras_num_recs, con- > >bad_page_cnt_threshold); > + dev_warn(adev->dev, > + "But GPU can be operated due to > bad_page_threshold = -1.\n"); > + return false; > + } else if 
(amdgpu_bad_page_threshold > 0 || > + amdgpu_bad_page_threshold == -2) { Stanley: We can't guarantee that the user sets amdgpu_bad_page_threshold to an expected value (for example, the user could pass -3). How about changing this condition to the following: else if (amdgpu_bad_page_threshold) { ... } With that change, the value -2 in patch #1 isn't needed anymore. Regards, Stanley > + dev_warn(adev->dev, "This GPU is in BAD status."); > + dev_warn(adev->dev, "Please retire it or set a larger > " > + "threshold value when reloading driver.\n"); > + return true; > + } > } > > return false; > -- > 2.35.1