[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Sent: Wednesday, January 22, 2025 6:30 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx; Zhou1, Tao <Tao.Zhou1@xxxxxxx> > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx> > Subject: [PATCH] drm/amdgpu: Update usage for bad page threshold > > The driver's behavior varies based on > the configuration of amdgpu_bad_page_threshold setting > > Signed-off-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 54 ++++++++++--------- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 +- > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 25 +++++---- > 4 files changed, 44 insertions(+), 39 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index 99d884e6763a..87ea2e2a062f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -983,7 +983,7 @@ module_param_named(reset_method, > amdgpu_reset_method, int, 0644); > * result in the GPU entering bad status when the number of total > * faulty pages by ECC exceeds the threshold value. 
> */ > -MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore > threshold (default value), 0 = disable bad page retirement, -2 = driver sets > threshold)"); > +MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = ignore > +threshold (default value), 0 = disable bad page retirement, -2 = > +threshold determined by a formula, 0 < threshold < max records, > +user-defined threshold)"); > module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, > 0444); > > MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to > setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)"); diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 960476e6124b..5676ffe5c43a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -3071,35 +3071,35 @@ static void amdgpu_ras_validate_threshold(struct > amdgpu_device *adev, > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > > /* > - * Justification of value bad_page_cnt_threshold in ras structure > + * amdgpu_bad_page_threshold is used to config > + * the threshold for the number of bad pages. > + * -1: Threshold is set to default value > + * Driver will issue a warning message when threshold is reached > + * and continue runtime services. > + * 0: Disable bad page retirement > + * Driver will not retire bad pages > + * which is intended for debugging purpose. > + * -2: Threshold is determined by a formula > + * that assumes 1 bad page per 100M of local memory. > + * Driver will continue runtime services when threshold is reached. > + * 0 < threshold < max number of bad page records in EEPROM, > + * A user-defined threshold is set > + * Driver will halt runtime services when this custom threshold is > reached. 
> * > - * Generally, 0 <= amdgpu_bad_page_threshold <= max record length > - * in eeprom or amdgpu_bad_page_threshold == -2, introduce two > - * scenarios accordingly. > - * > - * Bad page retirement enablement: > - * - If amdgpu_bad_page_threshold = -2, > - * bad_page_cnt_threshold = typical value by formula. > - * > - * - When the value from user is 0 < amdgpu_bad_page_threshold < > - * max record length in eeprom, use it directly. > - * > - * Bad page retirement disablement: > - * - If amdgpu_bad_page_threshold = 0, bad page retirement > - * functionality is disabled, and bad_page_cnt_threshold will > - * take no effect. > */ > + if (amdgpu_bad_page_threshold == -2) { > + u64 val = adev->gmc.mc_vram_size; > > - if (amdgpu_bad_page_threshold < 0) { > - u64 val = adev->gmc.mc_vram_size; > + do_div(val, RAS_BAD_PAGE_COVER); > + con->bad_page_cnt_threshold = min(lower_32_bits(val), > + max_count); > + } else if (amdgpu_bad_page_threshold == -1) { > + con->bad_page_cnt_threshold = ((con->reserved_pages_in_bytes) >> > 21) << 4; > + } else { > + con->bad_page_cnt_threshold = min_t(int, max_count, > + amdgpu_bad_page_threshold); > + } > > - do_div(val, RAS_BAD_PAGE_COVER); > - con->bad_page_cnt_threshold = min(lower_32_bits(val), > - max_count); > - } else { > - con->bad_page_cnt_threshold = min_t(int, max_count, > - amdgpu_bad_page_threshold); > - } > } > > #ifdef HAVE_KFIFO_PUT_NON_POINTER > @@ -3852,8 +3852,10 @@ static void amdgpu_ras_init_reserved_vram_size(struct > amdgpu_device *adev) > case IP_VERSION(13, 0, 2): > case IP_VERSION(13, 0, 6): > case IP_VERSION(13, 0, 12): > + con->reserved_pages_in_bytes = > AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT; > + break; > case IP_VERSION(13, 0, 14): > - con->reserved_pages_in_bytes = > AMDGPU_RAS_RESERVED_VRAM_SIZE; > + con->reserved_pages_in_bytes = > (AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT > +<< 1); > break; > default: > break; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
> index 82db986c36a0..cc4586581dba 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -65,7 +65,7 @@ struct amdgpu_iv_entry; > > /* Reserve 8 physical dram row for possible retirement. > * In worst cases, it will lose 8 * 2MB memory in vram domain */ > -#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20) > +#define AMDGPU_RAS_RESERVED_VRAM_SIZE_DEFAULT (16ULL << 20) > /* The high three bits indicates socketid */ #define > AMDGPU_RAS_GET_FEATURES(val) ((val) & > ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > index 0d824f016916..bd9ed86d8fde 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c > @@ -1428,8 +1428,9 @@ int amdgpu_ras_eeprom_check(struct > amdgpu_ras_eeprom_control *control) > > res = __verify_ras_table_checksum(control); > if (res) > - DRM_ERROR("RAS table incorrect checksum or error:%d\n", > - res); > + dev_err(adev->dev, > + "RAS table incorrect checksum or error:%d\n", > + res); > > /* Warn if we are at 90% of the threshold or above > */ > @@ -1447,8 +1448,9 @@ int amdgpu_ras_eeprom_check(struct > amdgpu_ras_eeprom_control *control) > > res = __verify_ras_table_checksum(control); > if (res) { > - dev_err(adev->dev, "RAS Table incorrect checksum or > error:%d\n", > - res); > + dev_err(adev->dev, > + "RAS Table incorrect checksum or error:%d\n", > + res); > return -EINVAL; > } > if (ras->bad_page_cnt_threshold > control->ras_num_bad_pages) > { @@ -1466,17 +1468,18 @@ int amdgpu_ras_eeprom_check(struct > amdgpu_ras_eeprom_control *control) > res = amdgpu_ras_eeprom_correct_header_tag(control, > > RAS_TABLE_HDR_VAL); > } else { > - dev_err(adev->dev, "RAS records:%d exceed threshold:%d", > + dev_warn(adev->dev, > + "RAS records:%d exceed threshold:%d\n", > control->ras_num_bad_pages, ras- > >bad_page_cnt_threshold); > - if 
(amdgpu_bad_page_threshold == -1) { > - dev_warn(adev->dev, "GPU will be initialized due to > bad_page_threshold = -1."); > + if ((amdgpu_bad_page_threshold == -1) || > + (amdgpu_bad_page_threshold == -2)) { [Tao] As discussed, besides the modprobe path, ras->is_rma shouldn't be set at runtime either if the threshold is -1. > res = 0; > + dev_warn(adev->dev, > + "Please consult AMD Service Action Guide > (SAG) for appropriate > +service procedures\n"); > } else { > ras->is_rma = true; > - dev_err(adev->dev, > - "RAS records:%d exceed threshold:%d, " > - "GPU will not be initialized. Replace this GPU > or increase the threshold", > - control->ras_num_bad_pages, ras- > >bad_page_cnt_threshold); > + dev_warn(adev->dev, > + "User defined threshold is set, runtime service > will be halt when > +threshold is reached\n"); > } > } > } else { > -- > 2.17.1