> -----Original Message----- > From: amd-gfx [mailto:amd-gfx-bounces at lists.freedesktop.org] On Behalf > Of David Panariti > Sent: Tuesday, June 06, 2017 4:33 PM > To: amd-gfx at lists.freedesktop.org > Cc: Panariti, David > Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of > ECC/EDC. > > Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be > enabled > or disabled. By default, all features are disabled. > > EDC is Error Detection and Correction. It can detect ECC errors and do 0 or > more of: count SEC (single error corrected) and DED (double error detected, > i.e. uncorrected ECC error), halt the affected block, interrupt the CPU. > Currently, only counting errors is supported. > > Fixed a whitespace error. > > Signed-off-by: David Panariti <David.Panariti at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 4 ++++ > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 28 > ++++++++++++++++++++++++++-- > drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++ > 4 files changed, 45 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index a6f51eb..3e930ee 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se; > extern int amdgpu_param_buf_per_se; > extern int amdgpu_job_hang_limit; > extern int amdgpu_lbpw; > +extern unsigned amdgpu_ecc_flags; > > #define AMDGPU_DEFAULT_GTT_SIZE_MB 3072ULL /* 3GB by > default */ > #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS 3000 > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > index c4825ff..972660d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > @@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0; > int amdgpu_param_buf_per_se = 0; > int amdgpu_job_hang_limit = 0; > int amdgpu_lbpw = -1; > +unsigned amdgpu_ecc_flags = 0; I'd suggest setting amdgpu_ecc_flags to AMD_ECC_SUPPORT_BEST by default. That can be our asic specific default setting. In the case of CZ, that will be disabled until we decide to enable EDC by default. > > MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in > megabytes"); > module_param_named(vramlimit, amdgpu_vram_limit, int, 0600); > @@ -795,6 +796,9 @@ static struct pci_driver amdgpu_kms_pci_driver = { > .driver.pm = &amdgpu_pm_ops, > }; > > +MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable > ECC/EDC (default))"); > +module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444); > + > > > static int __init amdgpu_init(void) > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index 46e766e..3b5685c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -1824,7 +1824,17 @@ static int > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) > if (dis_bit) { > /* On Carrizo, EDC may be permanently disabled by a fuse. */ > DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG: > 0x%08x.\n", > - tmp); > + tmp); > + return 0; > + } > + > + /* > + * Check if EDC has been requested by a kernel parameter. > + * For Carrizo, EDC is the best/safest mode WRT error handling. > + */ > + if (!(amdgpu_ecc_flags > + & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) { > + DRM_INFO("EDC support has not been requested.\n"); Can drop this debugging statement or make it DRM_DEBUG. > return 0; > } > > @@ -1962,6 +1972,20 @@ static int > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) > goto fail; > } > > + /* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt > */ > + tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0); > + /* Do not propagate the errors to the next block. */ > + tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0); > + WREG32(mmGB_EDC_MODE, tmp); > + > + tmp = RREG32(mmCC_GC_EDC_CONFIG); > + > + /* > + * Clear EDC_DISABLE bit so the counters are available. > + */ > + tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0); > + WREG32(mmCC_GC_EDC_CONFIG, tmp); > + > gfx_v8_0_edc_clear_counters(adev); > > fail: > @@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void *handle) > gfx_v8_0_cp_compute_enable(adev, false); > } > > - return 0; > + return 0; > } > > static int gfx_v8_0_soft_reset(void *handle) > diff --git a/drivers/gpu/drm/amd/include/amd_shared.h > b/drivers/gpu/drm/amd/include/amd_shared.h > index 0f58e95..1cf30aa 100644 > --- a/drivers/gpu/drm/amd/include/amd_shared.h > +++ b/drivers/gpu/drm/amd/include/amd_shared.h > @@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode { > #define AMD_PG_SUPPORT_GFX_QUICK_MG (1 << 11) > #define AMD_PG_SUPPORT_GFX_PIPELINE (1 << 12) > > +/* > + * ECC flags > + * Allows the user to choose what kind of error detection/correction is > used. > + * Currently, EDC is supported on Carrizo. > + * > + * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the > driver > + * set what it thinks is best/safest mode. This may not be the same as the > + * default, depending on the GPU and the application. > + * Using a single bit makes it easy to request the best support without > + * needing to know all currently supported modes. > + */ > +#define AMD_ECC_SUPPORT_BEST (1 << 0) > +#define AMD_ECC_SUPPORT_EDC (1 << 1) > + > enum amd_pm_state_type { > /* not used for dpm */ > POWER_STATE_TYPE_DEFAULT, > -- > 2.7.4 > > _______________________________________________ > amd-gfx mailing list > amd-gfx at lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx