[PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of ECC/EDC

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be enabled
or disabled.  By default, all features are disabled.

EDC is Error Detection and Correction.  It can detect ECC errors and do zero
or more of the following: count SEC (single error corrected) and DED (double
error detected, i.e. uncorrected ECC error) events, halt the affected block,
or interrupt the CPU.  Currently, only counting errors is supported.

Signed-off-by: David Panariti <David.Panariti at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 34 +++++++++++++++++++++++++++-----
drivers/gpu/drm/amd/include/amd_shared.h | 14 +++++++++++++
4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4a16e3c..0322392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -111,6 +111,7 @@ extern int amdgpu_prim_buf_per_se;
extern int amdgpu_pos_buf_per_se;
extern int amdgpu_cntl_sb_buf_per_se;
extern int amdgpu_param_buf_per_se;
+extern unsigned amdgpu_ecc_flags;
 #define AMDGPU_DEFAULT_GTT_SIZE_MB                      3072ULL /* 3GB by default */
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS                   3000
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ead00d7..00e16ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -110,6 +110,7 @@ int amdgpu_prim_buf_per_se = 0;
int amdgpu_pos_buf_per_se = 0;
int amdgpu_cntl_sb_buf_per_se = 0;
int amdgpu_param_buf_per_se = 0;
+unsigned amdgpu_ecc_flags = 0;
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -235,6 +236,9 @@ module_param_named(cntl_sb_buf_per_se, amdgpu_cntl_sb_buf_per_se, int, 0444);
MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Pramater Cache per Shader Engine (default depending on gfx)");
module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int, 0444);
+MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable ECC/EDC (default))");
+module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
+
 static const struct pci_device_id pciidlist[] = {
#ifdef  CONFIG_DRM_AMDGPU_SI
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2f5bf5f..05cab7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1708,7 +1708,7 @@ static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
                                                                               count = RREG32(cp->rnmap_addr);
                                                                               if (count != 0) {
                                                                                               /*
-                                                                                              * Workaround failed.
+                                                                                             * EDC workaround failed.
                                                                                                * If people are interested
                                                                                                * in EDC at all, they will
                                                                                                * want to know which
@@ -1747,14 +1747,24 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               return 0;
               }
-        DRM_INFO("Detected Carrizo.\n");
+             DRM_INFO("Detected Carrizo.\n");
                tmp = RREG32(mmCC_GC_EDC_CONFIG);
               dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
               if (dis_bit) {
-                              /* On Carrizo, EDC may be disabled by a fuse. */
-                              DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
-                                              tmp);
+                             /* On Carrizo, EDC may be disabled permanently by a fuse. */
+                             DRM_INFO("Carrizo EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
+                             return 0;
+             }
+
+             /*
+             * Check if EDC has been requested by a kernel parameter.
+             * For Carrizo, EDC is the best/safest mode WRT error handling.
+             */
+             if (!(amdgpu_ecc_flags
+                   & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
+                             DRM_INFO("EDC support has not been requested.\n");
                               return 0;
               }
@@ -1892,6 +1902,20 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
+             /* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
+             /* Do not propagate the errors to the next block. */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
+             WREG32(mmGB_EDC_MODE, tmp);
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+
+             /*
+             * Clear EDC_DISABLE bit so the counters are available.
+             */
+             tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
+             WREG32(mmCC_GC_EDC_CONFIG, tmp);
+
               gfx_v8_0_edc_clear_counters(adev);
 fail:
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
index 2ccf44e..c4fd013 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -179,6 +179,20 @@ struct amd_pp_profile {
#define AMD_PG_SUPPORT_GFX_QUICK_MG                                (1 << 11)
#define AMD_PG_SUPPORT_GFX_PIPELINE                     (1 << 12)
+/*
+ * ECC flags
+ * Allows the user to choose what kind of error detection/correction is used.
+ * Currently, EDC is supported on Carrizo.
+ *
+ * The AMD_ECC_SUPPORT_BEST bit lets a user ask the driver to select what
+ * it considers the best/safest mode.  This may not be the same as the
+ * default, depending on the GPU and the application.
+ * Using a single bit makes it easy to request the best support without
+ * needing to know all currently supported modes.
+ */
+#define AMD_ECC_SUPPORT_BEST                                     (1 << 0)
+#define AMD_ECC_SUPPORT_EDC                                       (1 << 1)
+
enum amd_pm_state_type {
               /* not used for dpm */
               POWER_STATE_TYPE_DEFAULT,
--
2.7.4







_______________________________________________

amd-gfx mailing list

amd-gfx at lists.freedesktop.org

https://lists.freedesktop.org/mailman/listinfo/amd-gfx


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20170501/b6c1018e/attachment-0001.html>


[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux