CZ EDC param and support

David.Panariti@xxxxxxx (Panariti, David) · Fri, 28 Apr 2017 14:18:11 +0000

Actually, the attachment was an oversight.
It's easier for me to attach, open the attachment and then delete the attachment.
I got only 2/3 this time.
I've gotten a comment that inline patches are preferred.

Sorry for the inconvenience.

davep

From: Koenig, Christian
Sent: Friday, April 28, 2017 4:06 AM
To: Panariti, David <David.Panariti at amd.com>; gpudriverdevsupport <gpudriverdevsupport at amd.com>; amd-gfx at lists.freedesktop.org
Subject: Re: CZ EDC param and support

You somehow messed up the attachment.

Instead of individual files everything is squashed together as all-edc.patch.

Please fix that otherwise proper review won't be possible.

Christian.

Am 28.04.2017 um 00:13 schrieb Panariti, David:
The changes in the workarounds function use DRM_INFO rather than DRM_DEBUG because CZs with EDC are often used in embedded environments and any info can be useful especially in the case of an intermittent problem.

>From e1ce383592c275b58ad95bd80b5479af8c1f9dae Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti@xxxxxxx><mailto:David.Panariti at amd.com>
Date: Fri, 14 Apr 2017 13:41:52 -0400
Subject: [PATCH 1/3] drm/amdgpu: Moved gfx_v8_0_select_se_sh() in lieu of
re-redundant prototype.

Will be needed for the rest of the EDC workarounds patch.

Change-Id: Ie586ab38a69e98a91c6cb5747e285ce8bfdd1c86
Signed-off-by: David Panariti <David.Panariti at amd.com><mailto:David.Panariti at amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 46 +++++++++++++++++------------------
1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2ff5f19..27b57cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1500,6 +1500,29 @@ static int gfx_v8_0_kiq_init(struct amdgpu_device *adev)
               return 0;
}
+static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
+                                                               u32 se_num, u32 sh_num, u32 instance)
+{
+             u32 data;
+
+             if (instance == 0xffffffff)
+                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
+
+             if (se_num == 0xffffffff)
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
+
+             if (sh_num == 0xffffffff)
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
+             else
+                             data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
+
+             WREG32(mmGRBM_GFX_INDEX, data);
+}
+
static const u32 vgpr_init_compute_shader[] =
{
               0x7e000209, 0x7e020208,
@@ -3556,29 +3579,6 @@ static void gfx_v8_0_tiling_mode_table_init(struct amdgpu_device *adev)
               }
}
-static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
-                                                                u32 se_num, u32 sh_num, u32 instance)
-{
-              u32 data;
-
-              if (instance == 0xffffffff)
-                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(0, GRBM_GFX_INDEX, INSTANCE_INDEX, instance);
-
-              if (se_num == 0xffffffff)
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SE_INDEX, se_num);
-
-              if (sh_num == 0xffffffff)
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_BROADCAST_WRITES, 1);
-              else
-                              data = REG_SET_FIELD(data, GRBM_GFX_INDEX, SH_INDEX, sh_num);
-
-              WREG32(mmGRBM_GFX_INDEX, data);
-}
-
static u32 gfx_v8_0_create_bitmask(u32 bit_width)
{
               return (u32)((1ULL << bit_width) - 1);
--
2.7.4


>From 38fac8cab73dbc07e0ee7599b52106bc09dd32ea Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti@xxxxxxx><mailto:David.Panariti at amd.com>
Date: Mon, 24 Apr 2017 11:05:45 -0400
Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and
Correction) workarounds.

The workarounds are unconditionally performed on CZs with EDC enabled.
EDC detects uncorrected ECC errors and uses data poisoning to prevent
corrupted compute results from being used (read).
EDC enabled CZs are often used in embedded environments.

Change-Id: I84c261785329beeb797f11efbe0ec35790f2996c
Signed-off-by: David Panariti <David.Panariti at amd.com><mailto:David.Panariti at amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 148 ++++++++++++++++++++++++----------
1 file changed, 106 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 27b57cb..2f5bf5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1645,35 +1645,92 @@ static const u32 sgpr2_init_regs[] =
               mmCOMPUTE_USER_DATA_9, 0xedcedc09,
};
-static const u32 sec_ded_counter_registers[] =
-{
-              mmCPC_EDC_ATC_CNT,
-              mmCPC_EDC_SCRATCH_CNT,
-              mmCPC_EDC_UCODE_CNT,
-              mmCPF_EDC_ATC_CNT,
-              mmCPF_EDC_ROQ_CNT,
-              mmCPF_EDC_TAG_CNT,
-              mmCPG_EDC_ATC_CNT,
-              mmCPG_EDC_DMA_CNT,
-              mmCPG_EDC_TAG_CNT,
-              mmDC_EDC_CSINVOC_CNT,
-              mmDC_EDC_RESTORE_CNT,
-              mmDC_EDC_STATE_CNT,
-              mmGDS_EDC_CNT,
-              mmGDS_EDC_GRBM_CNT,
-              mmGDS_EDC_OA_DED,
-              mmSPI_EDC_CNT,
-              mmSQC_ATC_EDC_GATCL1_CNT,
-              mmSQC_EDC_CNT,
-              mmSQ_EDC_DED_CNT,
-              mmSQ_EDC_INFO,
-              mmSQ_EDC_SEC_CNT,
-              mmTCC_EDC_CNT,
-              mmTCP_ATC_EDC_GATCL1_CNT,
-              mmTCP_EDC_CNT,
-              mmTD_EDC_CNT
+struct reg32_counter_name_map {
+             uint32_t rnmap_addr;   /* Counter register address */
+             const char *rnmap_name;           /* Name of the counter */
+             size_t rnmap_num_instances;  /* Number of block instances */
};
+#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {             \
+             .rnmap_addr = mm##reg,                                                                           \
+             .rnmap_name = #reg,                                                                    \
+             .rnmap_num_instances = num_instances                                            \
+}
+
+/* See GRBM_GFX_INDEX, et. al. registers. */
+static const struct reg32_counter_name_map sec_ded_counter_registers[] = {
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
+};
+
+static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
+{
+             int ci, se, sh, i;
+             uint32_t count;
+             int r = 0;
+
+             mutex_lock(&adev->grbm_idx_mutex);
+
+             for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
+                             const struct reg32_counter_name_map *cp =
+                                             sec_ded_counter_registers + ci;
+                             const char *name = cp->rnmap_name;
+
+                             for (se = 0; se < adev->gfx.config.max_shader_engines; ++se) {
+                                             for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
+                                                             for (i = 0; i < cp->rnmap_num_instances; ++i) {
+                                                                             gfx_v8_0_select_se_sh(adev, se, sh, i);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             if (count != 0) {
+                                                                                             /*
+                                                                                             * Workaround failed.
+                                                                                             * If people are interested
+                                                                                             * in EDC at all, they will
+                                                                                             * want to know which
+                                                                                             * counters had problems.
+                                                                                             */
+                                                                                             DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
+                                                                                                             name, count);
+                                                                                             r = -EINVAL;
+                                                                                             goto ret;
+                                                                             }
+                                                             }
+                                             }
+                             }
+             }
+
+ret:
+             gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+             mutex_unlock(&adev->grbm_idx_mutex);
+
+             return r;
+}
+
static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
{
               struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
@@ -1681,18 +1738,36 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
               struct fence *f = NULL;
               int r, i;
               u32 tmp;
+             u32 dis_bit;
               unsigned total_size, vgpr_offset, sgpr_offset;
               u64 gpu_addr;
-              /* only supported on CZ */
-              if (adev->asic_type != CHIP_CARRIZO)
+             if (adev->asic_type != CHIP_CARRIZO) {
+                             /* EDC is only supported on CZ */
+                             return 0;
+             }
+
+        DRM_INFO("Detected Carrizo.\n");
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+             dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
+             if (dis_bit) {
+                             /* On Carrizo, EDC may be disabled by a fuse. */
+                             DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
                               return 0;
+             }
                /* bail if the compute ring is not ready */
               if (!ring->ready)
                               return 0;
-              tmp = RREG32(mmGB_EDC_MODE);
+             DRM_INFO("Applying EDC workarounds.\n");
+
+             /*
+             * Interested parties can enable EDC using debugfs register reads and
+             * writes.
+             */
               WREG32(mmGB_EDC_MODE, 0);
                total_size =
@@ -1817,18 +1892,7 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
-              WREG32(mmGB_EDC_MODE, tmp);
-
-              tmp = RREG32(mmCC_GC_EDC_CONFIG);
-              tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
-              WREG32(mmCC_GC_EDC_CONFIG, tmp);
-
-
-              /* read back registers to clear the counters */
-              for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
-                              RREG32(sec_ded_counter_registers[i]);
+             gfx_v8_0_edc_clear_counters(adev);
 fail:
               amdgpu_ib_free(adev, &ib, NULL);
--
2.7.4


>From ec4803205582c1011f5ced1ead70ee244268b4b8 Mon Sep 17 00:00:00 2001
From: David Panariti <David.Panariti@xxxxxxx><mailto:David.Panariti at amd.com>
Date: Wed, 26 Apr 2017 10:13:06 -0400
Subject: [PATCH 3/3] drm/amdgpu: Add kernel parameter to control use of
ECC/EDC.

Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be enabled
or disabled.  By default, all features are disabled.

EDC is Error Detection and Correction.  It can detect ECC errors and do 0 or
more of: count SEC (single error corrected) and DED (double error detected,
i.e. uncorrected ECC error), halt the affected block, interrupt the CPU.
Currently, only counting errors is supported.

Signed-off-by: David Panariti <David.Panariti at amd.com><mailto:David.Panariti at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 34 +++++++++++++++++++++++++++-----
drivers/gpu/drm/amd/include/amd_shared.h | 14 +++++++++++++
4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 4a16e3c..0322392 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -111,6 +111,7 @@ extern int amdgpu_prim_buf_per_se;
extern int amdgpu_pos_buf_per_se;
extern int amdgpu_cntl_sb_buf_per_se;
extern int amdgpu_param_buf_per_se;
+extern unsigned amdgpu_ecc_flags;
 #define AMDGPU_DEFAULT_GTT_SIZE_MB                      3072ULL /* 3GB by default */
#define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS                   3000
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index ead00d7..00e16ac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -110,6 +110,7 @@ int amdgpu_prim_buf_per_se = 0;
int amdgpu_pos_buf_per_se = 0;
int amdgpu_cntl_sb_buf_per_se = 0;
int amdgpu_param_buf_per_se = 0;
+unsigned amdgpu_ecc_flags = 0;
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -235,6 +236,9 @@ module_param_named(cntl_sb_buf_per_se, amdgpu_cntl_sb_buf_per_se, int, 0444);
MODULE_PARM_DESC(param_buf_per_se, "the size of Off-Chip Pramater Cache per Shader Engine (default depending on gfx)");
module_param_named(param_buf_per_se, amdgpu_param_buf_per_se, int, 0444);
+MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable ECC/EDC (default))");
+module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
+
 static const struct pci_device_id pciidlist[] = {
#ifdef  CONFIG_DRM_AMDGPU_SI
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 2f5bf5f..05cab7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1708,7 +1708,7 @@ static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
                                                                               count = RREG32(cp->rnmap_addr);
                                                                               if (count != 0) {
                                                                                               /*
-                                                                                              * Workaround failed.
+                                                                                             * EDC workaround failed.
                                                                                                * If people are interested
                                                                                                * in EDC at all, they will
                                                                                                * want to know which
@@ -1747,14 +1747,24 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               return 0;
               }
-        DRM_INFO("Detected Carrizo.\n");
+             DRM_INFO("Detected Carrizo.\n");
                tmp = RREG32(mmCC_GC_EDC_CONFIG);
               dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
               if (dis_bit) {
-                              /* On Carrizo, EDC may be disabled by a fuse. */
-                              DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
-                                              tmp);
+                             /* On Carrizo, EDC may be disabled permanently by a fuse. */
+                             DRM_INFO("Carrizo EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
+                             return 0;
+             }
+
+             /*
+             * Check if EDC has been requested by a kernel parameter.
+             * For Carrizo, EDC is the best/safest mode WRT error handling.
+             */
+             if (!(amdgpu_ecc_flags
+                   & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
+                             DRM_INFO("EDC support has not been requested.\n");
                               return 0;
               }
@@ -1892,6 +1902,20 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
+             /* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
+             /* Do not propagate the errors to the next block. */
+             tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
+             WREG32(mmGB_EDC_MODE, tmp);
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+
+             /*
+             * Clear EDC_DISABLE bit so the counters are available.
+             */
+             tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
+             WREG32(mmCC_GC_EDC_CONFIG, tmp);
+
               gfx_v8_0_edc_clear_counters(adev);
 fail:
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
index 2ccf44e..c4fd013 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -179,6 +179,20 @@ struct amd_pp_profile {
#define AMD_PG_SUPPORT_GFX_QUICK_MG                                (1 << 11)
#define AMD_PG_SUPPORT_GFX_PIPELINE                     (1 << 12)
+/*
+ * ECC flags
+ * Allows the user to choose what kind of error detection/correction is used.
+ * Currently, EDC is supported on Carrizo.
+ *
+ * The AMD_ECC_SUPPORT_BEST bit is used to allow a user to have the driver
+ * set what it thinks is best/safest mode.  This may not be the same as the
+ * default, depending on the GPU and the application.
+ * Using a single bit makes it easy to request the best support without
+ * needing to know all currently supported modes.
+ */
+#define AMD_ECC_SUPPORT_BEST                                     (1 << 0)
+#define AMD_ECC_SUPPORT_EDC                                       (1 << 1)
+
enum amd_pm_state_type {
               /* not used for dpm */
               POWER_STATE_TYPE_DEFAULT,
--
2.7.4



-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20170428/43d4e969/attachment-0001.html>