[PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection and

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The workarounds are unconditionally performed on CZs with EDC enabled.
EDC detects uncorrected ECC errors and uses data poisoning to prevent
corrupted compute results from being used (read).
EDC enabled CZs are often used in embedded environments.

Change-Id: I84c261785329beeb797f11efbe0ec35790f2996c
Signed-off-by: David Panariti <David.Panariti at amd.com><mailto:David.Panariti at amd.com>
---
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 148 ++++++++++++++++++++++++----------
1 file changed, 106 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 27b57cb..2f5bf5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1645,35 +1645,92 @@ static const u32 sgpr2_init_regs[] =
               mmCOMPUTE_USER_DATA_9, 0xedcedc09,
};
-static const u32 sec_ded_counter_registers[] =
-{
-              mmCPC_EDC_ATC_CNT,
-              mmCPC_EDC_SCRATCH_CNT,
-              mmCPC_EDC_UCODE_CNT,
-              mmCPF_EDC_ATC_CNT,
-              mmCPF_EDC_ROQ_CNT,
-              mmCPF_EDC_TAG_CNT,
-              mmCPG_EDC_ATC_CNT,
-              mmCPG_EDC_DMA_CNT,
-              mmCPG_EDC_TAG_CNT,
-              mmDC_EDC_CSINVOC_CNT,
-              mmDC_EDC_RESTORE_CNT,
-              mmDC_EDC_STATE_CNT,
-              mmGDS_EDC_CNT,
-              mmGDS_EDC_GRBM_CNT,
-              mmGDS_EDC_OA_DED,
-              mmSPI_EDC_CNT,
-              mmSQC_ATC_EDC_GATCL1_CNT,
-              mmSQC_EDC_CNT,
-              mmSQ_EDC_DED_CNT,
-              mmSQ_EDC_INFO,
-              mmSQ_EDC_SEC_CNT,
-              mmTCC_EDC_CNT,
-              mmTCP_ATC_EDC_GATCL1_CNT,
-              mmTCP_EDC_CNT,
-              mmTD_EDC_CNT
+struct reg32_counter_name_map {
+             uint32_t rnmap_addr;   /* Counter register address */
+             const char *rnmap_name;           /* Name of the counter */
+             size_t rnmap_num_instances;  /* Number of block instances */
};
+#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {             \
+             .rnmap_addr = mm##reg,                                                                           \
+             .rnmap_name = #reg,                                                                    \
+             .rnmap_num_instances = num_instances                                            \
+}
+
+/* See GRBM_GFX_INDEX, et. al. registers. */
+static const struct reg32_counter_name_map sec_ded_counter_registers[] = {
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CNT, 2),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CNT, 8),
+             DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
+
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
+             DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
+};
+
+static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
+{
+             int ci, se, sh, i;
+             uint32_t count;
+             int r = 0;
+
+             mutex_lock(&adev->grbm_idx_mutex);
+
+             for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
+                             const struct reg32_counter_name_map *cp =
+                                             sec_ded_counter_registers + ci;
+                             const char *name = cp->rnmap_name;
+
+                             for (se = 0; se < adev->gfx.config.max_shader_engines; ++se) {
+                                             for (sh = 0; sh < adev->gfx.config.max_sh_per_se; ++sh) {
+                                                             for (i = 0; i < cp->rnmap_num_instances; ++i) {
+                                                                             gfx_v8_0_select_se_sh(adev, se, sh, i);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             count = RREG32(cp->rnmap_addr);
+                                                                             if (count != 0) {
+                                                                                             /*
+                                                                                             * Workaround failed.
+                                                                                             * If people are interested
+                                                                                             * in EDC at all, they will
+                                                                                             * want to know which
+                                                                                             * counters had problems.
+                                                                                             */
+                                                                                             DRM_WARN("EDC counter %s is 0x%08x, but should be 0\n.",
+                                                                                                             name, count);
+                                                                                             r = -EINVAL;
+                                                                                             goto ret;
+                                                                             }
+                                                             }
+                                             }
+                             }
+             }
+
+ret:
+             gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
+             mutex_unlock(&adev->grbm_idx_mutex);
+
+             return r;
+}
+
static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
{
               struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
@@ -1681,18 +1738,36 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
               struct fence *f = NULL;
               int r, i;
               u32 tmp;
+             u32 dis_bit;
               unsigned total_size, vgpr_offset, sgpr_offset;
               u64 gpu_addr;
-              /* only supported on CZ */
-              if (adev->asic_type != CHIP_CARRIZO)
+             if (adev->asic_type != CHIP_CARRIZO) {
+                             /* EDC is only supported on CZ */
+                             return 0;
+             }
+
+        DRM_INFO("Detected Carrizo.\n");
+
+             tmp = RREG32(mmCC_GC_EDC_CONFIG);
+             dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
+             if (dis_bit) {
+                             /* On Carrizo, EDC may be disabled by a fuse. */
+                             DRM_INFO("EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
+                                             tmp);
                               return 0;
+             }
                /* bail if the compute ring is not ready */
               if (!ring->ready)
                               return 0;
-              tmp = RREG32(mmGB_EDC_MODE);
+             DRM_INFO("Applying EDC workarounds.\n");
+
+             /*
+             * Interested parties can enable EDC using debugfs register reads and
+             * writes.
+             */
               WREG32(mmGB_EDC_MODE, 0);
                total_size =
@@ -1817,18 +1892,7 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                               goto fail;
               }
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
-              tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
-              WREG32(mmGB_EDC_MODE, tmp);
-
-              tmp = RREG32(mmCC_GC_EDC_CONFIG);
-              tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
-              WREG32(mmCC_GC_EDC_CONFIG, tmp);
-
-
-              /* read back registers to clear the counters */
-              for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
-                              RREG32(sec_ded_counter_registers[i]);
+             gfx_v8_0_edc_clear_counters(adev);
 fail:
               amdgpu_ib_free(adev, &ib, NULL);
--
2.7.4



[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux