> -----Original Message----- > From: amd-gfx [mailto:amd-gfx-bounces at lists.freedesktop.org] On Behalf > Of David Panariti > Sent: Tuesday, June 06, 2017 4:33 PM > To: amd-gfx at lists.freedesktop.org > Cc: Panariti, David > Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection > and Correction) workarounds. > > The workarounds are unconditionally performed on CZs with EDC enabled. > EDC detects uncorrected ECC errors and uses data poisoning to prevent > corrupted compute results from being used (read). > EDC enabled CZs are often used in embedded environments. > > Signed-off-by: David Panariti <David.Panariti at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 13 ++++ > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 136 > +++++++++++++++++++++++----------- > 2 files changed, 107 insertions(+), 42 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 0d64bbba..a6f51eb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -2060,5 +2060,18 @@ int amdgpu_dm_display_resume(struct > amdgpu_device *adev ); > static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) > { return 0; } > #endif > > +/* Carrizo EDC support */ > +struct reg32_counter_name_map { > + uint32_t rnmap_addr; /* Counter register address */ > + const char *rnmap_name; /* Name of the counter */ > + size_t rnmap_num_instances; /* Number of block instances */ > +}; > + > +#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) { > \ > + .rnmap_addr = mm##reg, \ > + .rnmap_name = #reg, \ > + .rnmap_num_instances = num_instances \ > +} > + > #include "amdgpu_object.h" > #endif > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index 07172f8..46e766e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -1727,35 +1727,80 @@ static const u32 sgpr2_init_regs[] = > mmCOMPUTE_USER_DATA_9, 0xedcedc09, > }; > > -static const u32 sec_ded_counter_registers[] = > -{ > - mmCPC_EDC_ATC_CNT, > - mmCPC_EDC_SCRATCH_CNT, > - mmCPC_EDC_UCODE_CNT, > - mmCPF_EDC_ATC_CNT, > - mmCPF_EDC_ROQ_CNT, > - mmCPF_EDC_TAG_CNT, > - mmCPG_EDC_ATC_CNT, > - mmCPG_EDC_DMA_CNT, > - mmCPG_EDC_TAG_CNT, > - mmDC_EDC_CSINVOC_CNT, > - mmDC_EDC_RESTORE_CNT, > - mmDC_EDC_STATE_CNT, > - mmGDS_EDC_CNT, > - mmGDS_EDC_GRBM_CNT, > - mmGDS_EDC_OA_DED, > - mmSPI_EDC_CNT, > - mmSQC_ATC_EDC_GATCL1_CNT, > - mmSQC_EDC_CNT, > - mmSQ_EDC_DED_CNT, > - mmSQ_EDC_INFO, > - mmSQ_EDC_SEC_CNT, > - mmTCC_EDC_CNT, > - mmTCP_ATC_EDC_GATCL1_CNT, > - mmTCP_EDC_CNT, > - mmTD_EDC_CNT > +/* See GRBM_GFX_INDEX, et. al. registers. */ > +static const struct reg32_counter_name_map sec_ded_counter_registers[] > = { > + DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2), > + > DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CN > T, 2), > + > + DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8), > + DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8), > + DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8), > + > DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CN > T, 8), > + DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8), > + > + DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4), > + > + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT, > 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1), > + DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1), > }; > > +static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev) > +{ > + int ci, se, sh, i; > + uint32_t count; > + int r = 0; > + > + mutex_lock(&adev->grbm_idx_mutex); > + > + for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) { > + const struct reg32_counter_name_map *cp = > + sec_ded_counter_registers + ci; > + const char *name = cp->rnmap_name; > + > + for (se = 0; se < adev->gfx.config.max_shader_engines; > ++se) { > + for (sh = 0; sh < adev->gfx.config.max_sh_per_se; > ++sh) { > + for (i = 0; i < cp->rnmap_num_instances; ++i) > { > + gfx_v8_0_select_se_sh(adev, se, sh, > i); > + count = RREG32(cp->rnmap_addr); > + count = RREG32(cp->rnmap_addr); > + if (count != 0) { > + /* > + * EDC workaround failed. > + * If people are interested > + * in EDC at all, they will > + * want to know which > + * counters had problems. > + */ > + DRM_WARN("EDC counter > %s is 0x%08x, but should be 0\n.", > + name, count); > + r = -EINVAL; > + goto ret; > + } > + } > + } > + } > + } > + > +ret: > + gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); > + mutex_unlock(&adev->grbm_idx_mutex); > + > + return r; > +} > + > static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device > *adev) > { > struct amdgpu_ring *ring = &adev->gfx.compute_ring[0]; > @@ -1763,18 +1808,36 @@ static int > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) > struct dma_fence *f = NULL; > int r, i; > u32 tmp; > + u32 dis_bit; > unsigned total_size, vgpr_offset, sgpr_offset; > u64 gpu_addr; > > - /* only supported on CZ */ > - if (adev->asic_type != CHIP_CARRIZO) > + if (adev->asic_type != CHIP_CARRIZO) { > + /* EDC is only supported on CZ */ > + return 0; > + } No need to add the additional parens. > + > + DRM_INFO("Detected Carrizo.\n"); > + Drop this debugging print. > + tmp = RREG32(mmCC_GC_EDC_CONFIG); > + dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC); > + if (dis_bit) { You can just check the bit directly, e.g., if (REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC)) { > + /* On Carrizo, EDC may be permanently disabled by a fuse. */ > + DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG: > 0x%08x.\n", > + tmp); I would reverse the logic here and only print a message if EDC is enabled. > return 0; > + } > > /* bail if the compute ring is not ready */ > if (!ring->ready) > return 0; > > - tmp = RREG32(mmGB_EDC_MODE); > + DRM_INFO("Applying EDC workarounds.\n"); > + Drop the debugging print. > + /* > + * Interested parties can enable EDC using debugfs register reads and > + * writes. > + */ > WREG32(mmGB_EDC_MODE, 0); > > total_size = > @@ -1899,18 +1962,7 @@ static int > gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev) > goto fail; > } > > - tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2); > - tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1); > - WREG32(mmGB_EDC_MODE, tmp); > - > - tmp = RREG32(mmCC_GC_EDC_CONFIG); > - tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1; > - WREG32(mmCC_GC_EDC_CONFIG, tmp); > - > - > - /* read back registers to clear the counters */ > - for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++) > - RREG32(sec_ded_counter_registers[i]); > + gfx_v8_0_edc_clear_counters(adev); > > fail: > amdgpu_ib_free(adev, &ib, NULL); > -- > 2.7.4 > > _______________________________________________ > amd-gfx mailing list > amd-gfx at lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx