From: Dennis Li <Dennis.Li@xxxxxxx> SQ's watchdog timer monitors forward progress, a mask of which waves caused the watchdog timeout is recorded into ras status registers and then trigger a system fatal error event. v2: 1. change *query_timeout_status to *query_sq_timeout_status. 2. move query_sq_timeout_status into amdgpu_ras_do_recovery. 3. add module parameters to enable/disable fatal error event and modify the watchdog timer. v3: 1. remove unused parameters of *enable_watchdog_timer Signed-off-by: Dennis Li <Dennis.Li@xxxxxxx> Reviewed-by: Hawking Zhang <Hawking.Zhang@xxxxxxx> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 18 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 + drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 5 + drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 106 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h | 3 + drivers/gpu/drm/amd/amdgpu/soc15_common.h | 30 ++++++ 8 files changed, 174 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 361e84369b6c..f82ad1b8eed5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -127,6 +127,12 @@ struct amdgpu_mgpu_info uint32_t num_apu; }; +struct amdgpu_watchdog_timer +{ + bool timeout_fatal_disable; + uint32_t period; /* maxCycles = (1 << period), the number of cycles before a timeout */ +}; + #define AMDGPU_MAX_TIMEOUT_PARAM_LENGTH 256 /* @@ -186,6 +192,7 @@ extern struct amdgpu_mgpu_info mgpu_info; extern int amdgpu_ras_enable; extern uint amdgpu_ras_mask; extern int amdgpu_bad_page_threshold; +extern struct amdgpu_watchdog_timer amdgpu_watchdog_timer; extern int amdgpu_async_gfx_ring; extern int amdgpu_mcbp; extern int amdgpu_discovery; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index c4d822b46ea4..42654deb9c55 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -172,6 +172,10 @@ struct amdgpu_mgpu_info mgpu_info = { int amdgpu_ras_enable = -1; uint amdgpu_ras_mask = 0xffffffff; int amdgpu_bad_page_threshold = 100; +struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { + .timeout_fatal_disable = false, + .period = 0x3f, /* about 8s */ +}; /** * DOC: vramlimit (int) @@ -527,6 +531,20 @@ module_param_named(ras_enable, amdgpu_ras_enable, int, 0444); MODULE_PARM_DESC(ras_mask, "Mask of RAS features to enable (default 0xffffffff), only valid when ras_enable == 1"); module_param_named(ras_mask, amdgpu_ras_mask, uint, 0444); +/** + * DOC: timeout_fatal_disable (bool) + * Disable Watchdog timeout fatal error event + */ +MODULE_PARM_DESC(timeout_fatal_disable, "disable watchdog timeout fatal error (false = default)"); +module_param_named(timeout_fatal_disable, amdgpu_watchdog_timer.timeout_fatal_disable, bool, 0644); + +/** + * DOC: timeout_period (uint) + * Modify the watchdog timeout max_cycles as (1 << period) + */ +MODULE_PARM_DESC(timeout_period, "watchdog timeout period (0x1F = default), timeout maxCycles = (1 << period)"); +module_param_named(timeout_period, amdgpu_watchdog_timer.period, uint, 0644); + /** * DOC: si_support (int) * Set SI support driver. This parameter works after set config CONFIG_DRM_AMDGPU_SI. For SI asic, when radeon driver is enabled, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 1ab9632282d4..d92f0f14cbeb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -226,6 +226,8 @@ struct amdgpu_gfx_funcs { void (*init_spm_golden)(struct amdgpu_device *adev); void (*query_ras_error_status) (struct amdgpu_device *adev); void (*update_perfmon_mgcg)(struct amdgpu_device *adev, bool enable); + void (*enable_watchdog_timer)(struct amdgpu_device *adev); + void (*query_sq_timeout_status)(struct amdgpu_device *adev); }; struct sq_work { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 09546dec40ff..5805c78c356b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1467,6 +1467,9 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, case AMDGPU_RAS_BLOCK__GFX: if (adev->gfx.funcs->query_ras_error_status) adev->gfx.funcs->query_ras_error_status(adev); + + if (adev->gfx.funcs->query_sq_timeout_status) + adev->gfx.funcs->query_sq_timeout_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: if (adev->mmhub.funcs->query_ras_error_status) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 977eeec70e1b..c37139d2c487 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2121,6 +2121,8 @@ static const struct amdgpu_gfx_funcs gfx_v9_4_2_gfx_funcs = { .query_ras_error_count = &gfx_v9_4_2_query_ras_error_count, .reset_ras_error_count = &gfx_v9_4_2_reset_ras_error_count, .query_ras_error_status = &gfx_v9_4_2_query_ras_error_status, + .enable_watchdog_timer = &gfx_v9_4_2_enable_watchdog_timer, + .query_sq_timeout_status = &gfx_v9_4_2_query_sq_timeout_status, }; static int gfx_v9_0_gpu_early_init(struct amdgpu_device *adev) @@ -3965,6 +3967,9 @@ static int gfx_v9_0_hw_init(void *handle) if (adev->asic_type == CHIP_ALDEBARAN) gfx_v9_4_2_set_power_brake_sequence(adev); + if (adev->gfx.funcs->enable_watchdog_timer) + adev->gfx.funcs->enable_watchdog_timer(adev); + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c index b2e2026c3ec7..1faeae14ead9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c @@ -1129,3 +1129,109 @@ void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev) gfx_v9_4_2_query_ea_err_status(adev); gfx_v9_4_2_query_utc_err_status(adev); } + +void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev) +{ + uint32_t i; + uint32_t data; + + data = REG_SET_FIELD(0, SQ_TIMEOUT_CONFIG, TIMEOUT_FATAL_DISABLE, + amdgpu_watchdog_timer.timeout_fatal_disable ? 1 : + 0); + data = REG_SET_FIELD(data, SQ_TIMEOUT_CONFIG, PERIOD_SEL, + amdgpu_watchdog_timer.period); + + mutex_lock(&adev->grbm_idx_mutex); + for (i = 0; i < adev->gfx.config.max_shader_engines; i++) { + gfx_v9_4_2_select_se_sh(adev, i, 0xffffffff, 0xffffffff); + WREG32_SOC15(GC, 0, regSQ_TIMEOUT_CONFIG, data); + } + gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); + mutex_unlock(&adev->grbm_idx_mutex); +} + +static uint32_t wave_read_ind(struct amdgpu_device *adev, uint32_t simd, uint32_t wave, uint32_t address) +{ + WREG32_SOC15_RLC_EX(reg, GC, 0, regSQ_IND_INDEX, + (wave << SQ_IND_INDEX__WAVE_ID__SHIFT) | + (simd << SQ_IND_INDEX__SIMD_ID__SHIFT) | + (address << SQ_IND_INDEX__INDEX__SHIFT) | + (SQ_IND_INDEX__FORCE_READ_MASK)); + return RREG32_SOC15(GC, 0, regSQ_IND_DATA); +} + +static void gfx_v9_4_2_log_cu_timeout_status(struct amdgpu_device *adev, + uint32_t status) +{ + struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info; + uint32_t i, simd, wave; + uint32_t wave_status; + uint32_t wave_pc_lo, wave_pc_hi; + uint32_t wave_exec_lo, wave_exec_hi; + uint32_t wave_inst_dw0, wave_inst_dw1; + uint32_t wave_ib_sts; + + for (i = 0; i < 32; i++) { + if (!((i << 1) & status)) + continue; + + simd = i / cu_info->max_waves_per_simd; + wave = i % cu_info->max_waves_per_simd; + + wave_status = wave_read_ind(adev, simd, wave, ixSQ_WAVE_STATUS); + wave_pc_lo = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_LO); + wave_pc_hi = wave_read_ind(adev, simd, wave, ixSQ_WAVE_PC_HI); + wave_exec_lo = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_LO); + wave_exec_hi = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_EXEC_HI); + wave_inst_dw0 = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW0); + wave_inst_dw1 = + wave_read_ind(adev, simd, wave, ixSQ_WAVE_INST_DW1); + wave_ib_sts = wave_read_ind(adev, simd, wave, ixSQ_WAVE_IB_STS); + + dev_info( + adev->dev, + "\t SIMD %d, Wave %d: status 0x%x, pc 0x%llx, exec 0x%llx, inst 0x%llx, ib_sts 0x%x\n", + simd, wave, wave_status, + ((uint64_t)wave_pc_hi << 32 | wave_pc_lo), + ((uint64_t)wave_exec_hi << 32 | wave_exec_lo), + ((uint64_t)wave_inst_dw1 << 32 | wave_inst_dw0), + wave_ib_sts); + } +} + +void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev) +{ + uint32_t se_idx, sh_idx, cu_idx; + uint32_t status; + + mutex_lock(&adev->grbm_idx_mutex); + for (se_idx = 0; se_idx < adev->gfx.config.max_shader_engines; + se_idx++) { + for (sh_idx = 0; sh_idx < adev->gfx.config.max_sh_per_se; + sh_idx++) { + for (cu_idx = 0; + cu_idx < adev->gfx.config.max_cu_per_sh; + cu_idx++) { + gfx_v9_4_2_select_se_sh(adev, se_idx, sh_idx, + cu_idx); + status = RREG32_SOC15(GC, 0, + regSQ_TIMEOUT_STATUS); + if (status != 0) { + dev_info( + adev->dev, + "GFX Watchdog Timeout: SE %d, SH %d, CU %d\n", + se_idx, sh_idx, cu_idx); + gfx_v9_4_2_log_cu_timeout_status( + adev, status); + } + /* clear old status */ + WREG32_SOC15(GC, 0, regSQ_TIMEOUT_STATUS, 0); + } + } + } + gfx_v9_4_2_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); + mutex_unlock(&adev->grbm_idx_mutex); +} \ No newline at end of file diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h index d7e3041947e8..e01fa6afa8e4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h @@ -35,4 +35,7 @@ int gfx_v9_4_2_ras_error_inject(struct amdgpu_device *adev, void *inject_if); void gfx_v9_4_2_query_ras_error_status(struct amdgpu_device *adev); int gfx_v9_4_2_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status); + +void gfx_v9_4_2_enable_watchdog_timer(struct amdgpu_device *adev); +void gfx_v9_4_2_query_sq_timeout_status(struct amdgpu_device *adev); #endif /* __GFX_V9_4_2_H__ */ diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h b/drivers/gpu/drm/amd/amdgpu/soc15_common.h index 52ffbea63a4f..8cdf5d1685cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h @@ -100,6 +100,30 @@ } \ } while (0) +#define WREG32_RLC_EX(prefix, reg, value) \ + do { \ + if (amdgpu_sriov_fullaccess(adev)) { \ + uint32_t i = 0; \ + uint32_t retries = 50000; \ + uint32_t r0 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG0_BASE_IDX] + prefix##SCRATCH_REG0; \ + uint32_t r1 = adev->reg_offset[GC_HWIP][0][prefix##SCRATCH_REG1_BASE_IDX] + prefix##SCRATCH_REG1; \ + uint32_t spare_int = adev->reg_offset[GC_HWIP][0][prefix##RLC_SPARE_INT_BASE_IDX] + prefix##RLC_SPARE_INT; \ + WREG32(r0, value); \ + WREG32(r1, (reg | 0x80000000)); \ + WREG32(spare_int, 0x1); \ + for (i = 0; i < retries; i++) { \ + u32 tmp = RREG32(r1); \ + if (!(tmp & 0x80000000)) \ + break; \ + udelay(10); \ + } \ + if (i >= retries) \ + pr_err("timeout: rlcg program reg:0x%05x failed !\n", reg); \ + } else { \ + WREG32(reg, value); \ + } \ + } while (0) + #define WREG32_SOC15_RLC_SHADOW(ip, inst, reg, value) \ do { \ uint32_t target_reg = adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg;\ @@ -142,6 +166,12 @@ WREG32_RLC(target_reg, value); \ } while (0) +#define WREG32_SOC15_RLC_EX(prefix, ip, inst, reg, value) \ + do { \ + uint32_t target_reg = adev->reg_offset[GC_HWIP][0][reg##_BASE_IDX] + reg;\ + WREG32_RLC_EX(prefix, target_reg, value); \ + } while (0) + #define WREG32_FIELD15_RLC(ip, idx, reg, field, val) \ WREG32_RLC((adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg), \ (RREG32(adev->reg_offset[ip##_HWIP][idx][mm##reg##_BASE_IDX] + mm##reg) \ -- 2.29.2 _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx