Add a module param "force_gpu_coredump" to force coredump on relatively harmless gpu hw errors. Signed-off-by: Akhil P Oommen <akhilpo@xxxxxxxxxxxxxx> --- (no changes since v1) drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 33 ++++++++++++++++++-------- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 38 +++++++++++++++++++++--------- drivers/gpu/drm/msm/adreno/adreno_device.c | 4 ++++ 3 files changed, 54 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 5e2750e..1861e9a 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -14,6 +14,7 @@ #include "a5xx_gpu.h" extern bool hang_debug; +extern bool force_gpu_coredump; static void a5xx_dump(struct msm_gpu *gpu); #define GPU_PAS_ID 13 @@ -1237,11 +1238,6 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu) gpu_read(gpu, REG_A5XX_CP_IB1_BUFSZ), gpu_read64(gpu, REG_A5XX_CP_IB2_BASE, REG_A5XX_CP_IB2_BASE_HI), gpu_read(gpu, REG_A5XX_CP_IB2_BUFSZ)); - - /* Turn off the hangcheck timer to keep it from bothering us */ - del_timer(&gpu->hangcheck_timer); - - kthread_queue_work(gpu->worker, &gpu->recover_work); } #define RBBM_ERROR_MASK \ @@ -1255,6 +1251,7 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu) static irqreturn_t a5xx_irq(struct msm_gpu *gpu) { u32 status = gpu_read(gpu, REG_A5XX_RBBM_INT_0_STATUS); + bool coredump = false; /* * Clear all the interrupts except RBBM_AHB_ERROR - if we clear it @@ -1264,20 +1261,30 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu) status & ~A5XX_RBBM_INT_0_MASK_RBBM_AHB_ERROR); /* Pass status to a5xx_rbbm_err_irq because we've already cleared it */ - if (status & RBBM_ERROR_MASK) + if (status & RBBM_ERROR_MASK) { a5xx_rbbm_err_irq(gpu, status); + coredump |= force_gpu_coredump; + } - if (status & A5XX_RBBM_INT_0_MASK_CP_HW_ERROR) + if (status & A5XX_RBBM_INT_0_MASK_CP_HW_ERROR) { a5xx_cp_err_irq(gpu); + coredump |= force_gpu_coredump; + } - if (status & 
A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT) + if (status & A5XX_RBBM_INT_0_MASK_MISC_HANG_DETECT) { a5xx_fault_detect_irq(gpu); + coredump = true; + } - if (status & A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS) + if (status & A5XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS) { a5xx_uche_err_irq(gpu); + coredump |= force_gpu_coredump; + } - if (status & A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP) + if (status & A5XX_RBBM_INT_0_MASK_GPMU_VOLTAGE_DROOP) { a5xx_gpmu_err_irq(gpu); + coredump |= force_gpu_coredump; + } if (status & A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS) { a5xx_preempt_trigger(gpu); @@ -1287,6 +1294,12 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu) if (status & A5XX_RBBM_INT_0_MASK_CP_SW) a5xx_preempt_irq(gpu); + if (coredump) { + /* Turn off the hangcheck timer to keep it from bothering us */ + del_timer(&gpu->hangcheck_timer); + kthread_queue_work(gpu->worker, &gpu->recover_work); + } + return IRQ_HANDLED; } diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 6c2edce..f96587f 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -15,6 +15,8 @@ #define GPU_PAS_ID 13 +extern bool force_gpu_coredump; + static inline bool _a6xx_check_idle(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -1354,40 +1356,54 @@ static void a6xx_fault_detect_irq(struct msm_gpu *gpu) gpu_read(gpu, REG_A6XX_CP_IB1_REM_SIZE), gpu_read64(gpu, REG_A6XX_CP_IB2_BASE, REG_A6XX_CP_IB2_BASE_HI), gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE)); - - /* Turn off the hangcheck timer to keep it from bothering us */ - del_timer(&gpu->hangcheck_timer); - - kthread_queue_work(gpu->worker, &gpu->recover_work); } static irqreturn_t a6xx_irq(struct msm_gpu *gpu) { u32 status = gpu_read(gpu, REG_A6XX_RBBM_INT_0_STATUS); + bool coredump = false; gpu_write(gpu, REG_A6XX_RBBM_INT_CLEAR_CMD, status); - if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT) + if (status & A6XX_RBBM_INT_0_MASK_RBBM_HANG_DETECT) { 
a6xx_fault_detect_irq(gpu); + coredump = true; + } - if (status & A6XX_RBBM_INT_0_MASK_CP_AHB_ERROR) + if (status & A6XX_RBBM_INT_0_MASK_CP_AHB_ERROR) { dev_err_ratelimited(&gpu->pdev->dev, "CP | AHB bus error\n"); + coredump |= force_gpu_coredump; + } - if (status & A6XX_RBBM_INT_0_MASK_CP_HW_ERROR) + if (status & A6XX_RBBM_INT_0_MASK_CP_HW_ERROR) { a6xx_cp_hw_err_irq(gpu); + coredump |= force_gpu_coredump; + } - if (status & A6XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNCFIFO_OVERFLOW) + if (status & A6XX_RBBM_INT_0_MASK_RBBM_ATB_ASYNCFIFO_OVERFLOW) { dev_err_ratelimited(&gpu->pdev->dev, "RBBM | ATB ASYNC overflow\n"); + coredump |= force_gpu_coredump; + } - if (status & A6XX_RBBM_INT_0_MASK_RBBM_ATB_BUS_OVERFLOW) + if (status & A6XX_RBBM_INT_0_MASK_RBBM_ATB_BUS_OVERFLOW) { dev_err_ratelimited(&gpu->pdev->dev, "RBBM | ATB bus overflow\n"); + coredump |= force_gpu_coredump; + } - if (status & A6XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS) + if (status & A6XX_RBBM_INT_0_MASK_UCHE_OOB_ACCESS) { dev_err_ratelimited(&gpu->pdev->dev, "UCHE | Out of bounds access\n"); + coredump |= force_gpu_coredump; + } if (status & A6XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS) msm_gpu_retire(gpu); + if (coredump) { + /* Turn off the hangcheck timer to keep it from bothering us */ + del_timer(&gpu->hangcheck_timer); + kthread_queue_work(gpu->worker, &gpu->recover_work); + } + return IRQ_HANDLED; } diff --git a/drivers/gpu/drm/msm/adreno/adreno_device.c b/drivers/gpu/drm/msm/adreno/adreno_device.c index 2a6ce76..a159cb9 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_device.c +++ b/drivers/gpu/drm/msm/adreno/adreno_device.c @@ -20,6 +20,10 @@ bool allow_vram_carveout = false; MODULE_PARM_DESC(allow_vram_carveout, "Allow using VRAM Carveout, in place of IOMMU"); module_param_named(allow_vram_carveout, allow_vram_carveout, bool, 0600); +bool force_gpu_coredump = false; +MODULE_PARM_DESC(force_gpu_coredump, "Force gpu coredump on hw errors which are usually harmless"); +module_param_named(force_gpu_coredump, 
force_gpu_coredump, bool, 0600); + static const struct adreno_info gpulist[] = { { .rev = ADRENO_REV(2, 0, 0, 0), -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation.