[AMD Official Use Only - AMD Internal Distribution Only]
Acked-by: Vitaly Prosyak <vitaly.prosyak@xxxxxxx>
From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> on behalf of amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx <amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx>
Sent: Wednesday, July 17, 2024 4:40 PM To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> Subject: amd-gfx Digest, Vol 98, Issue 217 Send amd-gfx mailing list submissions to
amd-gfx@xxxxxxxxxxxxxxxxxxxxx To subscribe or unsubscribe via the World Wide Web, visit https://lists.freedesktop.org/mailman/listinfo/amd-gfx or, via email, send a message with subject or body 'help' to amd-gfx-request@xxxxxxxxxxxxxxxxxxxxx You can reach the person managing the list at amd-gfx-owner@xxxxxxxxxxxxxxxxxxxxx When replying, please edit your Subject line so it is more specific than "Re: Contents of amd-gfx digest..." Today's Topics: 1. [PATCH 1/6] drm/amdgpu/gfx: add bad opcode interrupt (Alex Deucher) 2. [PATCH 5/6] drm/amdgpu/gfx9: Enable bad opcode interrupt (Alex Deucher) 3. [PATCH 3/6] drm/amdgpu/gfx10: Enable bad opcode interrupt (Alex Deucher) ---------------------------------------------------------------------- Message: 1 Date: Wed, 17 Jul 2024 16:40:06 -0400 From: Alex Deucher <alexander.deucher@xxxxxxx> To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> Cc: Alex Deucher <alexander.deucher@xxxxxxx> Subject: [PATCH 1/6] drm/amdgpu/gfx: add bad opcode interrupt Message-ID: <20240717204011.15342-1-alexander.deucher@xxxxxxx> Content-Type: text/plain Add the irq source for bad opcodes. 
Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index ddda94e49db4..86d3fa7eef90 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -391,6 +391,7 @@ struct amdgpu_gfx { struct amdgpu_irq_src eop_irq; struct amdgpu_irq_src priv_reg_irq; struct amdgpu_irq_src priv_inst_irq; + struct amdgpu_irq_src bad_op_irq; struct amdgpu_irq_src cp_ecc_error_irq; struct amdgpu_irq_src sq_irq; struct amdgpu_irq_src rlc_gc_fed_irq; -- 2.45.2 ------------------------------ Message: 2 Date: Wed, 17 Jul 2024 16:40:10 -0400 From: Alex Deucher <alexander.deucher@xxxxxxx> To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> Cc: Alex Deucher <alexander.deucher@xxxxxxx> Subject: [PATCH 5/6] drm/amdgpu/gfx9: Enable bad opcode interrupt Message-ID: <20240717204011.15342-5-alexander.deucher@xxxxxxx> Content-Type: text/plain A bad opcode in the command stream causes a CP/ME hang. The firmware prevents the ME side from hanging by raising a bad opcode interrupt, and the driver needs to perform a vmid reset when it receives the interrupt. 
Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 65 +++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 97476fb2ca40..675a1a8e2515 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2182,6 +2182,13 @@ static int gfx_v9_0_sw_init(void *handle) if (r) return r; + /* Bad opcode Event */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, + GFX_9_0__SRCID__CP_BAD_OPCODE_ERROR, + &adev->gfx.bad_op_irq); + if (r) + return r; + /* Privileged reg */ r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_9_0__SRCID__CP_PRIV_REG_FAULT, &adev->gfx.priv_reg_irq); @@ -3937,6 +3944,7 @@ static int gfx_v9_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); /* DF freeze and kcq disable will fail */ if (!amdgpu_ras_intr_triggered()) @@ -4747,6 +4755,10 @@ static int gfx_v9_0_late_init(void *handle) if (r) return r; + r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); + if (r) + return r; + r = gfx_v9_0_ecc_late_init(handle); if (r) return r; @@ -5990,6 +6002,42 @@ static int gfx_v9_0_set_priv_reg_fault_state(struct amdgpu_device *adev, return 0; } +static int gfx_v9_0_set_bad_op_fault_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + unsigned type, + enum amdgpu_interrupt_state state) +{ + u32 cp_int_cntl_reg, cp_int_cntl; + int i, j; + + switch (state) { + case AMDGPU_IRQ_STATE_DISABLE: + case AMDGPU_IRQ_STATE_ENABLE: + WREG32_FIELD15(GC, 0, CP_INT_CNTL_RING0, + OPCODE_ERROR_INT_ENABLE, + state == AMDGPU_IRQ_STATE_ENABLE ? 
1 : 0); + for (i = 0; i < adev->gfx.mec.num_mec; i++) { + for (j = 0; j < adev->gfx.mec.num_pipe_per_mec; j++) { + /* MECs start at 1 */ + cp_int_cntl_reg = gfx_v9_0_get_cpc_int_cntl(adev, i + 1, j); + + if (cp_int_cntl_reg) { + cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg); + cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_ME1_PIPE0_INT_CNTL, + OPCODE_ERROR_INT_ENABLE, + state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0); + WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl); + } + } + } + break; + default: + break; + } + + return 0; +} + static int gfx_v9_0_set_priv_inst_fault_state(struct amdgpu_device *adev, struct amdgpu_irq_src *source, unsigned type, @@ -6163,6 +6211,15 @@ static int gfx_v9_0_priv_reg_irq(struct amdgpu_device *adev, return 0; } +static int gfx_v9_0_bad_op_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + DRM_ERROR("Illegal opcode in command stream\n"); + gfx_v9_0_fault(adev, entry); + return 0; +} + static int gfx_v9_0_priv_inst_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) @@ -7346,6 +7403,11 @@ static const struct amdgpu_irq_src_funcs gfx_v9_0_priv_reg_irq_funcs = { .process = gfx_v9_0_priv_reg_irq, }; +static const struct amdgpu_irq_src_funcs gfx_v9_0_bad_op_irq_funcs = { + .set = gfx_v9_0_set_bad_op_fault_state, + .process = gfx_v9_0_bad_op_irq, +}; + static const struct amdgpu_irq_src_funcs gfx_v9_0_priv_inst_irq_funcs = { .set = gfx_v9_0_set_priv_inst_fault_state, .process = gfx_v9_0_priv_inst_irq, @@ -7365,6 +7427,9 @@ static void gfx_v9_0_set_irq_funcs(struct amdgpu_device *adev) adev->gfx.priv_reg_irq.num_types = 1; adev->gfx.priv_reg_irq.funcs = &gfx_v9_0_priv_reg_irq_funcs; + adev->gfx.bad_op_irq.num_types = 1; + adev->gfx.bad_op_irq.funcs = &gfx_v9_0_bad_op_irq_funcs; + adev->gfx.priv_inst_irq.num_types = 1; adev->gfx.priv_inst_irq.funcs = &gfx_v9_0_priv_inst_irq_funcs; -- 2.45.2 ------------------------------ Message: 3 Date: Wed, 17 Jul 
2024 16:40:08 -0400 From: Alex Deucher <alexander.deucher@xxxxxxx> To: <amd-gfx@xxxxxxxxxxxxxxxxxxxxx> Cc: Jesse Zhang <jesse.zhang@xxxxxxx>, Alex Deucher <alexander.deucher@xxxxxxx> Subject: [PATCH 3/6] drm/amdgpu/gfx10: Enable bad opcode interrupt Message-ID: <20240717204011.15342-3-alexander.deucher@xxxxxxx> Content-Type: text/plain From: Jesse Zhang <jesse.zhang@xxxxxxx> A bad opcode in the command stream causes a CP/ME hang. The firmware prevents the ME side from hanging by raising a bad opcode interrupt, and the driver needs to perform a vmid reset when it receives the interrupt. v2: update irq naming (drop priv) (Alex) Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 74 ++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 66d80f3dc661..4ce13a4f7a20 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -4740,6 +4740,13 @@ static int gfx_v10_0_sw_init(void *handle) if (r) return r; + /* Bad opcode Event */ + r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, + GFX_10_1__SRCID__CP_BAD_OPCODE_ERROR, + &adev->gfx.bad_op_irq); + if (r) + return r; + /* Privileged reg */ r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_GRBM_CP, GFX_10_1__SRCID__CP_PRIV_REG_FAULT, &adev->gfx.priv_reg_irq); @@ -7416,6 +7423,7 @@ static int gfx_v10_0_hw_fini(void *handle) amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); + amdgpu_irq_put(adev, &adev->gfx.bad_op_irq, 0); /* WA added for Vangogh asic fixing the SMU suspend failure * It needs to set power gating again during gfxoff control @@ -7726,6 +7734,10 @@ static int gfx_v10_0_late_init(void *handle) if (r) return r; + r = amdgpu_irq_get(adev, &adev->gfx.bad_op_irq, 0); + if (r) + return r; + return 0; } @@ -9162,6 +9174,51 @@ static 
int gfx_v10_0_set_priv_reg_fault_state(struct amdgpu_device *adev, return 0; } +static int gfx_v10_0_set_bad_op_fault_state(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + unsigned type, + enum amdgpu_interrupt_state state) +{ + u32 cp_int_cntl_reg, cp_int_cntl; + int i , j; + + switch (state) { + case AMDGPU_IRQ_STATE_DISABLE: + case AMDGPU_IRQ_STATE_ENABLE: + for (i = 0; i < adev->gfx.me.num_me; i++) { + for (j = 0; j < adev->gfx.me.num_pipe_per_me; j++) { + cp_int_cntl_reg = gfx_v10_0_get_cpg_int_cntl(adev, i, j); + + if (cp_int_cntl_reg) { + cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg); + cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_INT_CNTL_RING0, + OPCODE_ERROR_INT_ENABLE, + state == AMDGPU_IRQ_STATE_ENABLE ? 1 : 0); + WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl); + } + } + } + for (i = 0; i < adev->gfx.mec.num_mec; i++) { + for (j = 0; j < adev->gfx.mec.num_pipe_per_mec; j++) { + /* MECs start at 1 */ + cp_int_cntl_reg = gfx_v10_0_get_cpc_int_cntl(adev, i + 1, j); + + if (cp_int_cntl_reg) { + cp_int_cntl = RREG32_SOC15_IP(GC, cp_int_cntl_reg); + cp_int_cntl = REG_SET_FIELD(cp_int_cntl, CP_ME1_PIPE0_INT_CNTL, + OPCODE_ERROR_INT_ENABLE, + state == AMDGPU_IRQ_STATE_ENABLE ? 
1 : 0); + WREG32_SOC15_IP(GC, cp_int_cntl_reg, cp_int_cntl); + } + } + } + break; + default: + break; + } + return 0; +} + static int gfx_v10_0_set_priv_inst_fault_state(struct amdgpu_device *adev, struct amdgpu_irq_src *source, unsigned int type, @@ -9237,6 +9294,15 @@ static int gfx_v10_0_priv_reg_irq(struct amdgpu_device *adev, return 0; } +static int gfx_v10_0_bad_op_irq(struct amdgpu_device *adev, + struct amdgpu_irq_src *source, + struct amdgpu_iv_entry *entry) +{ + DRM_ERROR("Illegal opcode in command stream \n"); + gfx_v10_0_handle_priv_fault(adev, entry); + return 0; +} + static int gfx_v10_0_priv_inst_irq(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) @@ -9624,6 +9690,11 @@ static const struct amdgpu_irq_src_funcs gfx_v10_0_priv_reg_irq_funcs = { .process = gfx_v10_0_priv_reg_irq, }; +static const struct amdgpu_irq_src_funcs gfx_v10_0_bad_op_irq_funcs = { + .set = gfx_v10_0_set_bad_op_fault_state, + .process = gfx_v10_0_bad_op_irq, +}; + static const struct amdgpu_irq_src_funcs gfx_v10_0_priv_inst_irq_funcs = { .set = gfx_v10_0_set_priv_inst_fault_state, .process = gfx_v10_0_priv_inst_irq, @@ -9645,6 +9716,9 @@ static void gfx_v10_0_set_irq_funcs(struct amdgpu_device *adev) adev->gfx.priv_reg_irq.num_types = 1; adev->gfx.priv_reg_irq.funcs = &gfx_v10_0_priv_reg_irq_funcs; + adev->gfx.bad_op_irq.num_types = 1; + adev->gfx.bad_op_irq.funcs = &gfx_v10_0_bad_op_irq_funcs; + adev->gfx.priv_inst_irq.num_types = 1; adev->gfx.priv_inst_irq.funcs = &gfx_v10_0_priv_inst_irq_funcs; } -- 2.45.2 ------------------------------ Subject: Digest Footer _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx ------------------------------ End of amd-gfx Digest, Vol 98, Issue 217 **************************************** |