Am 19.06.2018 um 18:09 schrieb Andrey Grodzovsky: > Access to SQ_EDC_INFO requires selecting register instance and > hence mutex lock when accessing GRBM_GFX_INDEX for which a work > is schedueled from IH. But SQ interrupt can be raised on many instances > at once which means queuing work will usually succeed for the first one > but fail for the reset since the work takes time to process. To avoid > losing info about other interrupt instances call the parsing function > directly from high IRQ when current work hasn't finished and avoid > accessing SQ_EDC_INFO in that case. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky at amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 7 +++ > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 97 ++++++++++++++++++++++++++++++----- > 2 files changed, 91 insertions(+), 13 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index e8c6cc1..a7b9ef5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -930,6 +930,11 @@ struct amdgpu_ngg { > bool init; > }; > > +struct sq_work { > + struct work_struct work; > + unsigned ih_data; > +}; > + > struct amdgpu_gfx { > struct mutex gpu_clock_mutex; > struct amdgpu_gfx_config config; > @@ -970,6 +975,8 @@ struct amdgpu_gfx { > struct amdgpu_irq_src priv_inst_irq; > struct amdgpu_irq_src cp_ecc_error_irq; > struct amdgpu_irq_src sq_irq; > + struct sq_work sq_work; > + > /* gfx status */ > uint32_t gfx_current_status; > /* ce ram size*/ > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index 93904a7..0add7fc 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -704,6 +704,17 @@ static const u32 stoney_mgcg_cgcg_init[] = > mmCGTS_SM_CTRL_REG, 0xffffffff, 0x96940200, > }; > > + > +static const char * const sq_edc_source_names[] = { > + "SQ_EDC_INFO_SOURCE_INVALID: No EDC error has occurred", > + "SQ_EDC_INFO_SOURCE_INST: EDC source is Instruction Fetch", > + "SQ_EDC_INFO_SOURCE_SGPR: EDC source is SGPR or SQC data return", > + "SQ_EDC_INFO_SOURCE_VGPR: EDC source is VGPR", > + "SQ_EDC_INFO_SOURCE_LDS: EDC source is LDS", > + "SQ_EDC_INFO_SOURCE_GDS: EDC source is GDS", > + "SQ_EDC_INFO_SOURCE_TA: EDC source is TA", > +}; > + > static void gfx_v8_0_set_ring_funcs(struct amdgpu_device *adev); > static void gfx_v8_0_set_irq_funcs(struct amdgpu_device *adev); > static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev); > @@ -2003,6 +2014,8 @@ static int gfx_v8_0_compute_ring_init(struct amdgpu_device *adev, int ring_id, > return 0; > } > > +static void gfx_v8_0_sq_irq_work_func(struct work_struct *work); > + > static int gfx_v8_0_sw_init(void *handle) > { > int i, j, k, r, ring_id; > @@ -2066,6 +2079,8 @@ static int gfx_v8_0_sw_init(void *handle) > return r; > } > > + INIT_WORK(&adev->gfx.sq_work.work, gfx_v8_0_sq_irq_work_func); > + > adev->gfx.gfx_current_status = AMDGPU_GFX_NORMAL_MODE; > > gfx_v8_0_scratch_init(adev); > @@ -6952,14 +6967,11 @@ static int gfx_v8_0_cp_ecc_error_irq(struct amdgpu_device *adev, > return 0; > } > > -static int gfx_v8_0_sq_irq(struct amdgpu_device *adev, > - struct amdgpu_irq_src *source, > - struct amdgpu_iv_entry *entry) > +static void gfx_v8_0_parse_sq_irq(struct amdgpu_device *adev, unsigned ih_data) > { > - u8 enc, se_id; > + u32 enc, se_id, sh_id, cu_id; > char type[20]; > - unsigned ih_data = entry->src_data[0]; > - > + int sq_edc_source = -1; > > enc = REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_CMN, ENCODING); > se_id = REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_CMN, SE_ID); > @@ -6985,6 +6997,24 @@ static int gfx_v8_0_sq_irq(struct amdgpu_device *adev, > case 1: > case 2: > > + cu_id = REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_WAVE, CU_ID); > + sh_id = REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_WAVE, SH_ID); > + > + /* > + * This function can be called either directly from ISR > + * or from BH in which case we can access SQ_EDC_INFO > + * instance > + */ > + if (in_task()) { > + mutex_lock(&adev->grbm_idx_mutex); > + gfx_v8_0_select_se_sh(adev, se_id, sh_id, cu_id); > + > + sq_edc_source = REG_GET_FIELD(RREG32(mmSQ_EDC_INFO), SQ_EDC_INFO, SOURCE); > + > + gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff); > + mutex_unlock(&adev->grbm_idx_mutex); > + } > + > if (enc == 1) > sprintf(type, "instruction intr"); > else > @@ -6992,20 +7022,61 @@ static int gfx_v8_0_sq_irq(struct amdgpu_device *adev, > > DRM_INFO( > "SQ %s detected: " > - "se_id %d, cu_id %d, simd_id %d, wave_id %d, vm_id %d\n" > - "trap %s, sh_id %d. ", > - type, se_id, > - REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_WAVE, CU_ID), > + "se_id %d, sh_id %d, cu_id %d, simd_id %d, wave_id %d, vm_id %d " > + "trap %s, sq_ed_info.source %s.\n", > + type, se_id, sh_id, cu_id, > REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_WAVE, SIMD_ID), > REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_WAVE, WAVE_ID), > REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_WAVE, VM_ID), > REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_WAVE, PRIV) ? "true" : "false", > - REG_GET_FIELD(ih_data, SQ_INTERRUPT_WORD_WAVE, SH_ID) > - ); > + (sq_edc_source != -1) ? sq_edc_source_names[sq_edc_source] : "unavailable" > + ); > break; > default: > DRM_ERROR("SQ invalid encoding type\n."); > - return -EINVAL; > + } > +} > + > +static void gfx_v8_0_sq_irq_work_func(struct work_struct *work) > +{ > + > + struct amdgpu_device *adev = container_of(work, struct amdgpu_device, gfx.sq_work.work); > + struct sq_work *sq_work = container_of(work, struct sq_work, work); > + > + smp_rmb(); > + > + gfx_v8_0_parse_sq_irq(adev, READ_ONCE(sq_work->ih_data)); > + > + WRITE_ONCE(sq_work->ih_data, 0); > + /* Make the value change visible ASAP to IH */ > + smp_wmb(); > +} > + > +static int gfx_v8_0_sq_irq(struct amdgpu_device *adev, > + struct amdgpu_irq_src *source, > + struct amdgpu_iv_entry *entry) > +{ > + unsigned ih_data = entry->src_data[0]; > + > + /* > + * Try to submit work so SQ_EDC_INFO can be accessed from > + * BH. If previous work submission hasn't finished yet > + * just print whatever info is possible directly from the ISR. > + */ > + smp_rmb(); > + if (READ_ONCE(adev->gfx.sq_work.ih_data)) > + gfx_v8_0_parse_sq_irq(adev, ih_data); > + else { > + WRITE_ONCE(adev->gfx.sq_work.ih_data, ih_data); > + /* Verify the new value visible in BH handler */ > + smp_wmb(); > + if (!schedule_work(&adev->gfx.sq_work.work)) { > + > + WRITE_ONCE(adev->gfx.sq_work.ih_data, 0); > + gfx_v8_0_parse_sq_irq(adev, ih_data); > + /* Verify 0 is visible to next HW IH */ > + smp_wmb(); > + } At least of hand that looks like it is racy. You should probably rather use work_pending() like this: if (work_pending(&adev->gfx.sq_work.work)) { Â Â Â gfx_v8_0_parse_sq_irq(adev, ih_data); } else { Â Â Â adev->gfx.sq_work.ih_data = ih_data; Â Â Â schedule_work(&adev->gfx.sq_work.work); } Regards, Christian. > } > > return 0;