On Fri, Aug 28, 2020 at 12:06 PM Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> wrote: > > At this point the ASIC is already post reset by the HW/PSP > so the HW not in proper state to be configured for suspension, > some bloks might be even gated and so best is to avoid touching it. typo: "blocks" > > v2: Rename in_dpc to more meaningful name > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 41 +++++++++++++++++++++++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 +++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 +++++ > drivers/gpu/drm/amd/amdgpu/atom.c | 1 + > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 18 ++++++++----- > drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 3 +++ > drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 3 +++ > 8 files changed, 71 insertions(+), 8 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index 3399242..cac51e8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -992,6 +992,7 @@ struct amdgpu_device { > atomic_t throttling_logging_enabled; > struct ratelimit_state throttling_logging_rs; > uint32_t ras_features; > + bool in_pci_err_recovery; > }; > > static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 937f8b0..e67cbf2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, > { > uint32_t ret; > > + if (adev->in_pci_err_recovery) > + return 0; > + > if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) > return amdgpu_kiq_rreg(adev, reg); > > @@ -352,6 +355,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, > */ > uint8_t 
amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) > { > + if (adev->in_pci_err_recovery) > + return 0; > + > if (offset < adev->rmmio_size) > return (readb(adev->rmmio + offset)); > BUG(); > @@ -374,14 +380,21 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) > */ > void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) > { > + if (adev->in_pci_err_recovery) > + return; > + > if (offset < adev->rmmio_size) > writeb(value, adev->rmmio + offset); > else > BUG(); > } > > -void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) > +void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, > + uint32_t v, uint32_t acc_flags) > { > + if (adev->in_pci_err_recovery) > + return; > + > trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); > > if ((reg * 4) < adev->rmmio_size) > @@ -409,6 +422,9 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, > void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, > uint32_t acc_flags) > { > + if (adev->in_pci_err_recovery) > + return; > + > if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) > return amdgpu_kiq_wreg(adev, reg, v); > > @@ -423,6 +439,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, > void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, > uint32_t acc_flags) > { > + if (adev->in_pci_err_recovery) > + return; > + > if (amdgpu_sriov_fullaccess(adev) && > adev->gfx.rlc.funcs && > adev->gfx.rlc.funcs->is_rlcg_access_range) { > @@ -444,6 +463,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t > */ > u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) > { > + if (adev->in_pci_err_recovery) > + return 0; > + > if ((reg * 4) < adev->rio_mem_size) > return ioread32(adev->rio_mem + (reg * 4)); > else { > @@ -463,6 +485,9 @@ u32 
amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) > */ > void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) > { > + if (adev->in_pci_err_recovery) > + return; > + > if ((reg * 4) < adev->rio_mem_size) > iowrite32(v, adev->rio_mem + (reg * 4)); > else { > @@ -482,6 +507,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) > */ > u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) > { > + if (adev->in_pci_err_recovery) > + return 0; > + > if (index < adev->doorbell.num_doorbells) { > return readl(adev->doorbell.ptr + index); > } else { > @@ -502,6 +530,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) > */ > void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) > { > + if (adev->in_pci_err_recovery) > + return; > + > if (index < adev->doorbell.num_doorbells) { > writel(v, adev->doorbell.ptr + index); > } else { > @@ -520,6 +551,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) > */ > u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) > { > + if (adev->in_pci_err_recovery) > + return 0; > + > if (index < adev->doorbell.num_doorbells) { > return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); > } else { > @@ -540,6 +574,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) > */ > void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) > { > + if (adev->in_pci_err_recovery) > + return; > + > if (index < adev->doorbell.num_doorbells) { > atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); > } else { > @@ -4778,7 +4815,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) > > pci_restore_state(pdev); > > + adev->in_pci_err_recovery = true; > r = amdgpu_device_ip_suspend(adev); > + adev->in_pci_err_recovery = false; > if (r) > goto out; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index d698142..8c9bacf 100644 > --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -693,6 +693,9 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) > struct amdgpu_kiq *kiq = &adev->gfx.kiq; > struct amdgpu_ring *ring = &kiq->ring; > > + if (adev->in_pci_err_recovery) > + return 0; > + > BUG_ON(!ring->funcs->emit_rreg); > > spin_lock_irqsave(&kiq->ring_lock, flags); > @@ -757,6 +760,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) > > BUG_ON(!ring->funcs->emit_wreg); > > + if (adev->in_pci_err_recovery) > + return; > + > spin_lock_irqsave(&kiq->ring_lock, flags); > amdgpu_ring_alloc(ring, 32); > amdgpu_ring_emit_wreg(ring, reg, v); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > index d6c38e2..a7771aa 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c > @@ -219,6 +219,9 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index, > int i; > struct amdgpu_device *adev = psp->adev; > > + if (psp->adev->in_pci_err_recovery) > + return 0; > + > for (i = 0; i < adev->usec_timeout; i++) { > val = RREG32(reg_index); > if (check_changed) { > @@ -245,6 +248,9 @@ psp_cmd_submit_buf(struct psp_context *psp, > bool ras_intr = false; > bool skip_unsupport = false; > > + if (psp->adev->in_pci_err_recovery) > + return 0; > + > mutex_lock(&psp->mutex); > > memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE); > diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c > index 4cfc786..613dac1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/atom.c > +++ b/drivers/gpu/drm/amd/amdgpu/atom.c > @@ -750,6 +750,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg) > DRM_ERROR("atombios stuck in loop for more than %dsecs aborting\n", > ATOM_CMD_TIMEOUT_SEC); > ctx->abort = true; > + dump_stack(); Leftover from debugging? 
> } > } else { > /* jiffies wrap around we will just wait a little longer */ > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index 2db195e..ccf096c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -6980,15 +6980,19 @@ static int gfx_v10_0_hw_fini(void *handle) > > amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); > amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); > + > + if (!adev->in_pci_err_recovery) { > #ifndef BRING_UP_DEBUG > - if (amdgpu_async_gfx_ring) { > - r = gfx_v10_0_kiq_disable_kgq(adev); > - if (r) > - DRM_ERROR("KGQ disable failed\n"); > - } > + if (amdgpu_async_gfx_ring) { > + r = gfx_v10_0_kiq_disable_kgq(adev); > + if (r) > + DRM_ERROR("KGQ disable failed\n"); > + } > #endif > - if (amdgpu_gfx_disable_kcq(adev)) > - DRM_ERROR("KCQ disable failed\n"); > + if (amdgpu_gfx_disable_kcq(adev)) > + DRM_ERROR("KCQ disable failed\n"); > + } > + gfx9 probably needs something similar, but that can come later if we ever validate this for older parts. > if (amdgpu_sriov_vf(adev)) { > gfx_v10_0_cp_gfx_enable(adev, false); > /* Program KIQ position of RLC_CP_SCHEDULERS during destroy */ > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > index 8462b30..306461d 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c > @@ -224,9 +224,12 @@ int smu_dpm_set_power_gate(struct smu_context *smu, uint32_t block_type, > { > int ret = 0; > > + > if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) > return -EOPNOTSUPP; > > + > + Unrelated whitespace changes. Please drop. 
With these things fixed: Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> > switch (block_type) { > /* > * Some legacy code of amdgpu_vcn.c and vcn_v2*.c still uses > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > index a58ea08..97aa72a 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c > @@ -112,6 +112,9 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, > struct amdgpu_device *adev = smu->adev; > int ret = 0, index = 0; > > + if (smu->adev->in_pci_err_recovery) > + return 0; > + > index = smu_cmn_to_asic_specific_index(smu, > CMN2ASIC_MAPPING_MSG, > msg); > -- > 2.7.4 > > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/amd-gfx _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx