On 2020-08-28 3:26 p.m., Alex Deucher wrote: > On Fri, Aug 28, 2020 at 12:06 PM Andrey Grodzovsky > <andrey.grodzovsky@xxxxxxx> wrote: >> >> At this point the ASIC is already post reset by the HW/PSP >> so the HW not in proper state to be configured for suspension, >> some bloks might be even gated and so best is to avoid touching it. > > typo: "blocks" I already mentioned this in the 1st version of the patch--seems to have been ignored. Regards, Luben > >> >> v2: Rename in_dpc to more meaningful name >> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 41 +++++++++++++++++++++++++++++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 6 +++++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 +++++ >> drivers/gpu/drm/amd/amdgpu/atom.c | 1 + >> drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 18 ++++++++----- >> drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 3 +++ >> drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 3 +++ >> 8 files changed, 71 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> index 3399242..cac51e8 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h >> @@ -992,6 +992,7 @@ struct amdgpu_device { >> atomic_t throttling_logging_enabled; >> struct ratelimit_state throttling_logging_rs; >> uint32_t ras_features; >> + bool in_pci_err_recovery; >> }; >> >> static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 937f8b0..e67cbf2 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, >> { >> uint32_t ret; >> >> + if (adev->in_pci_err_recovery) >> + return 0; >> + >> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) >> return amdgpu_kiq_rreg(adev, reg); >> >> @@ -352,6 +355,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg, >> */ >> uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) >> { >> + if (adev->in_pci_err_recovery) >> + return 0; >> + >> if (offset < adev->rmmio_size) >> return (readb(adev->rmmio + offset)); >> BUG(); >> @@ -374,14 +380,21 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) >> */ >> void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) >> { >> + if (adev->in_pci_err_recovery) >> + return; >> + >> if (offset < adev->rmmio_size) >> writeb(value, adev->rmmio + offset); >> else >> BUG(); >> } >> >> -void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags) >> +void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, >> + uint32_t v, uint32_t acc_flags) >> { >> + if (adev->in_pci_err_recovery) >> + return; >> + >> trace_amdgpu_mm_wreg(adev->pdev->device, reg, v); >> >> if ((reg * 4) < adev->rmmio_size) >> @@ -409,6 +422,9 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, >> void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, >> uint32_t acc_flags) >> { >> + if (adev->in_pci_err_recovery) >> + return; >> + >> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)) >> return amdgpu_kiq_wreg(adev, reg, v); >> >> @@ -423,6 +439,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v, >> void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v, >> uint32_t acc_flags) >> { >> + if (adev->in_pci_err_recovery) >> + return; >> + >> if (amdgpu_sriov_fullaccess(adev) && >> adev->gfx.rlc.funcs && >> adev->gfx.rlc.funcs->is_rlcg_access_range) { >> @@ -444,6 +463,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t >> */ >> u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) >> { >> + if (adev->in_pci_err_recovery) >> + return 0; >> + >> if ((reg * 4) < adev->rio_mem_size) >> return ioread32(adev->rio_mem + (reg * 4)); >> else { >> @@ -463,6 +485,9 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg) >> */ >> void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) >> { >> + if (adev->in_pci_err_recovery) >> + return; >> + >> if ((reg * 4) < adev->rio_mem_size) >> iowrite32(v, adev->rio_mem + (reg * 4)); >> else { >> @@ -482,6 +507,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v) >> */ >> u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) >> { >> + if (adev->in_pci_err_recovery) >> + return 0; >> + >> if (index < adev->doorbell.num_doorbells) { >> return readl(adev->doorbell.ptr + index); >> } else { >> @@ -502,6 +530,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index) >> */ >> void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) >> { >> + if (adev->in_pci_err_recovery) >> + return; >> + >> if (index < adev->doorbell.num_doorbells) { >> writel(v, adev->doorbell.ptr + index); >> } else { >> @@ -520,6 +551,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v) >> */ >> u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) >> { >> + if (adev->in_pci_err_recovery) >> + return 0; >> + >> if (index < adev->doorbell.num_doorbells) { >> return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index)); >> } else { >> @@ -540,6 +574,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index) >> */ >> void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v) >> { >> + if (adev->in_pci_err_recovery) >> + return; >> + >> if (index < adev->doorbell.num_doorbells) { >> atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v); >> } else { >> @@ -4778,7 +4815,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) >> >> pci_restore_state(pdev); >> >> + adev->in_pci_err_recovery = true; >> r = amdgpu_device_ip_suspend(adev); >> + adev->in_pci_err_recovery = false; >> if (r) >> goto out; >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >> index d698142..8c9bacf 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c >> @@ -693,6 +693,9 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) >> struct amdgpu_kiq *kiq = &adev->gfx.kiq; >> struct amdgpu_ring *ring = &kiq->ring; >> >> + if (adev->in_pci_err_recovery) >> + return 0; >> + >> BUG_ON(!ring->funcs->emit_rreg); >> >> spin_lock_irqsave(&kiq->ring_lock, flags); >> @@ -757,6 +760,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) >> >> BUG_ON(!ring->funcs->emit_wreg); >> >> + if (adev->in_pci_err_recovery) >> + return; >> + >> spin_lock_irqsave(&kiq->ring_lock, flags); >> amdgpu_ring_alloc(ring, 32); >> amdgpu_ring_emit_wreg(ring, reg, v); >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c >> index d6c38e2..a7771aa 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c >> @@ -219,6 +219,9 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index, >> int i; >> struct amdgpu_device *adev = psp->adev; >> >> + if (psp->adev->in_pci_err_recovery) >> + return 0; >> + >> for (i = 0; i < adev->usec_timeout; i++) { >> val = RREG32(reg_index); >> if (check_changed) { >> @@ -245,6 +248,9 @@ psp_cmd_submit_buf(struct psp_context *psp, >> bool ras_intr = false; >> bool skip_unsupport = false; >> >> + if (psp->adev->in_pci_err_recovery) >> + return 0; >> + >> mutex_lock(&psp->mutex); >> >> memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE); >> diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c >> index 4cfc786..613dac1 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/atom.c >> +++ b/drivers/gpu/drm/amd/amdgpu/atom.c >> @@ -750,6 +750,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg) >> DRM_ERROR("atombios stuck in loop for more than %dsecs aborting\n", >> ATOM_CMD_TIMEOUT_SEC); >> ctx->abort = true; >> + dump_stack(); > > Leftover from debugging? > > > >> } >> } else { >> /* jiffies wrap around we will just wait a little longer */ >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c >> index 2db195e..ccf096c 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c >> @@ -6980,15 +6980,19 @@ static int gfx_v10_0_hw_fini(void *handle) >> >> amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0); >> amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0); >> + >> + if (!adev->in_pci_err_recovery) { >> #ifndef BRING_UP_DEBUG >> - if (amdgpu_async_gfx_ring) { >> - r = gfx_v10_0_kiq_disable_kgq(adev); >> - if (r) >> - DRM_ERROR("KGQ disable failed\n"); >> - } >> + if (amdgpu_async_gfx_ring) { >> + r = gfx_v10_0_kiq_disable_kgq(adev); >> + if (r) >> + DRM_ERROR("KGQ disable failed\n"); >> + } >> #endif >> - if (amdgpu_gfx_disable_kcq(adev)) >> - DRM_ERROR("KCQ disable failed\n"); >> + if (amdgpu_gfx_disable_kcq(adev)) >> + DRM_ERROR("KCQ disable failed\n"); >> + } >> + > > gfx9 probably needs something similar, but that can come later if we > ever validate this for older parts. > >> if (amdgpu_sriov_vf(adev)) { >> gfx_v10_0_cp_gfx_enable(adev, false); >> /* Program KIQ position of RLC_CP_SCHEDULERS during destroy */ >> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c >> index 8462b30..306461d 100644 >> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c >> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c >> @@ -224,9 +224,12 @@ int smu_dpm_set_power_gate(struct smu_context *smu, uint32_t block_type, >> { >> int ret = 0; >> >> + >> if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled) >> return -EOPNOTSUPP; >> >> + >> + > > Unrelated whitespaces changes. Please drop. With this things fixed: > Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> > > >> switch (block_type) { >> /* >> * Some legacy code of amdgpu_vcn.c and vcn_v2*.c still uses >> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c >> index a58ea08..97aa72a 100644 >> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c >> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c >> @@ -112,6 +112,9 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu, >> struct amdgpu_device *adev = smu->adev; >> int ret = 0, index = 0; >> >> + if (smu->adev->in_pci_err_recovery) >> + return 0; >> + >> index = smu_cmn_to_asic_specific_index(smu, >> CMN2ASIC_MAPPING_MSG, >> msg); >> -- >> 2.7.4 >> >> _______________________________________________ >> amd-gfx mailing list >> amd-gfx@xxxxxxxxxxxxxxxxxxxxx >> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cluben.tuikov%40amd.com%7Cb1b272ee77ba4d18e56e08d84b885545%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637342396521349293&sdata=%2BLIG7VZ3lpaEUuNydr1XwodqRlAqkiweEcWtk2zppAU%3D&reserved=0 > _______________________________________________ > amd-gfx mailing list > amd-gfx@xxxxxxxxxxxxxxxxxxxxx > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=02%7C01%7Cluben.tuikov%40amd.com%7Cb1b272ee77ba4d18e56e08d84b885545%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637342396521349293&sdata=%2BLIG7VZ3lpaEUuNydr1XwodqRlAqkiweEcWtk2zppAU%3D&reserved=0 > _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx