Re: [PATCH v2 2/7] drm/amdgpu: Avoid accessing HW when suspending SW state

Luben Tuikov <luben.tuikov@xxxxxxx> · Mon, 31 Aug 2020 16:19:20 -0400

On 2020-08-28 3:26 p.m., Alex Deucher wrote:
> On Fri, Aug 28, 2020 at 12:06 PM Andrey Grodzovsky
> <andrey.grodzovsky@xxxxxxx> wrote:
>>
>> At this point the ASIC is already post reset by the HW/PSP
>> so the HW not in proper state to be configured for suspension,
>> some bloks might be even gated and so best is to avoid touching it.
> 
> typo: "blocks"

I already mentioned this in the 1st version of the patch--seems to have been ignored.

Regards,
Luben

> 
>>
>> v2: Rename in_dpc to more meaningful name
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@xxxxxxx>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  1 +
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 41 +++++++++++++++++++++++++++++-
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    |  6 +++++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    |  6 +++++
>>  drivers/gpu/drm/amd/amdgpu/atom.c          |  1 +
>>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c     | 18 ++++++++-----
>>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c  |  3 +++
>>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c     |  3 +++
>>  8 files changed, 71 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 3399242..cac51e8 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -992,6 +992,7 @@ struct amdgpu_device {
>>         atomic_t                        throttling_logging_enabled;
>>         struct ratelimit_state          throttling_logging_rs;
>>         uint32_t                        ras_features;
>> +       bool                            in_pci_err_recovery;
>>  };
>>
>>  static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 937f8b0..e67cbf2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -319,6 +319,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>>  {
>>         uint32_t ret;
>>
>> +       if (adev->in_pci_err_recovery)
>> +               return 0;
>> +
>>         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>>                 return amdgpu_kiq_rreg(adev, reg);
>>
>> @@ -352,6 +355,9 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
>>   */
>>  uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return 0;
>> +
>>         if (offset < adev->rmmio_size)
>>                 return (readb(adev->rmmio + offset));
>>         BUG();
>> @@ -374,14 +380,21 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>>   */
>>  void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return;
>> +
>>         if (offset < adev->rmmio_size)
>>                 writeb(value, adev->rmmio + offset);
>>         else
>>                 BUG();
>>  }
>>
>> -void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
>> +void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>> +                                      uint32_t v, uint32_t acc_flags)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return;
>> +
>>         trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
>>
>>         if ((reg * 4) < adev->rmmio_size)
>> @@ -409,6 +422,9 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
>>  void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>>                     uint32_t acc_flags)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return;
>> +
>>         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
>>                 return amdgpu_kiq_wreg(adev, reg, v);
>>
>> @@ -423,6 +439,9 @@ void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>>  void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
>>                     uint32_t acc_flags)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return;
>> +
>>         if (amdgpu_sriov_fullaccess(adev) &&
>>                 adev->gfx.rlc.funcs &&
>>                 adev->gfx.rlc.funcs->is_rlcg_access_range) {
>> @@ -444,6 +463,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t
>>   */
>>  u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return 0;
>> +
>>         if ((reg * 4) < adev->rio_mem_size)
>>                 return ioread32(adev->rio_mem + (reg * 4));
>>         else {
>> @@ -463,6 +485,9 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>>   */
>>  void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return;
>> +
>>         if ((reg * 4) < adev->rio_mem_size)
>>                 iowrite32(v, adev->rio_mem + (reg * 4));
>>         else {
>> @@ -482,6 +507,9 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>>   */
>>  u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return 0;
>> +
>>         if (index < adev->doorbell.num_doorbells) {
>>                 return readl(adev->doorbell.ptr + index);
>>         } else {
>> @@ -502,6 +530,9 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>>   */
>>  void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return;
>> +
>>         if (index < adev->doorbell.num_doorbells) {
>>                 writel(v, adev->doorbell.ptr + index);
>>         } else {
>> @@ -520,6 +551,9 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>>   */
>>  u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return 0;
>> +
>>         if (index < adev->doorbell.num_doorbells) {
>>                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
>>         } else {
>> @@ -540,6 +574,9 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>>   */
>>  void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
>>  {
>> +       if (adev->in_pci_err_recovery)
>> +               return;
>> +
>>         if (index < adev->doorbell.num_doorbells) {
>>                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
>>         } else {
>> @@ -4778,7 +4815,9 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
>>
>>         pci_restore_state(pdev);
>>
>> +       adev->in_pci_err_recovery = true;
>>         r = amdgpu_device_ip_suspend(adev);
>> +       adev->in_pci_err_recovery = false;
>>         if (r)
>>                 goto out;
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index d698142..8c9bacf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -693,6 +693,9 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)
>>         struct amdgpu_kiq *kiq = &adev->gfx.kiq;
>>         struct amdgpu_ring *ring = &kiq->ring;
>>
>> +       if (adev->in_pci_err_recovery)
>> +               return 0;
>> +
>>         BUG_ON(!ring->funcs->emit_rreg);
>>
>>         spin_lock_irqsave(&kiq->ring_lock, flags);
>> @@ -757,6 +760,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
>>
>>         BUG_ON(!ring->funcs->emit_wreg);
>>
>> +       if (adev->in_pci_err_recovery)
>> +               return;
>> +
>>         spin_lock_irqsave(&kiq->ring_lock, flags);
>>         amdgpu_ring_alloc(ring, 32);
>>         amdgpu_ring_emit_wreg(ring, reg, v);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> index d6c38e2..a7771aa 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
>> @@ -219,6 +219,9 @@ int psp_wait_for(struct psp_context *psp, uint32_t reg_index,
>>         int i;
>>         struct amdgpu_device *adev = psp->adev;
>>
>> +       if (psp->adev->in_pci_err_recovery)
>> +               return 0;
>> +
>>         for (i = 0; i < adev->usec_timeout; i++) {
>>                 val = RREG32(reg_index);
>>                 if (check_changed) {
>> @@ -245,6 +248,9 @@ psp_cmd_submit_buf(struct psp_context *psp,
>>         bool ras_intr = false;
>>         bool skip_unsupport = false;
>>
>> +       if (psp->adev->in_pci_err_recovery)
>> +               return 0;
>> +
>>         mutex_lock(&psp->mutex);
>>
>>         memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/atom.c b/drivers/gpu/drm/amd/amdgpu/atom.c
>> index 4cfc786..613dac1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/atom.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/atom.c
>> @@ -750,6 +750,7 @@ static void atom_op_jump(atom_exec_context *ctx, int *ptr, int arg)
>>                                         DRM_ERROR("atombios stuck in loop for more than %dsecs aborting\n",
>>                                                   ATOM_CMD_TIMEOUT_SEC);
>>                                         ctx->abort = true;
>> +                                       dump_stack();
> 
> Leftover from debugging?
> 
> 
> 
>>                                 }
>>                         } else {
>>                                 /* jiffies wrap around we will just wait a little longer */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> index 2db195e..ccf096c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>> @@ -6980,15 +6980,19 @@ static int gfx_v10_0_hw_fini(void *handle)
>>
>>         amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
>>         amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
>> +
>> +       if (!adev->in_pci_err_recovery) {
>>  #ifndef BRING_UP_DEBUG
>> -       if (amdgpu_async_gfx_ring) {
>> -               r = gfx_v10_0_kiq_disable_kgq(adev);
>> -               if (r)
>> -                       DRM_ERROR("KGQ disable failed\n");
>> -       }
>> +               if (amdgpu_async_gfx_ring) {
>> +                       r = gfx_v10_0_kiq_disable_kgq(adev);
>> +                       if (r)
>> +                               DRM_ERROR("KGQ disable failed\n");
>> +               }
>>  #endif
>> -       if (amdgpu_gfx_disable_kcq(adev))
>> -               DRM_ERROR("KCQ disable failed\n");
>> +               if (amdgpu_gfx_disable_kcq(adev))
>> +                       DRM_ERROR("KCQ disable failed\n");
>> +       }
>> +
> 
> gfx9 probably needs something similar, but that can come later if we
> ever validate this for older parts.
> 
>>         if (amdgpu_sriov_vf(adev)) {
>>                 gfx_v10_0_cp_gfx_enable(adev, false);
>>                 /* Program KIQ position of RLC_CP_SCHEDULERS during destroy */
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> index 8462b30..306461d 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
>> @@ -224,9 +224,12 @@ int smu_dpm_set_power_gate(struct smu_context *smu, uint32_t block_type,
>>  {
>>         int ret = 0;
>>
>> +
>>         if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled)
>>                 return -EOPNOTSUPP;
>>
>> +
>> +
> 
> Unrelated whitespaces changes.  Please drop.  With this things fixed:
> Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx>
> 
> 
>>         switch (block_type) {
>>         /*
>>          * Some legacy code of amdgpu_vcn.c and vcn_v2*.c still uses
>> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> index a58ea08..97aa72a 100644
>> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
>> @@ -112,6 +112,9 @@ int smu_cmn_send_smc_msg_with_param(struct smu_context *smu,
>>         struct amdgpu_device *adev = smu->adev;
>>         int ret = 0, index = 0;
>>
>> +       if (smu->adev->in_pci_err_recovery)
>> +               return 0;
>> +
>>         index = smu_cmn_to_asic_specific_index(smu,
>>                                                CMN2ASIC_MAPPING_MSG,
>>                                                msg);
>> --
>> 2.7.4
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx@xxxxxxxxxxxxxxxxxxxxx
>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cluben.tuikov%40amd.com%7Cb1b272ee77ba4d18e56e08d84b885545%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637342396521349293&amp;sdata=%2BLIG7VZ3lpaEUuNydr1XwodqRlAqkiweEcWtk2zppAU%3D&amp;reserved=0
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&amp;data=02%7C01%7Cluben.tuikov%40amd.com%7Cb1b272ee77ba4d18e56e08d84b885545%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637342396521349293&amp;sdata=%2BLIG7VZ3lpaEUuNydr1XwodqRlAqkiweEcWtk2zppAU%3D&amp;reserved=0
> 

_______________________________________________
amd-gfx mailing list
amd-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/amd-gfx