RE: [PATCH] drm/amdgpu: trigger flr_work if reading pf2vf data failed

"Skvortsov, Victor" <Victor.Skvortsov@xxxxxxx> · Sun, 17 Mar 2024 00:59:52 +0000

[AMD Official Use Only - General]

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Lazar,
> Lijo
> Sent: Friday, March 15, 2024 6:47 AM
> To: Luo, Zhigang <Zhigang.Luo@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx
> Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Saye, Sashank
> <Sashank.Saye@xxxxxxx>; Chan, Hing Pong <Jeffrey.Chan@xxxxxxx>
> Subject: Re: [PATCH] drm/amdgpu: trigger flr_work if reading pf2vf data failed
>
> Caution: This message originated from an External Source. Use proper
> caution when opening attachments, clicking links, or responding.
>
>
> On 3/14/2024 10:24 PM, Zhigang Luo wrote:
> > if reading pf2vf data failed 5 times continuously, it means something
> > is wrong. Need to trigger flr_work to recover the issue.
> >
> > also use dev_err to print the error message to get which device has
> > issue and add warning message if waiting IDH_FLR_NOTIFICATION_CMPL
> > timeout.
> >
> > Signed-off-by: Zhigang Luo <Zhigang.Luo@xxxxxxx>
> > Change-Id: Ia7ce934d0c3068ad3934715c14bbffdfcbafc4c2
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++----
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c   | 29 ++++++++++++++++++-
> ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  1 +
> >  drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c      |  2 ++
> >  drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c      |  2 ++
> >  5 files changed, 39 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 1e9454e6e4cb..9bb04a56d020 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -143,6 +143,8 @@ const char *amdgpu_asic_name[] = {
> >       "LAST",
> >  };
> >
> > +static inline void amdgpu_device_stop_pending_resets(struct
> > +amdgpu_device *adev);
> > +
> >  /**
> >   * DOC: pcie_replay_count
> >   *
> > @@ -4972,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct
> > amdgpu_device *adev,
> >  retry:
> >       amdgpu_amdkfd_pre_reset(adev);
> >
> > +     amdgpu_device_stop_pending_resets(adev);
> > +
> >       if (from_hypervisor)
> >               r = amdgpu_virt_request_full_gpu(adev, true);
> >       else
> > @@ -5689,11 +5693,12 @@ int amdgpu_device_gpu_recover(struct
> amdgpu_device *adev,
> >                       tmp_adev->asic_reset_res = r;
> >               }
> >
> > -             /*
> > -              * Drop all pending non scheduler resets. Scheduler resets
> > -              * were already dropped during drm_sched_stop
> > -              */
> > -             amdgpu_device_stop_pending_resets(tmp_adev);
> > +             if (!amdgpu_sriov_vf(tmp_adev))
> > +                     /*
> > +                     * Drop all pending non scheduler resets. Scheduler resets
> > +                     * were already dropped during drm_sched_stop
> > +                     */
> > +                     amdgpu_device_stop_pending_resets(tmp_adev);
> >       }
> >
>
> For better consistency - you may make this path common for both VF and
> bare metal. Call this right before "skip_hw_reset" after a successful reset. All
> pending reset jobs submitted may be cancelled at that point once a succesful
> reset of the device/hive is completed.
>
> Thanks,
> Lijo
>
> >       /* Actual ASIC resets if needed.*/ diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > index 7a4eae36778a..4e8364a46d4e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > @@ -32,6 +32,7 @@
> >
> >  #include "amdgpu.h"
> >  #include "amdgpu_ras.h"
> > +#include "amdgpu_reset.h"
> >  #include "vi.h"
> >  #include "soc15.h"
> >  #include "nv.h"
> > @@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct
> amdgpu_device *adev)
> >               return -EINVAL;
> >
> >       if (pf2vf_info->size > 1024) {
> > -             DRM_ERROR("invalid pf2vf message size\n");
> > +             dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n",
> > + pf2vf_info->size);
> >               return -EINVAL;
> >       }
> >
> > @@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct
> amdgpu_device *adev)
> >                       adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
> >                       adev->virt.fw_reserve.checksum_key, checksum);
> >               if (checksum != checkval) {
> > -                     DRM_ERROR("invalid pf2vf message\n");
> > +                     dev_err(adev->dev,
> > +                             "invalid pf2vf message: header checksum=0x%x calculated
> checksum=0x%x\n",
> > +                             checksum, checkval);
> >                       return -EINVAL;
> >               }
> >
> > @@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct
> amdgpu_device *adev)
> >                       adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size,
> >                       0, checksum);
> >               if (checksum != checkval) {
> > -                     DRM_ERROR("invalid pf2vf message\n");
> > +                     dev_err(adev->dev,
> > +                             "invalid pf2vf message: header checksum=0x%x calculated
> checksum=0x%x\n",
> > +                             checksum, checkval);
> >                       return -EINVAL;
> >               }
> >
> > @@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct
> amdgpu_device *adev)
> >                       ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
> >               break;
> >       default:
> > -             DRM_ERROR("invalid pf2vf version\n");
> > +             dev_err(adev->dev, "invalid pf2vf version: 0x%x\n",
> > + pf2vf_info->version);
> >               return -EINVAL;
> >       }
> >
> > @@ -584,8 +589,21 @@ static void
> amdgpu_virt_update_vf2pf_work_item(struct work_struct *work)
> >       int ret;
> >
> >       ret = amdgpu_virt_read_pf2vf_data(adev);
> > -     if (ret)
> > +     if (ret) {
> > +             adev->virt.vf2pf_update_retry_cnt++;

Can we also add a debug print with retry_cnt value each time it's incremented?

Thanks,
Victor

> > +             if ((adev->virt.vf2pf_update_retry_cnt >= 5) &&
> > +                 amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) {
> > +                     if (amdgpu_reset_domain_schedule(adev->reset_domain,
> > +                                                       &adev->virt.flr_work))
> > +                             return;
> > +                     else
> > +                             dev_err(adev->dev, "Failed to queue work! at %s",
> __func__);
> > +             }
> > +
> >               goto out;
> > +     }
> > +
> > +     adev->virt.vf2pf_update_retry_cnt = 0;
> >       amdgpu_virt_write_vf2pf_data(adev);
> >
> >  out:
> > @@ -606,6 +624,7 @@ void amdgpu_virt_init_data_exchange(struct
> amdgpu_device *adev)
> >       adev->virt.fw_reserve.p_pf2vf = NULL;
> >       adev->virt.fw_reserve.p_vf2pf = NULL;
> >       adev->virt.vf2pf_update_interval_ms = 0;
> > +     adev->virt.vf2pf_update_retry_cnt = 0;
> >
> >       if (adev->mman.fw_vram_usage_va && adev-
> >mman.drv_vram_usage_va) {
> >               DRM_WARN("Currently fw_vram and drv_vram should not have
> > values at the same time!"); diff --git
> > a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > index 3f59b7b5523f..eedf5d44bf00 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> > @@ -257,6 +257,7 @@ struct amdgpu_virt {
> >       /* vf2pf message */
> >       struct delayed_work vf2pf_work;
> >       uint32_t vf2pf_update_interval_ms;
> > +     int vf2pf_update_retry_cnt;
> >
> >       /* multimedia bandwidth config */
> >       bool     is_mm_bw_enabled;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> > b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> > index a2bd2c3b1ef9..0c7275bca8f7 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> > @@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct
> work_struct *work)
> >               timeout -= 10;
> >       } while (timeout > 1);
> >
> > +     dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL
> > + timeout\n");
> > +
> >  flr_done:
> >       atomic_set(&adev->reset_domain->in_gpu_reset, 0);
> >       up_write(&adev->reset_domain->sem);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> > b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> > index 77f5b55decf9..ee9aa2815d5a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> > @@ -309,6 +309,8 @@ static void xgpu_nv_mailbox_flr_work(struct
> work_struct *work)
> >               timeout -= 10;
> >       } while (timeout > 1);
> >
> > +     dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL
> > + timeout\n");
> > +
> >  flr_done:
> >       atomic_set(&adev->reset_domain->in_gpu_reset, 0);
> >       up_write(&adev->reset_domain->sem);