[AMD Official Use Only - General] > -----Original Message----- > From: amd-gfx <amd-gfx-bounces@xxxxxxxxxxxxxxxxxxxxx> On Behalf Of Lazar, > Lijo > Sent: Friday, March 15, 2024 6:47 AM > To: Luo, Zhigang <Zhigang.Luo@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Zhang, Hawking <Hawking.Zhang@xxxxxxx>; Saye, Sashank > <Sashank.Saye@xxxxxxx>; Chan, Hing Pong <Jeffrey.Chan@xxxxxxx> > Subject: Re: [PATCH] drm/amdgpu: trigger flr_work if reading pf2vf data failed > > Caution: This message originated from an External Source. Use proper > caution when opening attachments, clicking links, or responding. > > > On 3/14/2024 10:24 PM, Zhigang Luo wrote: > > if reading pf2vf data failed 5 times continuously, it means something > > is wrong. Need to trigger flr_work to recover the issue. > > > > also use dev_err to print the error message to get which device has > > issue and add warning message if waiting IDH_FLR_NOTIFICATION_CMPL > > timeout. > > > > Signed-off-by: Zhigang Luo <Zhigang.Luo@xxxxxxx> > > Change-Id: Ia7ce934d0c3068ad3934715c14bbffdfcbafc4c2 > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++---- > > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 29 ++++++++++++++++++- > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 + > > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 ++ > > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 2 ++ > > 5 files changed, 39 insertions(+), 10 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > index 1e9454e6e4cb..9bb04a56d020 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > @@ -143,6 +143,8 @@ const char *amdgpu_asic_name[] = { > > "LAST", > > }; > > > > +static inline void amdgpu_device_stop_pending_resets(struct > > +amdgpu_device *adev); > > + > > /** > > * DOC: pcie_replay_count > > * > > @@ -4972,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct > > amdgpu_device *adev, > > retry: > > amdgpu_amdkfd_pre_reset(adev); > > > > + amdgpu_device_stop_pending_resets(adev); > > + > > if (from_hypervisor) > > r = amdgpu_virt_request_full_gpu(adev, true); > > else > > @@ -5689,11 +5693,12 @@ int amdgpu_device_gpu_recover(struct > amdgpu_device *adev, > > tmp_adev->asic_reset_res = r; > > } > > > > - /* > > - * Drop all pending non scheduler resets. Scheduler resets > > - * were already dropped during drm_sched_stop > > - */ > > - amdgpu_device_stop_pending_resets(tmp_adev); > > + if (!amdgpu_sriov_vf(tmp_adev)) > > + /* > > + * Drop all pending non scheduler resets. Scheduler resets > > + * were already dropped during drm_sched_stop > > + */ > > + amdgpu_device_stop_pending_resets(tmp_adev); > > } > > > > For better consistency - you may make this path common for both VF and > bare metal. Call this right before "skip_hw_reset" after a successful reset. All > pending reset jobs submitted may be cancelled at that point once a succesful > reset of the device/hive is completed. > > Thanks, > Lijo > > > /* Actual ASIC resets if needed.*/ diff --git > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > > index 7a4eae36778a..4e8364a46d4e 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > > @@ -32,6 +32,7 @@ > > > > #include "amdgpu.h" > > #include "amdgpu_ras.h" > > +#include "amdgpu_reset.h" > > #include "vi.h" > > #include "soc15.h" > > #include "nv.h" > > @@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct > amdgpu_device *adev) > > return -EINVAL; > > > > if (pf2vf_info->size > 1024) { > > - DRM_ERROR("invalid pf2vf message size\n"); > > + dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", > > + pf2vf_info->size); > > return -EINVAL; > > } > > > > @@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct > amdgpu_device *adev) > > adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, > > adev->virt.fw_reserve.checksum_key, checksum); > > if (checksum != checkval) { > > - DRM_ERROR("invalid pf2vf message\n"); > > + dev_err(adev->dev, > > + "invalid pf2vf message: header checksum=0x%x calculated > checksum=0x%x\n", > > + checksum, checkval); > > return -EINVAL; > > } > > > > @@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct > amdgpu_device *adev) > > adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, > > 0, checksum); > > if (checksum != checkval) { > > - DRM_ERROR("invalid pf2vf message\n"); > > + dev_err(adev->dev, > > + "invalid pf2vf message: header checksum=0x%x calculated > checksum=0x%x\n", > > + checksum, checkval); > > return -EINVAL; > > } > > > > @@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct > amdgpu_device *adev) > > ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid; > > break; > > default: > > - DRM_ERROR("invalid pf2vf version\n"); > > + dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", > > + pf2vf_info->version); > > return -EINVAL; > > } > > > > @@ -584,8 +589,21 @@ static void > amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) > > int ret; > > > > ret = amdgpu_virt_read_pf2vf_data(adev); > > - if (ret) > > + if (ret) { > > + adev->virt.vf2pf_update_retry_cnt++; Can we also add a debug print with retry_cnt value each time it's incremented? Thanks, Victor > > + if ((adev->virt.vf2pf_update_retry_cnt >= 5) && > > + amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) { > > + if (amdgpu_reset_domain_schedule(adev->reset_domain, > > + &adev->virt.flr_work)) > > + return; > > + else > > + dev_err(adev->dev, "Failed to queue work! at %s", > __func__); > > + } > > + > > goto out; > > + } > > + > > + adev->virt.vf2pf_update_retry_cnt = 0; > > amdgpu_virt_write_vf2pf_data(adev); > > > > out: > > @@ -606,6 +624,7 @@ void amdgpu_virt_init_data_exchange(struct > amdgpu_device *adev) > > adev->virt.fw_reserve.p_pf2vf = NULL; > > adev->virt.fw_reserve.p_vf2pf = NULL; > > adev->virt.vf2pf_update_interval_ms = 0; > > + adev->virt.vf2pf_update_retry_cnt = 0; > > > > if (adev->mman.fw_vram_usage_va && adev- > >mman.drv_vram_usage_va) { > > DRM_WARN("Currently fw_vram and drv_vram should not have > > values at the same time!"); diff --git > > a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > > index 3f59b7b5523f..eedf5d44bf00 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > > @@ -257,6 +257,7 @@ struct amdgpu_virt { > > /* vf2pf message */ > > struct delayed_work vf2pf_work; > > uint32_t vf2pf_update_interval_ms; > > + int vf2pf_update_retry_cnt; > > > > /* multimedia bandwidth config */ > > bool is_mm_bw_enabled; > > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > > b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > > index a2bd2c3b1ef9..0c7275bca8f7 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > > @@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct > work_struct *work) > > timeout -= 10; > > } while (timeout > 1); > > > > + dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL > > + timeout\n"); > > + > > flr_done: > > atomic_set(&adev->reset_domain->in_gpu_reset, 0); > > up_write(&adev->reset_domain->sem); > > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > > b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > > index 77f5b55decf9..ee9aa2815d5a 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > > @@ -309,6 +309,8 @@ static void xgpu_nv_mailbox_flr_work(struct > work_struct *work) > > timeout -= 10; > > } while (timeout > 1); > > > > + dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL > > + timeout\n"); > > + > > flr_done: > > atomic_set(&adev->reset_domain->in_gpu_reset, 0); > > up_write(&adev->reset_domain->sem);