On 3/14/2024 10:24 PM, Zhigang Luo wrote: > if reading pf2vf data failed 5 times continuously, it means something is > wrong. Need to trigger flr_work to recover the issue. > > also use dev_err to print the error message to get which device has > issue and add warning message if waiting IDH_FLR_NOTIFICATION_CMPL > timeout. > > Signed-off-by: Zhigang Luo <Zhigang.Luo@xxxxxxx> > Change-Id: Ia7ce934d0c3068ad3934715c14bbffdfcbafc4c2 > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++---- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 29 ++++++++++++++++++---- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 1 + > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 ++ > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 2 ++ > 5 files changed, 39 insertions(+), 10 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 1e9454e6e4cb..9bb04a56d020 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -143,6 +143,8 @@ const char *amdgpu_asic_name[] = { > "LAST", > }; > > +static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); > + > /** > * DOC: pcie_replay_count > * > @@ -4972,6 +4974,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, > retry: > amdgpu_amdkfd_pre_reset(adev); > > + amdgpu_device_stop_pending_resets(adev); > + > if (from_hypervisor) > r = amdgpu_virt_request_full_gpu(adev, true); > else > @@ -5689,11 +5693,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > tmp_adev->asic_reset_res = r; > } > > - /* > - * Drop all pending non scheduler resets. Scheduler resets > - * were already dropped during drm_sched_stop > - */ > - amdgpu_device_stop_pending_resets(tmp_adev); > + if (!amdgpu_sriov_vf(tmp_adev)) > + /* > + * Drop all pending non scheduler resets. Scheduler resets > + * were already dropped during drm_sched_stop > + */ > + amdgpu_device_stop_pending_resets(tmp_adev); > } > For better consistency - you may make this path common for both VF and bare metal. Call this right before "skip_hw_reset" after a successful reset. All pending reset jobs submitted may be cancelled at that point once a succesful reset of the device/hive is completed. Thanks, Lijo > /* Actual ASIC resets if needed.*/ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > index 7a4eae36778a..4e8364a46d4e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > @@ -32,6 +32,7 @@ > > #include "amdgpu.h" > #include "amdgpu_ras.h" > +#include "amdgpu_reset.h" > #include "vi.h" > #include "soc15.h" > #include "nv.h" > @@ -424,7 +425,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) > return -EINVAL; > > if (pf2vf_info->size > 1024) { > - DRM_ERROR("invalid pf2vf message size\n"); > + dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size); > return -EINVAL; > } > > @@ -435,7 +436,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) > adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, > adev->virt.fw_reserve.checksum_key, checksum); > if (checksum != checkval) { > - DRM_ERROR("invalid pf2vf message\n"); > + dev_err(adev->dev, > + "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", > + checksum, checkval); > return -EINVAL; > } > > @@ -449,7 +452,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) > adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, > 0, checksum); > if (checksum != checkval) { > - DRM_ERROR("invalid pf2vf message\n"); > + dev_err(adev->dev, > + "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", > + checksum, checkval); > return -EINVAL; > } > > @@ -485,7 +490,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) > ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid; > break; > default: > - DRM_ERROR("invalid pf2vf version\n"); > + dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version); > return -EINVAL; > } > > @@ -584,8 +589,21 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) > int ret; > > ret = amdgpu_virt_read_pf2vf_data(adev); > - if (ret) > + if (ret) { > + adev->virt.vf2pf_update_retry_cnt++; > + if ((adev->virt.vf2pf_update_retry_cnt >= 5) && > + amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) { > + if (amdgpu_reset_domain_schedule(adev->reset_domain, > + &adev->virt.flr_work)) > + return; > + else > + dev_err(adev->dev, "Failed to queue work! at %s", __func__); > + } > + > goto out; > + } > + > + adev->virt.vf2pf_update_retry_cnt = 0; > amdgpu_virt_write_vf2pf_data(adev); > > out: > @@ -606,6 +624,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) > adev->virt.fw_reserve.p_pf2vf = NULL; > adev->virt.fw_reserve.p_vf2pf = NULL; > adev->virt.vf2pf_update_interval_ms = 0; > + adev->virt.vf2pf_update_retry_cnt = 0; > > if (adev->mman.fw_vram_usage_va && adev->mman.drv_vram_usage_va) { > DRM_WARN("Currently fw_vram and drv_vram should not have values at the same time!"); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > index 3f59b7b5523f..eedf5d44bf00 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > @@ -257,6 +257,7 @@ struct amdgpu_virt { > /* vf2pf message */ > struct delayed_work vf2pf_work; > uint32_t vf2pf_update_interval_ms; > + int vf2pf_update_retry_cnt; > > /* multimedia bandwidth config */ > bool is_mm_bw_enabled; > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index a2bd2c3b1ef9..0c7275bca8f7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) > timeout -= 10; > } while (timeout > 1); > > + dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n"); > + > flr_done: > atomic_set(&adev->reset_domain->in_gpu_reset, 0); > up_write(&adev->reset_domain->sem); > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > index 77f5b55decf9..ee9aa2815d5a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > @@ -309,6 +309,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) > timeout -= 10; > } while (timeout > 1); > > + dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n"); > + > flr_done: > atomic_set(&adev->reset_domain->in_gpu_reset, 0); > up_write(&adev->reset_domain->sem);