On 6/19/2024 2:44 AM, Vignesh Chander wrote: > For RAS error scenario, VF guest driver will check mailbox > and set fed flag to avoid unnecessary HW accesses. > additionally, poll for reset completion message first > to avoid accidentally spamming multiple reset requests to host. > > v2: add another mailbox check for handling case where kfd detects > timeout first > > Signed-off-by: Vignesh Chander <Vignesh.Chander@xxxxxxx> > Change-Id: Ib501c653265883999c62a12a209ce5eb81c80846 > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 25 +++++++++++++++++++++--- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 4 +++- > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 22 +++++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 4 +++- > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 22 +++++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 3 ++- > 6 files changed, 70 insertions(+), 10 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > index 63f2286858c484..ccb3d041c2b249 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c > @@ -229,6 +229,22 @@ void amdgpu_virt_free_mm_table(struct amdgpu_device *adev) > adev->virt.mm_table.gpu_addr = 0; > } > > +/** > + * amdgpu_virt_rcvd_ras_interrupt() - receive ras interrupt > + * @adev: amdgpu device. 
> + * Check whether host sent RAS error message > + * Return: true if found, otherwise false > + */ > +bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev) > +{ > + struct amdgpu_virt *virt = &adev->virt; > + > + if (!virt->ops || !virt->ops->rcvd_ras_intr) > + return false; > + > + return virt->ops->rcvd_ras_intr(adev); > +} > + > > unsigned int amd_sriov_msg_checksum(void *obj, > unsigned long obj_size, > @@ -612,11 +628,14 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) > ret = amdgpu_virt_read_pf2vf_data(adev); > if (ret) { > adev->virt.vf2pf_update_retry_cnt++; > + > + if ((amdgpu_virt_rcvd_ras_interrupt(adev) || > + adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && > + amdgpu_sriov_runtime(adev)) { > + > amdgpu_ras_set_fed(adev, true); > if (amdgpu_reset_domain_schedule(adev->reset_domain, > - &adev->kfd.reset_work)) > + &adev->kfd.reset_work)) Instead of this and the waits below, what about checking the status in gpu_recover() or in device_reset_sriov()? Those are called for resets initiated from all sources. Setting the flag means it will wait for FLR completion. /* Actual ASIC resets if needed.*/ /* Host driver will handle XGMI hive reset for SRIOV */ if (amdgpu_sriov_vf(adev)) { + + /* RAS error is equivalent to FLR initiated from host, wait for + * completion + */ + if (amdgpu_virt_rcvd_ras_interrupt(adev) || amdgpu_ras_get_fed_status(adev)) + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); + Thanks, Lijo > return; > else > dev_err(adev->dev, "Failed to queue work! 
at %s", __func__); > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > index f04cd1586c7220..b42a8854dca0cb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h > @@ -52,7 +52,7 @@ > /* tonga/fiji use this offset */ > #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503 > > -#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 5 > +#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 2 > > enum amdgpu_sriov_vf_mode { > SRIOV_VF_MODE_BARE_METAL = 0, > @@ -94,6 +94,7 @@ struct amdgpu_virt_ops { > u32 data1, u32 data2, u32 data3); > void (*ras_poison_handler)(struct amdgpu_device *adev, > enum amdgpu_ras_block block); > + bool (*rcvd_ras_intr)(struct amdgpu_device *adev); > }; > > /* > @@ -352,6 +353,7 @@ void amdgpu_virt_ready_to_reset(struct amdgpu_device *adev); > int amdgpu_virt_wait_reset(struct amdgpu_device *adev); > int amdgpu_virt_alloc_mm_table(struct amdgpu_device *adev); > void amdgpu_virt_free_mm_table(struct amdgpu_device *adev); > +bool amdgpu_virt_rcvd_ras_interrupt(struct amdgpu_device *adev); > void amdgpu_virt_release_ras_err_handler_data(struct amdgpu_device *adev); > void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev); > void amdgpu_virt_exchange_data(struct amdgpu_device *adev); > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > index 65656afc6ed1c2..1bb8393ad6d358 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c > @@ -196,11 +196,22 @@ static int xgpu_ai_request_reset(struct amdgpu_device *adev) > { > int ret, i = 0; > > - while (i < AI_MAILBOX_POLL_MSG_REP_MAX) { > + if (amdgpu_ras_get_fed_status(adev) || xgpu_ai_rcvd_ras_intr(adev)) { > + dev_dbg(adev->dev, "ras flag is set, poll for IDH_FLR_NOTIFICATION_CMPL\n"); > + > + for (i = 0; i < AI_MAILBOX_POLL_MSG_REP_MAX; i++) { > + ret = xgpu_ai_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL); > + if (!ret) > + break; > + > + 
dev_dbg(adev->dev, "retries left = %d\n", AI_MAILBOX_POLL_MSG_REP_MAX - i); > + } > + } > + > + for (i = 0; i < AI_MAILBOX_POLL_MSG_REP_MAX; i++) { > ret = xgpu_ai_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS); > if (!ret) > break; > - i++; > } > > return ret; > @@ -408,6 +419,12 @@ static void xgpu_ai_ras_poison_handler(struct amdgpu_device *adev, > xgpu_ai_send_access_requests(adev, IDH_RAS_POISON); > } > > +static bool xgpu_ai_rcvd_ras_intr(struct amdgpu_device *adev) > +{ > + enum idh_event msg = xgpu_ai_mailbox_peek_msg(adev); > + return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF); > +} > + > const struct amdgpu_virt_ops xgpu_ai_virt_ops = { > .req_full_gpu = xgpu_ai_request_full_gpu_access, > .rel_full_gpu = xgpu_ai_release_full_gpu_access, > @@ -417,4 +434,5 @@ const struct amdgpu_virt_ops xgpu_ai_virt_ops = { > .trans_msg = xgpu_ai_mailbox_trans_msg, > .req_init_data = xgpu_ai_request_init_data, > .ras_poison_handler = xgpu_ai_ras_poison_handler, > + .rcvd_ras_intr = xgpu_ai_rcvd_ras_intr, > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h > index c520b2fabfb9a8..ed57cbc150afba 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h > @@ -51,7 +51,9 @@ enum idh_event { > IDH_FAIL, > IDH_QUERY_ALIVE, > IDH_REQ_GPU_INIT_DATA_READY, > - > + IDH_RAS_POISON_READY, > + IDH_PF_SOFT_FLR_NOTIFICATION, > + IDH_RAS_ERROR_DETECTED, > IDH_TEXT_MESSAGE = 255, > }; > > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > index 17e1e8cc243752..f2e5b38a64314c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c > @@ -225,11 +225,22 @@ static int xgpu_nv_request_reset(struct amdgpu_device *adev) > { > int ret, i = 0; > > - while (i < NV_MAILBOX_POLL_MSG_REP_MAX) { > + if (amdgpu_ras_get_fed_status(adev) || xgpu_nv_rcvd_ras_intr(adev)) { > + dev_dbg(adev->dev, "ras flag is set, poll for 
IDH_FLR_NOTIFICATION_CMPL\n"); > + > + for (i = 0; i < NV_MAILBOX_POLL_MSG_REP_MAX; i++) { > + ret = xgpu_nv_poll_msg(adev, IDH_FLR_NOTIFICATION_CMPL); > + if (!ret) > + break; > + > + dev_dbg(adev->dev, "retries left = %d\n", NV_MAILBOX_POLL_MSG_REP_MAX - i); > + } > + } > + > + for (i = 0; i < NV_MAILBOX_POLL_MSG_REP_MAX; i++) { > ret = xgpu_nv_send_access_requests(adev, IDH_REQ_GPU_RESET_ACCESS); > if (!ret) > break; > - i++; > } > > return ret; > @@ -449,6 +460,12 @@ static void xgpu_nv_ras_poison_handler(struct amdgpu_device *adev, > } > } > > +static bool xgpu_nv_rcvd_ras_intr(struct amdgpu_device *adev) > +{ > + enum idh_event msg = xgpu_nv_mailbox_peek_msg(adev); > + return (msg == IDH_RAS_ERROR_DETECTED || msg == 0xFFFFFFFF); > +} > + > const struct amdgpu_virt_ops xgpu_nv_virt_ops = { > .req_full_gpu = xgpu_nv_request_full_gpu_access, > .rel_full_gpu = xgpu_nv_release_full_gpu_access, > @@ -458,4 +475,5 @@ const struct amdgpu_virt_ops xgpu_nv_virt_ops = { > .wait_reset = xgpu_nv_wait_reset, > .trans_msg = xgpu_nv_mailbox_trans_msg, > .ras_poison_handler = xgpu_nv_ras_poison_handler, > + .rcvd_ras_intr = xgpu_nv_rcvd_ras_intr, > }; > diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h > index 1e8fd90cab4347..719a4c88615752 100644 > --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h > +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h > @@ -52,7 +52,8 @@ enum idh_event { > IDH_QUERY_ALIVE, > IDH_REQ_GPU_INIT_DATA_READY, > IDH_RAS_POISON_READY, > - > + IDH_PF_SOFT_FLR_NOTIFICATION, > + IDH_RAS_ERROR_DETECTED, > IDH_TEXT_MESSAGE = 255, > }; >