On Wed, Aug 14, 2024 at 7:28 PM Mukul Joshi <mukul.joshi@xxxxxxx> wrote: > > Based on the recommendation of MEC FW, update BadOpcode interrupt > handling by unmapping all queues, removing the queue that got the > interrupt from scheduling and remapping the rest of the queues back when > using the MES scheduler. This is done to prevent the case where unmapping > of the bad queue can fail, thereby causing a GPU reset. > > Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx> > Acked-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@xxxxxxx> Acked-by: Alex Deucher <alexander.deucher@xxxxxxx> > --- > v1->v2: > - No change. > > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 51 +++++++++++++++++++ > .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 9 ++-- > drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 1 + > 3 files changed, 58 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index cb5b866eee3b..67b97d86e65e 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -2871,6 +2871,57 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) > kfree(dqm); > } > > +int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id) > +{ > + struct kfd_process_device *pdd; > + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); > + struct device_queue_manager *dqm = knode->dqm; > + struct device *dev = dqm->dev->adev->dev; > + struct qcm_process_device *qpd; > + struct queue *q = NULL; > + int ret = 0; > + > + if (!p) > + return -EINVAL; > + > + dqm_lock(dqm); > + > + pdd = kfd_get_process_device_data(dqm->dev, p); > + if (pdd) { > + qpd = &pdd->qpd; > + > + list_for_each_entry(q, &qpd->queues_list, list) { > + if (q->doorbell_id == doorbell_id && q->properties.is_active) { > + ret = suspend_all_queues_mes(dqm); > + if (ret) { > + dev_err(dev, "Suspending all queues 
failed"); > + goto out; > + } > + > + q->properties.is_evicted = true; > + q->properties.is_active = false; > + decrement_queue_count(dqm, qpd, q); > + > + ret = remove_queue_mes(dqm, q, qpd); > + if (ret) { > + dev_err(dev, "Removing bad queue failed"); > + goto out; > + } > + > + ret = resume_all_queues_mes(dqm); > + if (ret) > + dev_err(dev, "Resuming all queues failed"); > + > + break; > + } > + } > + } > + > +out: > + dqm_unlock(dqm); > + return ret; > +} > + > static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm, > struct qcm_process_device *qpd) > { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c > index f524a55eee11..b3f988b275a8 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c > @@ -330,11 +330,14 @@ static void event_interrupt_wq_v11(struct kfd_node *dev, > if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) > kfd_signal_event_interrupt(pasid, context_id0, 32); > else if (source_id == SOC15_INTSRC_CP_BAD_OPCODE && > - KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) > - kfd_set_dbg_ev_from_interrupt(dev, pasid, > - KFD_CTXID0_DOORBELL_ID(context_id0), > + KFD_DBG_EC_TYPE_IS_PACKET(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0))) { > + u32 doorbell_id = KFD_CTXID0_DOORBELL_ID(context_id0); > + > + kfd_set_dbg_ev_from_interrupt(dev, pasid, doorbell_id, > KFD_EC_MASK(KFD_CTXID0_CP_BAD_OP_ECODE(context_id0)), > NULL, 0); > + kfd_dqm_suspend_bad_queue_mes(dev, pasid, doorbell_id); > + } > > /* SDMA */ > else if (source_id == SOC21_INTSRC_SDMA_TRAP) > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > index f7c12d4f0abb..7bba6bed2f48 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h > @@ -1324,6 +1324,7 @@ struct kernel_queue *kernel_queue_init(struct kfd_node *dev, > enum kfd_queue_type type); > void 
kernel_queue_uninit(struct kernel_queue *kq); > int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid); > +int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 pasid, u32 doorbell_id); > > /* Process Queue Manager */ > struct process_queue_node { > -- > 2.35.1 >