On Wed, Aug 14, 2024 at 7:28 PM Mukul Joshi <mukul.joshi@xxxxxxx> wrote: > > MEC FW expects MES to unmap all queues when a VM fault is observed > on a queue and then resumed once the affected process is terminated. > Use the MES Suspend and Resume APIs to achieve this. > > Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx> Acked-by: Alex Deucher <alexander.deucher@xxxxxxx> > --- > v1->v2: > - Add MES FW version check. > - Separate out the kfd_dqm_evict_pasid into another function. > - Use amdgpu_mes_suspend/amdgpu_mes_resume to suspend/resume queues. > > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 79 ++++++++++++++++++- > 1 file changed, 77 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index f6e211070299..cb5b866eee3b 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -319,6 +319,42 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm) > return retval; > } > > +static int suspend_all_queues_mes(struct device_queue_manager *dqm) > +{ > + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > + int r = 0; > + > + if (dqm->is_hws_hang) > + return -EIO; > + > + r = amdgpu_mes_suspend(adev); > + if (r) { > + dev_err(adev->dev, "failed to suspend gangs from MES\n"); > + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); > + kfd_hws_hang(dqm); > + } > + > + return r; > +} > + > +static int resume_all_queues_mes(struct device_queue_manager *dqm) > +{ > + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; > + int r = 0; > + > + if (dqm->is_hws_hang) > + return -EIO; > + > + r = amdgpu_mes_resume(adev); > + if (r) { > + dev_err(adev->dev, "failed to resume gangs from MES\n"); > + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); > + kfd_hws_hang(dqm); > + } > + > + return r; > +} > + > static void increment_queue_count(struct device_queue_manager *dqm, > struct qcm_process_device *qpd, > struct queue *q) > @@ -2835,6 +2871,40 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm) > kfree(dqm); > } > > +static int kfd_dqm_evict_pasid_mes(struct device_queue_manager *dqm, > + struct qcm_process_device *qpd) > +{ > + struct device *dev = dqm->dev->adev->dev; > + int ret = 0; > + > + /* Check if process is already evicted */ > + dqm_lock(dqm); > + if (qpd->evicted) { > + dqm_unlock(dqm); > + goto out; > + } > + dqm_unlock(dqm); > + > + ret = suspend_all_queues_mes(dqm); > + if (ret) { > + dev_err(dev, "Suspending all queues failed"); > + goto out; > + } > + > + ret = dqm->ops.evict_process_queues(dqm, qpd); > + if (ret) { > + dev_err(dev, "Evicting process queues failed"); > + goto out; > + } > + > + ret = resume_all_queues_mes(dqm); > + if (ret) > + dev_err(dev, "Resuming all queues failed"); > + > +out: > + return ret; > +} > + > int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid) > { > struct kfd_process_device *pdd; > @@ -2845,8 +2915,13 @@ int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid) > return -EINVAL; > WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); > pdd = kfd_get_process_device_data(dqm->dev, p); > - if (pdd) > - ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); > + if (pdd) { > + if (dqm->dev->kfd->shared_resources.enable_mes) > + ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd); > + else > + ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); > + } > + > kfd_unref_process(p); > > return ret; > -- > 2.35.1 >