MEC FW expects MES to unmap all queues when a VM fault is observed on a queue and then resumed once the affected process is terminated. Use the MES Suspend and Resume APIs to achieve this. Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx> --- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 75 ++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f6e211070299..e16b17e301d9 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -319,6 +319,56 @@ static int remove_all_queues_mes(struct device_queue_manager *dqm) return retval; } +static int suspend_all_queues_mes(struct device_queue_manager *dqm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int r; + struct mes_suspend_gang_input queue_input; + + if (dqm->is_hws_hang) + return -EIO; + + memset(&queue_input, 0x0, sizeof(struct mes_suspend_gang_input)); + queue_input.suspend_all_gangs = 1; + + amdgpu_mes_lock(&adev->mes); + r = adev->mes.funcs->suspend_gang(&adev->mes, &queue_input); + amdgpu_mes_unlock(&adev->mes); + + if (r) { + dev_err(adev->dev, "failed to suspend gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); + } + + return r; +} + +static int resume_all_queues_mes(struct device_queue_manager *dqm) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dqm->dev->adev; + int r; + struct mes_resume_gang_input queue_input; + + if (dqm->is_hws_hang) + return -EIO; + + memset(&queue_input, 0x0, sizeof(struct mes_resume_gang_input)); + queue_input.resume_all_gangs = 1; + + amdgpu_mes_lock(&adev->mes); + r = adev->mes.funcs->resume_gang(&adev->mes, &queue_input); + amdgpu_mes_unlock(&adev->mes); + + if (r) { + dev_err(adev->dev, "failed to resume gangs from MES\n"); + dev_err(adev->dev, "MES might be in unrecoverable state, issue a GPU reset\n"); + kfd_hws_hang(dqm); + } + + return r; +} + static void increment_queue_count(struct device_queue_manager *dqm, struct qcm_process_device *qpd, struct queue *q) @@ -2839,14 +2889,37 @@ int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid) { struct kfd_process_device *pdd; struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + struct device *dev = dqm->dev->adev->dev; int ret = 0; if (!p) return -EINVAL; WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); pdd = kfd_get_process_device_data(dqm->dev, p); - if (pdd) + if (pdd) { + if (dqm->dev->kfd->shared_resources.enable_mes) { + ret = suspend_all_queues_mes(dqm); + if (ret) { + dev_err(dev, "Suspending all queues failed"); + goto out; + } + } ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd); + if (ret) { + dev_err(dev, "Evicting process queues failed"); + goto out; + } + + if (dqm->dev->kfd->shared_resources.enable_mes) { + ret = resume_all_queues_mes(dqm); + if (ret) { + dev_err(dev, "Resuming all queues failed"); + goto out; + } + } + } + +out: kfd_unref_process(p); return ret; -- 2.35.1