[Public] > From: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> > Sent: Monday, December 30, 2024 10:43 > Subject: [PATCH 6.6 71/86] drm/amdkfd: pause autosuspend when creating pdd Hi Greg, This patch caused a regression; a fix is pending here: https://www.mail-archive.com/amd-gfx@xxxxxxxxxxxxxxxxxxxxx/msg116533.html Regards, Teddy > 6.6-stable review patch. If anyone has any objections, please let me know. > > ------------------ > > From: Jesse.zhang@xxxxxxx <Jesse.zhang@xxxxxxx> > > [ Upstream commit 438b39ac74e2a9dc0a5c9d653b7d8066877e86b1 ] > > When using MES creating a pdd will require talking to the GPU to setup the relevant > context. The code here forgot to wake up the GPU in case it was in suspend, this > causes KVM to EFAULT for passthrough GPU for example. This issue can be > masked if the GPU was woken up by other things (e.g. opening the KMS node) first > and have not yet gone to sleep. > > v4: do the allocation of proc_ctx_bo in a lazy fashion when the first queue is created > in a process (Felix) > > Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx> > Reviewed-by: Yunxiang Li <Yunxiang.Li@xxxxxxx> > Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx> > Cc: stable@xxxxxxxxxxxxxxx > Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx> > --- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 15 ++++++++++++ > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 23 ++----------------- > 2 files changed, 17 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index 4d9a406925e1..43fa260ddbce 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -197,6 +197,21 @@ static int add_queue_mes(struct device_queue_manager > *dqm, struct queue *q, > if (dqm->is_hws_hang) > return -EIO; > > + if (!pdd->proc_ctx_cpu_ptr) { > + r = amdgpu_amdkfd_alloc_gtt_mem(adev, > + AMDGPU_MES_PROC_CTX_SIZE, 
> + &pdd->proc_ctx_bo, > + &pdd->proc_ctx_gpu_addr, > + &pdd->proc_ctx_cpu_ptr, > + false); > + if (r) { > + dev_err(adev->dev, > + "failed to allocate process context bo\n"); > + return r; > + } > + memset(pdd->proc_ctx_cpu_ptr, 0, > AMDGPU_MES_PROC_CTX_SIZE); > + } > + > memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input)); > queue_input.process_id = qpd->pqm->process->pasid; > queue_input.page_table_base_addr = qpd->page_table_base; diff --git > a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index 577bdb6a9640..64346c71c62a 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -1046,7 +1046,8 @@ static void kfd_process_destroy_pdds(struct > kfd_process *p) > > kfd_free_process_doorbells(pdd->dev->kfd, pdd); > > - if (pdd->dev->kfd->shared_resources.enable_mes) > + if (pdd->dev->kfd->shared_resources.enable_mes && > + pdd->proc_ctx_cpu_ptr) > amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev, > &pdd->proc_ctx_bo); > /* > @@ -1572,7 +1573,6 @@ struct kfd_process_device > *kfd_create_process_device_data(struct kfd_node *dev, > struct kfd_process *p) > { > struct kfd_process_device *pdd = NULL; > - int retval = 0; > > if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE)) > return NULL; > @@ -1596,21 +1596,6 @@ struct kfd_process_device > *kfd_create_process_device_data(struct kfd_node *dev, > pdd->user_gpu_id = dev->id; > atomic64_set(&pdd->evict_duration_counter, 0); > > - if (dev->kfd->shared_resources.enable_mes) { > - retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev, > - AMDGPU_MES_PROC_CTX_SIZE, > - &pdd->proc_ctx_bo, > - &pdd->proc_ctx_gpu_addr, > - &pdd->proc_ctx_cpu_ptr, > - false); > - if (retval) { > - dev_err(dev->adev->dev, > - "failed to allocate process context bo\n"); > - goto err_free_pdd; > - } > - memset(pdd->proc_ctx_cpu_ptr, 0, > AMDGPU_MES_PROC_CTX_SIZE); > - } > - > p->pdds[p->n_pdds++] = pdd; > if (kfd_dbg_is_per_vmid_supported(pdd->dev)) > 
pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap( > @@ -1622,10 +1607,6 @@ struct kfd_process_device > *kfd_create_process_device_data(struct kfd_node *dev, > idr_init(&pdd->alloc_idr); > > return pdd; > - > -err_free_pdd: > - kfree(pdd); > - return NULL; > } > > /** > -- > 2.39.5 > >