RE: [PATCH 6.6 71/86] drm/amdkfd: pause autosuspend when creating pdd

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



[Public]

> From: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
> Sent: Monday, December 30, 2024 10:43
> Subject: [PATCH 6.6 71/86] drm/amdkfd: pause autosuspend when creating pdd

Hi Greg,

This patch caused a regression; a fix is pending here: https://www.mail-archive.com/amd-gfx@xxxxxxxxxxxxxxxxxxxxx/msg116533.html

Regards,
Teddy

> 6.6-stable review patch.  If anyone has any objections, please let me know.
>
> ------------------
>
> From: Jesse.zhang@xxxxxxx <Jesse.zhang@xxxxxxx>
>
> [ Upstream commit 438b39ac74e2a9dc0a5c9d653b7d8066877e86b1 ]
>
> When using MES, creating a pdd will require talking to the GPU to set up the relevant
> context. The code here forgot to wake up the GPU in case it was in suspend; this
> causes KVM to EFAULT for passthrough GPU, for example. This issue can be
> masked if the GPU was woken up by other things (e.g. opening the KMS node) first
> and has not yet gone to sleep.
>
> v4: do the allocation of proc_ctx_bo in a lazy fashion when the first queue is created
> in a process (Felix)
>
> Signed-off-by: Jesse Zhang <jesse.zhang@xxxxxxx>
> Reviewed-by: Yunxiang Li <Yunxiang.Li@xxxxxxx>
> Signed-off-by: Alex Deucher <alexander.deucher@xxxxxxx>
> Cc: stable@xxxxxxxxxxxxxxx
> Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>
> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c | 15 ++++++++++++
>  drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 23 ++-----------------
>  2 files changed, 17 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 4d9a406925e1..43fa260ddbce 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -197,6 +197,21 @@ static int add_queue_mes(struct device_queue_manager
> *dqm, struct queue *q,
>       if (dqm->is_hws_hang)
>               return -EIO;
>
> +     if (!pdd->proc_ctx_cpu_ptr) {
> +             r = amdgpu_amdkfd_alloc_gtt_mem(adev,
> +                             AMDGPU_MES_PROC_CTX_SIZE,
> +                             &pdd->proc_ctx_bo,
> +                             &pdd->proc_ctx_gpu_addr,
> +                             &pdd->proc_ctx_cpu_ptr,
> +                             false);
> +             if (r) {
> +                     dev_err(adev->dev,
> +                             "failed to allocate process context bo\n");
> +                     return r;
> +             }
> +             memset(pdd->proc_ctx_cpu_ptr, 0,
> AMDGPU_MES_PROC_CTX_SIZE);
> +     }
> +
>       memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
>       queue_input.process_id = qpd->pqm->process->pasid;
>       queue_input.page_table_base_addr =  qpd->page_table_base;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 577bdb6a9640..64346c71c62a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -1046,7 +1046,8 @@ static void kfd_process_destroy_pdds(struct
> kfd_process *p)
>
>               kfd_free_process_doorbells(pdd->dev->kfd, pdd);
>
> -             if (pdd->dev->kfd->shared_resources.enable_mes)
> +             if (pdd->dev->kfd->shared_resources.enable_mes &&
> +                     pdd->proc_ctx_cpu_ptr)
>                       amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
>                                                  &pdd->proc_ctx_bo);
>               /*
> @@ -1572,7 +1573,6 @@ struct kfd_process_device
> *kfd_create_process_device_data(struct kfd_node *dev,
>                                                       struct kfd_process *p)
>  {
>       struct kfd_process_device *pdd = NULL;
> -     int retval = 0;
>
>       if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
>               return NULL;
> @@ -1596,21 +1596,6 @@ struct kfd_process_device
> *kfd_create_process_device_data(struct kfd_node *dev,
>       pdd->user_gpu_id = dev->id;
>       atomic64_set(&pdd->evict_duration_counter, 0);
>
> -     if (dev->kfd->shared_resources.enable_mes) {
> -             retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev,
> -                                             AMDGPU_MES_PROC_CTX_SIZE,
> -                                             &pdd->proc_ctx_bo,
> -                                             &pdd->proc_ctx_gpu_addr,
> -                                             &pdd->proc_ctx_cpu_ptr,
> -                                             false);
> -             if (retval) {
> -                     dev_err(dev->adev->dev,
> -                             "failed to allocate process context bo\n");
> -                     goto err_free_pdd;
> -             }
> -             memset(pdd->proc_ctx_cpu_ptr, 0,
> AMDGPU_MES_PROC_CTX_SIZE);
> -     }
> -
>       p->pdds[p->n_pdds++] = pdd;
>       if (kfd_dbg_is_per_vmid_supported(pdd->dev))
>               pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap(
> @@ -1622,10 +1607,6 @@ struct kfd_process_device
> *kfd_create_process_device_data(struct kfd_node *dev,
>       idr_init(&pdd->alloc_idr);
>
>       return pdd;
> -
> -err_free_pdd:
> -     kfree(pdd);
> -     return NULL;
>  }
>
>  /**
> --
> 2.39.5
>
>






[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux