[AMD Official Use Only - General]

> On 2022-06-22 11:36, Graham Sider wrote:
>> Starting with GFX11, MES requires wptr BOs to be GTT allocated/mapped to
>> GART for usermode queues in order to support oversubscription. In the
>> case that work is submitted to an unmapped queue, MES must have a GART
>> wptr address to determine whether the queue should be mapped.
>>
>> This change is accompanied by changes in MES and is applicable for
>> MES_API_VERSION >= 2.
>>
>> v3:
>> - Use amdgpu_vm_bo_lookup_mapping for wptr_bo mapping lookup
>> - Move wptr_bo refcount increment to amdgpu_amdkfd_map_gtt_bo_to_gart
>> - Remove list_del_init from amdgpu_amdkfd_map_gtt_bo_to_gart
>> - Cleanup/fix create_queue wptr_bo error handling
>> v4:
>> - Add MES version shift/mask defines to amdgpu_mes.h
>> - Change version check from MES_VERSION to MES_API_VERSION
>> - Add check in kfd_ioctl_create_queue before wptr bo pin/GART map to
>>   ensure bo is a single page.
>>
>> Signed-off-by: Graham Sider <Graham.Sider@xxxxxxx>
>> Acked-by: Alex Deucher <alexander.deucher@xxxxxxx>
>> Reviewed-by: Philip Yang <Philip.Yang@xxxxxxx>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 +
>>  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 48 +++++++++++++++++++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h       |  7 +++
>>  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      | 45 ++++++++++++++++-
>>  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  9 +++-
>>  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c  |  2 +
>>  drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  3 ++
>>  .../amd/amdkfd/kfd_process_queue_manager.c    | 20 +++++---
>>  8 files changed, 127 insertions(+), 9 deletions(-)
>>
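
For context on the version gate this patch depends on: kfd_ioctl_create_queue() below decodes the MES API version out of adev->mes.sched_version with the new amdgpu_mes.h masks and only takes the wptr GART path when it reads back 2 or newer. A minimal sketch of that check; the helper name here is hypothetical, while the field and macro names are taken from the hunks below:

/* Hypothetical helper mirroring the version gate in kfd_ioctl_create_queue():
 * the wptr BO only needs a GART mapping when MES is enabled and the scheduler
 * firmware reports MES_API_VERSION >= 2.
 */
static bool kfd_wptr_needs_gart_map(struct kfd_dev *dev)
{
	uint32_t api_ver = (dev->adev->mes.sched_version &
			    AMDGPU_MES_API_VERSION_MASK) >>
			   AMDGPU_MES_API_VERSION_SHIFT;

	return dev->shared_resources.enable_mes && api_ver >= 2;
}
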
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 648c031942e9..b25b41f50213 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -286,6 +286,8 @@ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
>>  				void **kptr, uint64_t *size);
>>  void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
>>
>> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo);
>> +
>>  int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
>>  					    struct dma_fence **ef);
>>  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index afd6e6923189..615ac2895d62 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -2148,6 +2148,54 @@ int amdgpu_amdkfd_gpuvm_sync_memory(
>>  	return ret;
>>  }
>>
>> +/**
>> + * amdgpu_amdkfd_map_gtt_bo_to_gart - Map BO to GART and increment reference count
>> + * @adev: Device to which allocated BO belongs
>> + * @bo: Buffer object to be mapped
>> + *
>> + * Before return, bo reference count is incremented. To release the reference and unpin/
>> + * unmap the BO, call amdgpu_amdkfd_free_gtt_mem.
>> + */
>> +int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct amdgpu_bo *bo)
>> +{
>> +	int ret;
>> +
>> +	ret = amdgpu_bo_reserve(bo, true);
>> +	if (ret) {
>> +		pr_err("Failed to reserve bo. ret %d\n", ret);
>> +		goto err_reserve_bo_failed;
>> +	}
>> +
>> +	ret = amdgpu_bo_pin(bo, AMDGPU_GEM_DOMAIN_GTT);
>> +	if (ret) {
>> +		pr_err("Failed to pin bo. ret %d\n", ret);
>> +		goto err_pin_bo_failed;
>> +	}
>> +
>> +	ret = amdgpu_ttm_alloc_gart(&bo->tbo);
>> +	if (ret) {
>> +		pr_err("Failed to bind bo to GART. ret %d\n", ret);
>> +		goto err_map_bo_gart_failed;
>> +	}
>> +
>> +	amdgpu_amdkfd_remove_eviction_fence(
>> +		bo, bo->kfd_bo->process_info->eviction_fence);
>> +
>> +	amdgpu_bo_unreserve(bo);
>> +
>> +	bo = amdgpu_bo_ref(bo);
>> +
>> +	return 0;
>> +
>> +err_map_bo_gart_failed:
>> +	amdgpu_bo_unpin(bo);
>> +err_pin_bo_failed:
>> +	amdgpu_bo_unreserve(bo);
>> +err_reserve_bo_failed:
>> +
>> +	return ret;
>> +}
>> +
>>  /** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Map a GTT BO for kernel CPU access
>>   *
>>   * @mem: Buffer object to be mapped for CPU access
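
On the lifetime contract documented above: amdgpu_amdkfd_map_gtt_bo_to_gart() pins the BO in GTT, binds it to GART and takes an extra reference, and all of that is dropped again by a single amdgpu_amdkfd_free_gtt_mem() call, which is what the create_queue error path and pqm_destroy_queue() do later in this patch. A rough usage sketch (caller name hypothetical, error handling trimmed):

/* Hypothetical caller showing the expected pairing: the reference, pin and
 * GART binding taken by amdgpu_amdkfd_map_gtt_bo_to_gart() are all released
 * by amdgpu_amdkfd_free_gtt_mem().
 */
static int example_wptr_gart_map(struct amdgpu_device *adev, struct amdgpu_bo *wptr_bo)
{
	int r;

	r = amdgpu_amdkfd_map_gtt_bo_to_gart(adev, wptr_bo);
	if (r)
		return r;

	/* ... hand the wptr page's GART address to MES ... */

	amdgpu_amdkfd_free_gtt_mem(adev, wptr_bo);
	return 0;
}
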
ret %d\n", ret); >> + goto err_map_bo_gart_failed; >> + } >> + >> + amdgpu_amdkfd_remove_eviction_fence( >> + bo, bo->kfd_bo->process_info->eviction_fence); >> + >> + amdgpu_bo_unreserve(bo); >> + >> + bo = amdgpu_bo_ref(bo); >> + >> + return 0; >> + >> +err_map_bo_gart_failed: >> + amdgpu_bo_unpin(bo); >> +err_pin_bo_failed: >> + amdgpu_bo_unreserve(bo); >> +err_reserve_bo_failed: >> + >> + return ret; >> +} >> + >> /** amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel() - Map a GTT BO for kernel CPU access >> * >> * @mem: Buffer object to be mapped for CPU access >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h >> index be4b51a5b5c7..137a2cc2e807 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h >> @@ -32,6 +32,13 @@ >> #define AMDGPU_MES_MAX_GFX_PIPES 2 >> #define AMDGPU_MES_MAX_SDMA_PIPES 2 >> >> +#define AMDGPU_MES_API_VERSION_SHIFT 12 >> +#define AMDGPU_MES_FEAT_VERSION_SHIFT 24 >> + >> +#define AMDGPU_MES_VERSION_MASK 0x00000fff >> +#define AMDGPU_MES_API_VERSION_MASK 0x00fff000 >> +#define AMDGPU_MES_FEAT_VERSION_MASK 0xff000000 >> + >> enum amdgpu_mes_priority_level { >> AMDGPU_MES_PRIORITY_LEVEL_LOW = 0, >> AMDGPU_MES_PRIORITY_LEVEL_NORMAL = 1, >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> index 625e837f0119..a0246b4bae6b 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c >> @@ -299,6 +299,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, >> struct kfd_process_device *pdd; >> struct queue_properties q_properties; >> uint32_t doorbell_offset_in_process = 0; >> + struct amdgpu_bo *wptr_bo = NULL; >> >> memset(&q_properties, 0, sizeof(struct queue_properties)); >> >> @@ -326,12 +327,49 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, >> goto err_bind_process; >> } >> >> + /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work >> + * on unmapped queues for usermode queue oversubscription (no aggregated doorbell) >> + */ >> + if (dev->shared_resources.enable_mes && >> + ((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) >> + >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) { >> + struct amdgpu_bo_va_mapping *wptr_mapping; >> + struct amdgpu_vm *wptr_vm; >> + >> + wptr_vm = drm_priv_to_vm(pdd->drm_priv); >> + err = amdgpu_bo_reserve(wptr_vm->root.bo, false); >> + if (err) >> + goto err_wptr_map_gart; >> + >> + wptr_mapping = amdgpu_vm_bo_lookup_mapping( >> + wptr_vm, args->write_pointer_address >> PAGE_SHIFT); >> + amdgpu_bo_unreserve(wptr_vm->root.bo); >> + if (!wptr_mapping) { >> + pr_err("Failed to lookup wptr bo\n"); >> + err = -EINVAL; >> + goto err_wptr_map_gart; >> + } >> + >> + wptr_bo = wptr_mapping->bo_va->base.bo; >> + if (wptr_bo->tbo.base.size > PAGE_SIZE) { >> + pr_err("Requested GART mapping for wptr bo larger than one page\n"); >> + err = -EINVAL; >> + goto err_wptr_map_gart; >> + } >> + >> + err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo); >> + if (err) { >> + pr_err("Failed to map wptr bo to GART\n"); >> + goto err_wptr_map_gart; >> + } >> + } >> + >> pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n", >> p->pasid, >> dev->id); >> >> - err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL, NULL, NULL, >> - &doorbell_offset_in_process); >> + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, wptr_bo, >> + NULL, NULL, 
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>> index 4e0387f591be..b8e14c2cc295 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c
>> @@ -377,6 +377,8 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd,
>>  	m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8);
>>  	m->sdmax_rlcx_rb_rptr_addr_lo = lower_32_bits((uint64_t)q->read_ptr);
>>  	m->sdmax_rlcx_rb_rptr_addr_hi = upper_32_bits((uint64_t)q->read_ptr);
>> +	m->sdmax_rlcx_rb_wptr_poll_addr_lo = lower_32_bits((uint64_t)q->write_ptr);
>> +	m->sdmax_rlcx_rb_wptr_poll_addr_hi = upper_32_bits((uint64_t)q->write_ptr);
>>  	m->sdmax_rlcx_doorbell_offset =
>>  		q->doorbell_off << SDMA0_QUEUE0_DOORBELL_OFFSET__OFFSET__SHIFT;
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 91e5fa56f0a2..59ba50ce54d3 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -570,6 +570,8 @@ struct queue {
>>  	void *gang_ctx_bo;
>>  	uint64_t gang_ctx_gpu_addr;
>>  	void *gang_ctx_cpu_ptr;
>> +
>> +	struct amdgpu_bo *wptr_bo;
>>  };
>>
>>  enum KFD_MQD_TYPE {
>> @@ -1205,6 +1207,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>>  			    struct file *f,
>>  			    struct queue_properties *properties,
>>  			    unsigned int *qid,
>> +			    struct amdgpu_bo *wptr_bo,
>>  			    const struct kfd_criu_queue_priv_data *q_data,
>>  			    const void *restore_mqd,
>>  			    const void *restore_ctl_stack,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index 99f2a6412201..8db58348de98 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -180,7 +180,8 @@ void pqm_uninit(struct process_queue_manager *pqm)
>>  static int init_user_queue(struct process_queue_manager *pqm,
>>  				struct kfd_dev *dev, struct queue **q,
>>  				struct queue_properties *q_properties,
>> -				struct file *f, unsigned int qid)
>> +				struct file *f, struct amdgpu_bo *wptr_bo,
>> +				unsigned int qid)
>>  {
>>  	int retval;
>>
>> @@ -210,6 +211,7 @@ static int init_user_queue(struct process_queue_manager *pqm,
>>  			goto cleanup;
>>  		}
>>  		memset((*q)->gang_ctx_cpu_ptr, 0, AMDGPU_MES_GANG_CTX_SIZE);
>> +		(*q)->wptr_bo = wptr_bo;
>>  	}
>>
>>  	pr_debug("PQM After init queue");
>> @@ -226,6 +228,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>>  			    struct file *f,
>>  			    struct queue_properties *properties,
>>  			    unsigned int *qid,
>> +			    struct amdgpu_bo *wptr_bo,
>>  			    const struct kfd_criu_queue_priv_data *q_data,
>>  			    const void *restore_mqd,
>>  			    const void *restore_ctl_stack,
>> @@ -288,7 +291,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>>  		 * allocate_sdma_queue() in create_queue() has the
>>  		 * corresponding check logic.
>>  		 */
>> -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
>> +		retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid);
>>  		if (retval != 0)
>>  			goto err_create_queue;
>>  		pqn->q = q;
>> @@ -309,7 +312,7 @@ int pqm_create_queue(struct process_queue_manager *pqm,
>>  			goto err_create_queue;
>>  		}
>>
>> -		retval = init_user_queue(pqm, dev, &q, properties, f, *qid);
>> +		retval = init_user_queue(pqm, dev, &q, properties, f, wptr_bo, *qid);
>>  		if (retval != 0)
>>  			goto err_create_queue;
>>  		pqn->q = q;
>> @@ -435,10 +438,15 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>>  			pdd->qpd.num_gws = 0;
>>  		}
>>
>> -		if (dev->shared_resources.enable_mes)
>> +		if (dev->shared_resources.enable_mes) {
>>  			amdgpu_amdkfd_free_gtt_mem(dev->adev,
>>  						   pqn->q->gang_ctx_bo);
>> -		kfd_procfs_del_queue(pqn->q);
>> +			if (pqn->q->wptr_bo)
>> +				amdgpu_amdkfd_free_gtt_mem(dev->adev, pqn->q->wptr_bo);
>> +
>> +			kfd_procfs_del_queue(pqn->q);
>
> Seems like a rebase issue: kfd_procfs_del_queue(pqn->q) should be outside the if (dev->shared_resources.enable_mes) { block.
>
Yeah, this was a rebase issue. Fixed – thanks!

>> +
>> +		}
>>  		uninit_queue(pqn->q);
>>  	}
>>
>> @@ -844,7 +852,7 @@ int kfd_criu_restore_queue(struct kfd_process *p,
>>
>>  	print_queue_properties(&qp);
>>
>> -	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, q_data, mqd, ctl_stack,
>> +	ret = pqm_create_queue(&p->pqm, pdd->dev, NULL, &qp, &queue_id, NULL, q_data, mqd, ctl_stack,
>
> CRIU restore of user queues should also create the struct queue wptr_bo GART mapping, based on the MES API version.
>
> Regards,
> Philip

As discussed offline, since this is somewhat separate it will be handled in a future patch.

Best,
Graham

>>  				   NULL);
>>  	if (ret) {
>>  		pr_err("Failed to create new queue err:%d\n", ret);