On 30/03/2023 22:54, Alex Deucher wrote:
On Wed, Mar 29, 2023 at 11:48 AM Shashank Sharma <shashank.sharma@xxxxxxx> wrote:This patch: - adds a new doorbell manager object in kfd pdd structure. - allocates doorbells for a process while creating its pdd. - frees the doorbells with pdd destroy. - uses direct doorbell manager API for doorbell indexing. - removes previous calls to allocate process doorbells as its not required anymore. PS: This patch ensures that we don't break the existing KFD functionality, but now KFD userspace library must also move to creating doorbell pages as AMDGPU GEM objects using libdrm functions in userspace. The reference code for the same is available with AMDGPU Usermode queue libdrm MR. Once this is done, we will not need this patch. Cc: Alex Deucher <alexander.deucher@xxxxxxx> Cc: Christian Koenig <christian.koenig@xxxxxxx> Signed-off-by: Shashank Sharma <shashank.sharma@xxxxxxx> --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 13 ---- .../drm/amd/amdkfd/kfd_device_queue_manager.c | 16 ++--- drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 59 +++++++++---------- drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 8 +-- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 26 ++++---- .../amd/amdkfd/kfd_process_queue_manager.c | 16 ++--- 6 files changed, 58 insertions(+), 80 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 6d291aa6386b..0e40756417e5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -327,12 +327,6 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, goto err_bind_process; } - if (!pdd->doorbell_index && - kfd_alloc_process_doorbells(dev, &pdd->doorbell_index) < 0) { - err = -ENOMEM; - goto err_alloc_doorbells; - } - /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work * on unmapped queues for usermode queue oversubscription (no aggregated doorbell) */ @@ -410,7 +404,6 @@ static int kfd_ioctl_create_queue(struct file 
*filep, struct kfd_process *p, if (wptr_bo) amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo); err_wptr_map_gart: -err_alloc_doorbells: err_bind_process: err_pdd: mutex_unlock(&p->mutex); @@ -2163,12 +2156,6 @@ static int criu_restore_devices(struct kfd_process *p, ret = PTR_ERR(pdd); goto exit; } - - if (!pdd->doorbell_index && - kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) { - ret = -ENOMEM; - goto exit; - } } /* diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index ecb4c3abc629..5827db9b18a8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -362,7 +362,7 @@ static int allocate_doorbell(struct qcm_process_device *qpd, /* For CP queues on SOC15 */ if (restore_id) { /* make sure that ID is free */ - if (__test_and_set_bit(*restore_id, qpd->doorbell_bitmap)) + if (__test_and_set_bit(*restore_id, qpd->proc_doorbells.doorbell_bitmap)) return -EINVAL; q->doorbell_id = *restore_id; @@ -370,20 +370,20 @@ static int allocate_doorbell(struct qcm_process_device *qpd, /* or reserve a free doorbell ID */ unsigned int found; - found = find_first_zero_bit(qpd->doorbell_bitmap, - KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); + found = find_first_zero_bit(qpd->proc_doorbells.doorbell_bitmap, + KFD_MAX_NUM_OF_QUEUES_PER_PROCESS); if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) { pr_debug("No doorbells available"); return -EBUSY; } - set_bit(found, qpd->doorbell_bitmap); + set_bit(found, qpd->proc_doorbells.doorbell_bitmap); q->doorbell_id = found; } } - q->properties.doorbell_off = - kfd_get_doorbell_dw_offset_in_bar(dev, qpd_to_pdd(qpd), - q->doorbell_id); + q->properties.doorbell_off = amdgpu_doorbell_index_on_bar(dev->adev, + qpd->proc_doorbells.bo, + q->doorbell_id); return 0; } @@ -398,7 +398,7 @@ static void deallocate_doorbell(struct qcm_process_device *qpd, q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) return; - 
old = test_and_clear_bit(q->doorbell_id, qpd->doorbell_bitmap); + old = test_and_clear_bit(q->doorbell_id, qpd->proc_doorbells.doorbell_bitmap); WARN_ON(!old); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index df259f2cc58a..7d29653bff81 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -228,46 +228,41 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd) phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd) { - if (!pdd->doorbell_index) { - int r = kfd_alloc_process_doorbells(pdd->dev, - &pdd->doorbell_index); - if (r < 0) - return 0; - } + struct amdgpu_device *adev = pdd->dev->adev; - return pdd->dev->doorbell_base + - pdd->doorbell_index * kfd_doorbell_process_slice(pdd->dev); + /* Return base of the first doorbell of this process */ + return adev->doorbell.base + pdd->qpd.proc_doorbells.start * sizeof(uint32_t); } -int kfd_alloc_process_doorbells(struct kfd_dev *kfd, unsigned int *doorbell_index) +int kfd_alloc_process_doorbells(struct kfd_dev *kfd, struct kfd_process_device *pdd) { - int r = 0; - - if (!kfd->shared_resources.enable_mes) - r = ida_simple_get(&kfd->doorbell_ida, 1, - kfd->max_doorbell_slices, GFP_KERNEL); - else - r = amdgpu_mes_alloc_process_doorbells( - (struct amdgpu_device *)kfd->adev, - doorbell_index); - - if (r > 0) - *doorbell_index = r; + int r; + struct qcm_process_device *qpd = &pdd->qpd; + struct amdgpu_doorbell_obj *proc_doorbells = &qpd->proc_doorbells; + + /* Allocate bitmap for dynamic doorbell allocation */ + proc_doorbells->doorbell_bitmap = bitmap_zalloc(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, + GFP_KERNEL); + if (!proc_doorbells->doorbell_bitmap) { + DRM_ERROR("Failed to allocate process doorbell bitmap\n"); + return -ENOMEM; + } - if (r < 0) - pr_err("Failed to allocate process doorbells\n"); + /* Allocate doorbells for this process from the PCI BAR */ + proc_doorbells->size = 
kfd_doorbell_process_slice(kfd); + r = amdgpu_doorbell_alloc_page(kfd->adev, proc_doorbells);

Same thing here as the previous patch. Just call amdgpu_bo_create_kernel(..DOORBELL..) and store the bo in the process structure.

Alex
Got it.

- Shashank
+ if (r) { + DRM_ERROR("Failed to allocate process doorbells\n"); + return r; + } return r; } -void kfd_free_process_doorbells(struct kfd_dev *kfd, unsigned int doorbell_index) +void kfd_free_process_doorbells(struct kfd_dev *kfd, struct kfd_process_device *pdd) { - if (doorbell_index) { - if (!kfd->shared_resources.enable_mes) - ida_simple_remove(&kfd->doorbell_ida, doorbell_index); - else - amdgpu_mes_free_process_doorbells( - (struct amdgpu_device *)kfd->adev, - doorbell_index); - } + struct amdgpu_doorbell_obj *proc_doorbells = &pdd->qpd.proc_doorbells; + + bitmap_free(proc_doorbells->doorbell_bitmap); + amdgpu_doorbell_free_page(kfd->adev, proc_doorbells); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 0ed33416c35f..c97ed8e7e02d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -658,8 +658,8 @@ struct qcm_process_device { uint64_t ib_base; void *ib_kaddr; - /* doorbell resources per process per device */ - unsigned long *doorbell_bitmap; + /* physical doorbell pages */ + struct amdgpu_doorbell_obj proc_doorbells; }; /* KFD Memory Eviction */ @@ -1006,9 +1006,9 @@ unsigned int kfd_get_doorbell_dw_offset_in_bar(struct kfd_dev *kfd, unsigned int doorbell_id); phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd); int kfd_alloc_process_doorbells(struct kfd_dev *kfd, - unsigned int *doorbell_index); + struct kfd_process_device *pdd); void kfd_free_process_doorbells(struct kfd_dev *kfd, - unsigned int doorbell_index); + struct kfd_process_device *pdd); /* GTT Sub-Allocator */ int kfd_gtt_sa_allocate(struct kfd_dev *kfd, unsigned int size, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 51b1683ac5c1..68d0310c2d53 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1037,10 +1037,9 @@ static void kfd_process_destroy_pdds(struct kfd_process *p) 
free_pages((unsigned long)pdd->qpd.cwsr_kaddr, get_order(KFD_CWSR_TBA_TMA_SIZE)); - bitmap_free(pdd->qpd.doorbell_bitmap); idr_destroy(&pdd->alloc_idr); - kfd_free_process_doorbells(pdd->dev, pdd->doorbell_index); + kfd_free_process_doorbells(pdd->dev, pdd); if (pdd->dev->shared_resources.enable_mes) amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev, @@ -1449,15 +1448,11 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd, unsigned int i; int range_start = dev->shared_resources.non_cp_doorbells_start; int range_end = dev->shared_resources.non_cp_doorbells_end; + struct amdgpu_doorbell_obj *proc_doorbells = &qpd->proc_doorbells; if (!KFD_IS_SOC15(dev)) return 0; - qpd->doorbell_bitmap = bitmap_zalloc(KFD_MAX_NUM_OF_QUEUES_PER_PROCESS, - GFP_KERNEL); - if (!qpd->doorbell_bitmap) - return -ENOMEM; - /* Mask out doorbells reserved for SDMA, IH, and VCN on SOC15. */ pr_debug("reserved doorbell 0x%03x - 0x%03x\n", range_start, range_end); pr_debug("reserved doorbell 0x%03x - 0x%03x\n", @@ -1466,9 +1461,9 @@ static int init_doorbell_bitmap(struct qcm_process_device *qpd, for (i = 0; i < KFD_MAX_NUM_OF_QUEUES_PER_PROCESS / 2; i++) { if (i >= range_start && i <= range_end) { - __set_bit(i, qpd->doorbell_bitmap); + __set_bit(i, proc_doorbells->doorbell_bitmap); __set_bit(i + KFD_QUEUE_DOORBELL_MIRROR_OFFSET, - qpd->doorbell_bitmap); + proc_doorbells->doorbell_bitmap); } } @@ -1499,9 +1494,15 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, if (!pdd) return NULL; + retval = kfd_alloc_process_doorbells(dev, pdd); + if (retval) { + pr_err("failed to allocate process doorbells\n"); + goto err_free_pdd; + } + if (init_doorbell_bitmap(&pdd->qpd, dev)) { pr_err("Failed to init doorbell for process\n"); - goto err_free_pdd; + goto err_free_db; } pdd->dev = dev; @@ -1529,7 +1530,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, false); if (retval) { pr_err("failed to allocate process context bo\n"); - goto 
err_free_pdd; + goto err_free_db; } memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE); } @@ -1541,6 +1542,9 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, return pdd; +err_free_db: + kfd_free_process_doorbells(pdd->dev, pdd); + err_free_pdd: kfree(pdd); return NULL; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 5137476ec18e..693688d789d3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -348,13 +348,11 @@ int pqm_create_queue(struct process_queue_manager *pqm, /* Return the doorbell offset within the doorbell page * to the caller so it can be passed up to user mode * (in bytes). - * There are always 1024 doorbells per process, so in case - * of 8-byte doorbells, there are two doorbell pages per - * process. + * relative doorbell index = Absolute doorbell index - + * absolute index of first doorbell in the page. */ - *p_doorbell_offset_in_process = - (q->properties.doorbell_off * sizeof(uint32_t)) & - (kfd_doorbell_process_slice(dev) - 1); + *p_doorbell_offset_in_process = (q->properties.doorbell_off + - pdd->qpd.proc_doorbells.start) * sizeof(uint32_t); pr_debug("PQM After DQM create queue\n"); @@ -858,12 +856,6 @@ int kfd_criu_restore_queue(struct kfd_process *p, goto exit; } - if (!pdd->doorbell_index && - kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) { - ret = -ENOMEM; - goto exit; - } - /* data stored in this order: mqd, ctl_stack */ mqd = q_extra_data; ctl_stack = mqd + q_data->mqd_size; -- 2.40.0