Make amdgpu the owner of all per-pipe state of the HQDs. This change will allow us to split the queues between kfd and amdgpu with a queue granularity instead of pipe granularity. This patch fixes kfd allocating an HDP_EOP region for its 3 pipes which goes unused. v2: support for gfx9 Reviewed-by: Edward O'Callaghan <funfunctor at folklore1984.net> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com> Acked-by: Christian König <christian.koenig at amd.com> Signed-off-by: Andres Rodriguez <andresx7 at gmail.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 4 +- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c | 13 +------ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c | 1 + drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 28 ++++++++++---- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 33 +++++++++++----- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 24 ++++++++---- .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 45 ---------------------- 7 files changed, 65 insertions(+), 83 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 3abd2dc..6b294d2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -880,43 +880,43 @@ struct amdgpu_rlc { /* for firmware data */ u32 save_and_restore_offset; u32 clear_state_descriptor_offset; u32 avail_scratch_ram_locations; u32 reg_restore_list_size; u32 reg_list_format_start; u32 reg_list_format_separate_start; u32 starting_offsets_start; u32 reg_list_format_size_bytes; u32 reg_list_size_bytes; u32 *register_list_format; u32 *register_restore; }; struct amdgpu_mec { struct amdgpu_bo *hpd_eop_obj; u64 hpd_eop_gpu_addr; struct amdgpu_bo *mec_fw_obj; u64 mec_fw_gpu_addr; - u32 num_pipe; u32 num_mec; - u32 num_queue; + u32 num_pipe_per_mec; + u32 num_queue_per_pipe; void *mqd_backup[AMDGPU_MAX_COMPUTE_RINGS + 1]; }; struct amdgpu_kiq { u64 eop_gpu_addr; struct amdgpu_bo *eop_obj; struct amdgpu_ring ring; struct amdgpu_irq_src irq; }; /* * GPU scratch registers structures, functions & helpers */ struct amdgpu_scratch { unsigned num_reg; uint32_t reg_base; uint32_t free_mask; }; /* diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c index 038b7ea..910f9d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c @@ -227,52 +227,41 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, * SW cleared it. So the protocol is to always wait & clear. */ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK; WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping); while (!(RREG32(mmATC_VMID_PASID_MAPPING_UPDATE_STATUS) & (1U << vmid))) cpu_relax(); WREG32(mmATC_VMID_PASID_MAPPING_UPDATE_STATUS, 1U << vmid); /* Mapping vmid to pasid also for IH block */ WREG32(mmIH_VMID_0_LUT + vmid, pasid_mapping); return 0; } static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, uint32_t hpd_size, uint64_t hpd_gpu_addr) { - struct amdgpu_device *adev = get_amdgpu_device(kgd); - - uint32_t mec = (++pipe_id / CIK_PIPE_PER_MEC) + 1; - uint32_t pipe = (pipe_id % CIK_PIPE_PER_MEC); - - lock_srbm(kgd, mec, pipe, 0, 0); - WREG32(mmCP_HPD_EOP_BASE_ADDR, lower_32_bits(hpd_gpu_addr >> 8)); - WREG32(mmCP_HPD_EOP_BASE_ADDR_HI, upper_32_bits(hpd_gpu_addr >> 8)); - WREG32(mmCP_HPD_EOP_VMID, 0); - WREG32(mmCP_HPD_EOP_CONTROL, hpd_size); - unlock_srbm(kgd); - + /* amdgpu owns the per-pipe state */ return 0; } static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) { struct amdgpu_device *adev = get_amdgpu_device(kgd); uint32_t mec; uint32_t pipe; mec = (pipe_id / CIK_PIPE_PER_MEC) + 1; pipe = (pipe_id % CIK_PIPE_PER_MEC); lock_srbm(kgd, mec, pipe, 0, 0); WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK | CP_INT_CNTL_RING0__OPCODE_ERROR_INT_ENABLE_MASK); unlock_srbm(kgd); return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c index 8af2975..6ba94e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c @@ -189,40 +189,41 @@ static int kgd_set_pasid_vmid_mapping(struct kgd_dev *kgd, unsigned int pasid, * So the protocol is to always wait & clear. */ uint32_t pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | ATC_VMID0_PASID_MAPPING__VALID_MASK; WREG32(mmATC_VMID0_PASID_MAPPING + vmid, pasid_mapping); while (!(RREG32(mmATC_VMID_PASID_MAPPING_UPDATE_STATUS) & (1U << vmid))) cpu_relax(); WREG32(mmATC_VMID_PASID_MAPPING_UPDATE_STATUS, 1U << vmid); /* Mapping vmid to pasid also for IH block */ WREG32(mmIH_VMID_0_LUT + vmid, pasid_mapping); return 0; } static int kgd_init_pipeline(struct kgd_dev *kgd, uint32_t pipe_id, uint32_t hpd_size, uint64_t hpd_gpu_addr) { + /* amdgpu owns the per-pipe state */ return 0; } static int kgd_init_interrupts(struct kgd_dev *kgd, uint32_t pipe_id) { struct amdgpu_device *adev = get_amdgpu_device(kgd); uint32_t mec; uint32_t pipe; mec = (++pipe_id / VI_PIPE_PER_MEC) + 1; pipe = (pipe_id % VI_PIPE_PER_MEC); lock_srbm(kgd, mec, pipe, 0, 0); WREG32(mmCPC_INT_CNTL, CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK); unlock_srbm(kgd); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index 75ad957..41bda98 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -2810,84 +2810,98 @@ static void gfx_v7_0_cp_compute_fini(struct amdgpu_device *adev) static void gfx_v7_0_mec_fini(struct amdgpu_device *adev) { int r; if (adev->gfx.mec.hpd_eop_obj) { r = amdgpu_bo_reserve(adev->gfx.mec.hpd_eop_obj, false); if (unlikely(r != 0)) dev_warn(adev->dev, "(%d) reserve HPD EOP bo failed\n", r); amdgpu_bo_unpin(adev->gfx.mec.hpd_eop_obj); amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj); amdgpu_bo_unref(&adev->gfx.mec.hpd_eop_obj); adev->gfx.mec.hpd_eop_obj = NULL; } } static int gfx_v7_0_mec_init(struct amdgpu_device *adev) { int r; u32 *hpd; + size_t mec_hpd_size; /* * KV: 2 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 64 Queues total * CI/KB: 1 MEC, 4 Pipes/MEC, 8 Queues/Pipe - 32 Queues total * Nonetheless, we assign only 1 pipe because all other pipes will * be handled by KFD */ - adev->gfx.mec.num_mec = 1; - adev->gfx.mec.num_pipe = 1; - adev->gfx.mec.num_queue = adev->gfx.mec.num_mec * adev->gfx.mec.num_pipe * 8; + switch (adev->asic_type) { + case CHIP_KAVERI: + adev->gfx.mec.num_mec = 2; + break; + case CHIP_BONAIRE: + case CHIP_HAWAII: + case CHIP_KABINI: + case CHIP_MULLINS: + default: + adev->gfx.mec.num_mec = 1; + break; + } + adev->gfx.mec.num_pipe_per_mec = 4; + adev->gfx.mec.num_queue_per_pipe = 8; + mec_hpd_size = adev->gfx.mec.num_mec * adev->gfx.mec.num_pipe_per_mec + * GFX7_MEC_HPD_SIZE * 2; if (adev->gfx.mec.hpd_eop_obj == NULL) { r = amdgpu_bo_create(adev, - adev->gfx.mec.num_mec * adev->gfx.mec.num_pipe * GFX7_MEC_HPD_SIZE * 2, + mec_hpd_size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL, &adev->gfx.mec.hpd_eop_obj); if (r) { dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r); return r; } } r = amdgpu_bo_reserve(adev->gfx.mec.hpd_eop_obj, false); if (unlikely(r != 0)) { gfx_v7_0_mec_fini(adev); return r; } r = amdgpu_bo_pin(adev->gfx.mec.hpd_eop_obj, AMDGPU_GEM_DOMAIN_GTT, &adev->gfx.mec.hpd_eop_gpu_addr); if (r) { dev_warn(adev->dev, "(%d) pin HDP EOP bo failed\n", r); gfx_v7_0_mec_fini(adev); return r; } r = amdgpu_bo_kmap(adev->gfx.mec.hpd_eop_obj, (void **)&hpd); if (r) { dev_warn(adev->dev, "(%d) map HDP EOP bo failed\n", r); gfx_v7_0_mec_fini(adev); return r; } /* clear memory. Not sure if this is required or not */ - memset(hpd, 0, adev->gfx.mec.num_mec * adev->gfx.mec.num_pipe * GFX7_MEC_HPD_SIZE * 2); + memset(hpd, 0, mec_hpd_size); amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj); amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj); return 0; } struct hqd_registers { u32 cp_mqd_base_addr; u32 cp_mqd_base_addr_hi; u32 cp_hqd_active; u32 cp_hqd_vmid; u32 cp_hqd_persistent_state; u32 cp_hqd_pipe_priority; u32 cp_hqd_queue_priority; u32 cp_hqd_quantum; u32 cp_hqd_pq_base; u32 cp_hqd_pq_base_hi; u32 cp_hqd_pq_rptr; @@ -3191,43 +3205,43 @@ static int gfx_v7_0_compute_queue_init(struct amdgpu_device *adev, int ring_id) /** * gfx_v7_0_cp_compute_resume - setup the compute queue registers * * @adev: amdgpu_device pointer * * Program the compute queues and test them to make sure they * are working. * Returns 0 for success, error for failure. */ static int gfx_v7_0_cp_compute_resume(struct amdgpu_device *adev) { int r, i, j; u32 tmp; struct amdgpu_ring *ring; /* fix up chicken bits */ tmp = RREG32(mmCP_CPF_DEBUG); tmp |= (1 << 23); WREG32(mmCP_CPF_DEBUG, tmp); - /* init the pipes */ + /* init all pipes (even the ones we don't own) */ for (i = 0; i < adev->gfx.mec.num_mec; i++) - for (j = 0; j < adev->gfx.mec.num_pipe; j++) + for (j = 0; j < adev->gfx.mec.num_pipe_per_mec; j++) gfx_v7_0_compute_pipe_init(adev, i, j); /* init the queues */ for (i = 0; i < adev->gfx.num_compute_rings; i++) { r = gfx_v7_0_compute_queue_init(adev, i); if (r) { gfx_v7_0_cp_compute_fini(adev); return r; } } gfx_v7_0_cp_compute_enable(adev, true); for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = &adev->gfx.compute_ring[i]; ring->ready = true; r = amdgpu_ring_test_ring(ring); if (r) ring->ready = false; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 938b280..29c487e 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -1401,81 +1401,96 @@ static int gfx_v8_0_kiq_init_ring(struct amdgpu_device *adev, ring->eop_gpu_addr = kiq->eop_gpu_addr; sprintf(ring->name, "kiq %d.%d.%d", ring->me, ring->pipe, ring->queue); r = amdgpu_ring_init(adev, ring, 1024, irq, AMDGPU_CP_KIQ_IRQ_DRIVER0); if (r) dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r); return r; } static void gfx_v8_0_kiq_free_ring(struct amdgpu_ring *ring, struct amdgpu_irq_src *irq) { amdgpu_wb_free(ring->adev, ring->adev->virt.reg_val_offs); amdgpu_ring_fini(ring); } static int gfx_v8_0_mec_init(struct amdgpu_device *adev) { int r; u32 *hpd; + size_t mec_hpd_size; - /* - * we assign only 1 pipe because all other pipes will - * be handled by KFD - */ - adev->gfx.mec.num_mec = 1; - adev->gfx.mec.num_pipe = 1; - adev->gfx.mec.num_queue = adev->gfx.mec.num_mec * adev->gfx.mec.num_pipe * 8; + switch (adev->asic_type) { + case CHIP_FIJI: + case CHIP_TONGA: + case CHIP_POLARIS11: + case CHIP_POLARIS12: + case CHIP_POLARIS10: + case CHIP_CARRIZO: + adev->gfx.mec.num_mec = 2; + break; + case CHIP_TOPAZ: + case CHIP_STONEY: + default: + adev->gfx.mec.num_mec = 1; + break; + } + + adev->gfx.mec.num_pipe_per_mec = 4; + adev->gfx.mec.num_queue_per_pipe = 8; + + /* only 1 pipe of the first MEC is owned by amdgpu */ + mec_hpd_size = 1 * 1 * adev->gfx.mec.num_queue_per_pipe * GFX8_MEC_HPD_SIZE; if (adev->gfx.mec.hpd_eop_obj == NULL) { r = amdgpu_bo_create(adev, - adev->gfx.mec.num_queue * GFX8_MEC_HPD_SIZE, + mec_hpd_size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL, &adev->gfx.mec.hpd_eop_obj); if (r) { dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r); return r; } } r = amdgpu_bo_reserve(adev->gfx.mec.hpd_eop_obj, false); if (unlikely(r != 0)) { gfx_v8_0_mec_fini(adev); return r; } r = amdgpu_bo_pin(adev->gfx.mec.hpd_eop_obj, AMDGPU_GEM_DOMAIN_GTT, &adev->gfx.mec.hpd_eop_gpu_addr); if (r) { dev_warn(adev->dev, "(%d) pin HDP EOP bo failed\n", r); gfx_v8_0_mec_fini(adev); return r; } r = amdgpu_bo_kmap(adev->gfx.mec.hpd_eop_obj, (void **)&hpd); if (r) { dev_warn(adev->dev, "(%d) map HDP EOP bo failed\n", r); gfx_v8_0_mec_fini(adev); return r; } - memset(hpd, 0, adev->gfx.mec.num_queue * GFX8_MEC_HPD_SIZE); + memset(hpd, 0, mec_hpd_size); amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj); amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj); return 0; } static void gfx_v8_0_kiq_fini(struct amdgpu_device *adev) { struct amdgpu_kiq *kiq = &adev->gfx.kiq; amdgpu_bo_free_kernel(&kiq->eop_obj, &kiq->eop_gpu_addr, NULL); } static int gfx_v8_0_kiq_init(struct amdgpu_device *adev) { int r; u32 *hpd; struct amdgpu_kiq *kiq = &adev->gfx.kiq; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 0de7ff4..d100b15 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -465,54 +465,62 @@ static void gfx_v9_0_mec_fini(struct amdgpu_device *adev) } if (adev->gfx.mec.mec_fw_obj) { r = amdgpu_bo_reserve(adev->gfx.mec.mec_fw_obj, false); if (unlikely(r != 0)) dev_warn(adev->dev, "(%d) reserve mec firmware bo failed\n", r); amdgpu_bo_unpin(adev->gfx.mec.mec_fw_obj); amdgpu_bo_unreserve(adev->gfx.mec.mec_fw_obj); amdgpu_bo_unref(&adev->gfx.mec.mec_fw_obj); adev->gfx.mec.mec_fw_obj = NULL; } } static int gfx_v9_0_mec_init(struct amdgpu_device *adev) { int r; u32 *hpd; const __le32 *fw_data; unsigned fw_size; u32 *fw; + size_t mec_hpd_size; const struct gfx_firmware_header_v1_0 *mec_hdr; - /* - * we assign only 1 pipe because all other pipes will - * be handled by KFD - */ - adev->gfx.mec.num_mec = 1; - adev->gfx.mec.num_pipe = 1; - adev->gfx.mec.num_queue = adev->gfx.mec.num_mec * adev->gfx.mec.num_pipe * 8; + switch (adev->asic_type) { + case CHIP_VEGA10: + adev->gfx.mec.num_mec = 2; + break; + default: + adev->gfx.mec.num_mec = 1; + break; + } + + adev->gfx.mec.num_pipe_per_mec = 4; + adev->gfx.mec.num_queue_per_pipe = 8; + + /* only 1 pipe of the first MEC is owned by amdgpu */ + mec_hpd_size = 1 * 1 * adev->gfx.mec.num_queue_per_pipe * GFX9_MEC_HPD_SIZE; if (adev->gfx.mec.hpd_eop_obj == NULL) { r = amdgpu_bo_create(adev, - adev->gfx.mec.num_queue * GFX9_MEC_HPD_SIZE, + mec_hpd_size, PAGE_SIZE, true, AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL, &adev->gfx.mec.hpd_eop_obj); if (r) { dev_warn(adev->dev, "(%d) create HDP EOP bo failed\n", r); return r; } } r = amdgpu_bo_reserve(adev->gfx.mec.hpd_eop_obj, false); if (unlikely(r != 0)) { gfx_v9_0_mec_fini(adev); return r; } r = amdgpu_bo_pin(adev->gfx.mec.hpd_eop_obj, AMDGPU_GEM_DOMAIN_GTT, &adev->gfx.mec.hpd_eop_gpu_addr); if (r) { dev_warn(adev->dev, "(%d) pin HDP EOP bo failed\n", r); gfx_v9_0_mec_fini(adev); return r; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index f49c551..c064dea 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -455,89 +455,44 @@ static int unregister_process_nocpsch(struct device_queue_manager *dqm, } static int set_pasid_vmid_mapping(struct device_queue_manager *dqm, unsigned int pasid, unsigned int vmid) { uint32_t pasid_mapping; pasid_mapping = (pasid == 0) ? 0 : (uint32_t)pasid | ATC_VMID_PASID_MAPPING_VALID; return dqm->dev->kfd2kgd->set_pasid_vmid_mapping( dqm->dev->kgd, pasid_mapping, vmid); } int init_pipelines(struct device_queue_manager *dqm, unsigned int pipes_num, unsigned int first_pipe) { - void *hpdptr; - struct mqd_manager *mqd; - unsigned int i, err, inx; - uint64_t pipe_hpd_addr; - BUG_ON(!dqm || !dqm->dev); pr_debug("kfd: In func %s\n", __func__); - /* - * Allocate memory for the HPDs. This is hardware-owned per-pipe data. - * The driver never accesses this memory after zeroing it. - * It doesn't even have to be saved/restored on suspend/resume - * because it contains no data when there are no active queues. - */ - - err = kfd_gtt_sa_allocate(dqm->dev, CIK_HPD_EOP_BYTES * pipes_num, - &dqm->pipeline_mem); - - if (err) { - pr_err("kfd: error allocate vidmem num pipes: %d\n", - pipes_num); - return -ENOMEM; - } - - hpdptr = dqm->pipeline_mem->cpu_ptr; - dqm->pipelines_addr = dqm->pipeline_mem->gpu_addr; - - memset(hpdptr, 0, CIK_HPD_EOP_BYTES * pipes_num); - - mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE); - if (mqd == NULL) { - kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem); - return -ENOMEM; - } - - for (i = 0; i < pipes_num; i++) { - inx = i + first_pipe; - /* - * HPD buffer on GTT is allocated by amdkfd, no need to waste - * space in GTT for pipelines we don't initialize - */ - pipe_hpd_addr = dqm->pipelines_addr + i * CIK_HPD_EOP_BYTES; - pr_debug("kfd: pipeline address %llX\n", pipe_hpd_addr); - /* = log2(bytes/4)-1 */ - dqm->dev->kfd2kgd->init_pipeline(dqm->dev->kgd, inx, - CIK_HPD_EOP_BYTES_LOG2 - 3, pipe_hpd_addr); - } - return 0; } static void init_interrupts(struct device_queue_manager *dqm) { unsigned int i; BUG_ON(dqm == NULL); for (i = 0 ; i < get_pipes_num(dqm) ; i++) dqm->dev->kfd2kgd->init_interrupts(dqm->dev->kgd, i + get_first_pipe(dqm)); } static int init_scheduler(struct device_queue_manager *dqm) { int retval; BUG_ON(!dqm); -- 2.9.3