Make CU occupancy calculations work on GFX 9.4.3 by updating the logic to handle multiple XCCs correctly. Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx> Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@xxxxxxx> (v2) --- v1->v2: - Break into 2 patches, one for the generic change and the other for GFX v9.4.3. - Incorporate Harish's comments. v2->v3: - Update code to handle both PM4 and AQL queues. .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 12 +++++------ .../drm/amd/amdkfd/kfd_device_queue_manager.c | 6 +++++- .../drm/amd/amdkfd/kfd_device_queue_manager.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 20 ++++++++++++++++--- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index 26b1f37c316e..3bc0cbf45bc5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -963,14 +963,14 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx, */ pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe; queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe; - soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst); - reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, + soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst)); + reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst), mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot); wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; if (wave_cnt != 0) { queue_cnt->wave_cnt += wave_cnt; queue_cnt->doorbell_off = - (RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) & + (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL) & CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >> CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; } @@ -1033,7 +1033,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES); lock_spi_csq_mutexes(adev); - soc15_grbm_select(adev, 1, 0, 0, 0, inst); + soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst)); /* * Iterate through the shader engines and arrays of the device @@ -1046,7 +1046,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, se_cnt = adev->gfx.config.max_shader_engines; for (se_idx = 0; se_idx < se_cnt; se_idx++) { amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst); - queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS); + queue_map = RREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_CSQ_WF_ACTIVE_STATUS); /* * Assumption: queue map encodes following schema: four @@ -1071,7 +1071,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, } amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst); - soc15_grbm_select(adev, 0, 0, 0, 0, inst); + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst)); unlock_spi_csq_mutexes(adev); /* Update the output parameters and return */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 29578550b478..648f40091aa3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -3542,15 +3542,19 @@ int debug_refresh_runlist(struct device_queue_manager *dqm) bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, struct qcm_process_device *qpd, - int doorbell_off) + int doorbell_off, u32 *queue_format) { struct queue *q; bool r = false; + if (!queue_format) + return r; + dqm_lock(dqm); list_for_each_entry(q, &qpd->queues_list, list) { if (q->properties.doorbell_off == doorbell_off) { + *queue_format = q->properties.format; r = true; goto out; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 80be2036abea..09ab36f8e8c6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -326,7 +326,7 @@ int debug_map_and_unlock(struct device_queue_manager *dqm); int debug_refresh_runlist(struct device_queue_manager *dqm); bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm, struct qcm_process_device *qpd, - int doorbell_off); + int doorbell_off, u32 *queue_format); static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d73841268c9b..d07acf1b2f93 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -272,6 +272,7 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) struct kfd_process_device *pdd = NULL; int i; struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES]; + u32 queue_format; memset(cu_occupancy, 0x0, sizeof(cu_occupancy)); @@ -292,14 +293,27 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer) wave_cnt = 0; max_waves_per_cu = 0; + /* + * For GFX 9.4.3, fetch the CU occupancy from the first XCC in the partition. + * For AQL queues, because of cooperative dispatch we multiply the wave count + * by number of XCCs in the partition to get the total wave counts across all + * XCCs in the partition. + * For PM4 queues, there is no cooperative dispatch so wave_cnt stay as it is. + */ dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy, - &max_waves_per_cu, 0); + &max_waves_per_cu, ffs(dev->xcc_mask) - 1); for (i = 0; i < AMDGPU_MAX_QUEUES; i++) { if (cu_occupancy[i].wave_cnt != 0 && kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd, - cu_occupancy[i].doorbell_off)) - wave_cnt += cu_occupancy[i].wave_cnt; + cu_occupancy[i].doorbell_off, + &queue_format)) { + if (unlikely(queue_format == KFD_QUEUE_FORMAT_PM4)) + wave_cnt += cu_occupancy[i].wave_cnt; + else + wave_cnt += (NUM_XCC(dev->xcc_mask) * + cu_occupancy[i].wave_cnt); + } } /* Translate wave count to number of compute units */ -- 2.35.1