[PATCHv3 2/2] drm/amdkfd: Fix CU occupancy for GFX 9.4.3

Mukul Joshi <mukul.joshi@xxxxxxx> · Mon, 23 Sep 2024 13:40:46 -0400

Make CU occupancy calculations work on GFX 9.4.3 by
updating the logic to handle multiple XCCs correctly.

Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx>
Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@xxxxxxx> (v2)
---
v1->v2:
- Break into 2 patches, one for the generic change
  and the other for GFX v9.4.3.
- Incorporate Harish's comments.

v2->v3:
- Update code to handle both PM4 and AQL queues.

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 12 +++++------
 .../drm/amd/amdkfd/kfd_device_queue_manager.c |  6 +++++-
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 20 ++++++++++++++++---
 4 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 26b1f37c316e..3bc0cbf45bc5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -963,14 +963,14 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
 	 */
 	pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
 	queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
-	soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
-	reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst,
+	soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst));
+	reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst),
 				  mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
 	wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
 	if (wave_cnt != 0) {
 		queue_cnt->wave_cnt += wave_cnt;
 		queue_cnt->doorbell_off =
-			(RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) &
+			(RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL) &
 			 CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
 			 CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
 	}
@@ -1033,7 +1033,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
 	DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);
 
 	lock_spi_csq_mutexes(adev);
-	soc15_grbm_select(adev, 1, 0, 0, 0, inst);
+	soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst));
 
 	/*
 	 * Iterate through the shader engines and arrays of the device
@@ -1046,7 +1046,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
 	se_cnt = adev->gfx.config.max_shader_engines;
 	for (se_idx = 0; se_idx < se_cnt; se_idx++) {
 		amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
-		queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
+		queue_map = RREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_CSQ_WF_ACTIVE_STATUS);
 
 		/*
 		 * Assumption: queue map encodes following schema: four
@@ -1071,7 +1071,7 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
 	}
 
 	amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst);
-	soc15_grbm_select(adev, 0, 0, 0, 0, inst);
+	soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst));
 	unlock_spi_csq_mutexes(adev);
 
 	/* Update the output parameters and return */
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 29578550b478..648f40091aa3 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3542,15 +3542,19 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
 
 bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
 				 struct qcm_process_device *qpd,
-				 int doorbell_off)
+				 int doorbell_off, u32 *queue_format)
 {
 	struct queue *q;
 	bool r = false;
 
+	if (!queue_format)
+		return r;
+
 	dqm_lock(dqm);
 
 	list_for_each_entry(q, &qpd->queues_list, list) {
 		if (q->properties.doorbell_off == doorbell_off) {
+			*queue_format = q->properties.format;
 			r = true;
 			goto out;
 		}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 80be2036abea..09ab36f8e8c6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -326,7 +326,7 @@ int debug_map_and_unlock(struct device_queue_manager *dqm);
 int debug_refresh_runlist(struct device_queue_manager *dqm);
 bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
 				 struct qcm_process_device *qpd,
-				 int doorbell_off);
+				 int doorbell_off, u32 *queue_format);
 
 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d73841268c9b..d07acf1b2f93 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -272,6 +272,7 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
 	struct kfd_process_device *pdd = NULL;
 	int i;
 	struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES];
+	u32 queue_format;
 
 	memset(cu_occupancy, 0x0, sizeof(cu_occupancy));
 
@@ -292,14 +293,27 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
 	wave_cnt = 0;
 	max_waves_per_cu = 0;
 
+	/*
+	 * For GFX 9.4.3, fetch the CU occupancy from the first XCC in the partition.
+	 * For AQL queues, because of cooperative dispatch we multiply the wave count
+	 * by number of XCCs in the partition to get the total wave counts across all
+	 * XCCs in the partition.
+	 * For PM4 queues, there is no cooperative dispatch so wave_cnt stay as it is.
+	 */
 	dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
-			&max_waves_per_cu, 0);
+			&max_waves_per_cu, ffs(dev->xcc_mask) - 1);
 
 	for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
 		if (cu_occupancy[i].wave_cnt != 0 &&
 		    kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd,
-						cu_occupancy[i].doorbell_off))
-			wave_cnt += cu_occupancy[i].wave_cnt;
+						cu_occupancy[i].doorbell_off,
+						&queue_format)) {
+			if (unlikely(queue_format == KFD_QUEUE_FORMAT_PM4))
+				wave_cnt += cu_occupancy[i].wave_cnt;
+			else
+				wave_cnt += (NUM_XCC(dev->xcc_mask) *
+						cu_occupancy[i].wave_cnt);
+		}
 	}
 
 	/* Translate wave count to number of compute units */
-- 
2.35.1