RE: [PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3

"Kasiviswanathan, Harish" <Harish.Kasiviswanathan@xxxxxxx> · Thu, 19 Sep 2024 19:56:09 +0000

[AMD Official Use Only - AMD Internal Distribution Only]

This looks like a more robust way to get this information. Few comments inline.

- It might be better to break this into two commits. One specific to 9.4.3 and other to use doorbell instead of LUT register. The second change affects all gfx9 ASICs.

-----Original Message-----
From: Joshi, Mukul <Mukul.Joshi@xxxxxxx>
Sent: Thursday, September 19, 2024 1:43 PM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan@xxxxxxx>; Joshi, Mukul <Mukul.Joshi@xxxxxxx>
Subject: [PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3

Make CU occupancy work on GFX 9.4.3 by updating the code
to handle multiple XCCs correctly. Additionally, update
the algorithm to calculate CU occupancy by matching
dorrbell offset of the queue against the process's queues,
instead of using the IH_VMID_X_LUT registers since that can
be racy with CP updating the VMID-PASID mapping.

Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx>
---
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 92 ++++++++-----------
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  5 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 ++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 18 +++-
 .../gpu/drm/amd/include/kgd_kfd_interface.h   | 10 +-
 6 files changed, 88 insertions(+), 60 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 1254a43ec96b..b8c01257b101 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
  * @inst: xcc's instance number on a multi-XCC setup
  */
 static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
-               int *wave_cnt, int *vmid, uint32_t inst)
+               struct kfd_cu_occupancy *queue_cnt, uint32_t inst)
 {
        int pipe_idx;
        int queue_slot;
        unsigned int reg_val;
-
+       unsigned int wave_cnt;
        /*
         * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
         * parameters to read out waves in flight. Get VMID if there are
         * non-zero waves in flight.
         */
-       *vmid = 0xFF;
-       *wave_cnt = 0;
        pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
        queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
-       soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
-       reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
-                        queue_slot);
-       *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
-       if (*wave_cnt != 0)
-               *vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) &
-                        CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
+       soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst));
+       reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, inst),
+                                 mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
+       wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
+       if (wave_cnt != 0) {

Why is it queue_cnt->wave_cnt accumulating instead of direct assignment. There is some inefficiency here. The code is iterating over all queues two times once in kfd_get_cu_occupancy and once more in kgd_gfx_v9_get_cu_occupancy(). Is that strictly required?


+               queue_cnt->wave_cnt += wave_cnt;
+               queue_cnt->doorbell_off =
+                       (RREG32_SOC15(GC, GET_INST(GC, inst), mmCP_HQD_PQ_DOORBELL_CONTROL) &
+                        CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
+                        CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
+       }
 }

 /**
@@ -1021,24 +1023,19 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
  *
  *  Reading registers referenced above involves programming GRBM appropriately
  */
-void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
-               int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst)
+void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
+                                struct kfd_cu_occupancy *cu_occupancy,
+                                int *max_waves_per_cu, uint32_t inst)
 {
        int qidx;
-       int vmid;
        int se_idx;
-       int sh_idx;
        int se_cnt;
-       int sh_cnt;
-       int wave_cnt;
        int queue_map;
-       int pasid_tmp;
        int max_queue_cnt;
-       int vmid_wave_cnt = 0;
        DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);

        lock_spi_csq_mutexes(adev);
-       soc15_grbm_select(adev, 1, 0, 0, 0, inst);
+       soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst));

        /*
         * Iterate through the shader engines and arrays of the device
@@ -1048,51 +1045,38 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
                          AMDGPU_MAX_QUEUES);
        max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
                        adev->gfx.mec.num_queue_per_pipe;
-       sh_cnt = adev->gfx.config.max_sh_per_se;
        se_cnt = adev->gfx.config.max_shader_engines;
        for (se_idx = 0; se_idx < se_cnt; se_idx++) {
-               for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {
+               amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
+               queue_map = RREG32_SOC15(GC, GET_INST(GC, inst), mmSPI_CSQ_WF_ACTIVE_STATUS);
+
+               /*
+                * Assumption: queue map encodes following schema: four
+                * pipes per each micro-engine, with each pipe mapping
+                * eight queues. This schema is true for GFX9 devices
+                * and must be verified for newer device families
+                */
+               for (qidx = 0; qidx < max_queue_cnt; qidx++) {
+                       /* Skip qeueus that are not associated with
+                        * compute functions
+                        */
+                       if (!test_bit(qidx, cp_queue_bitmap))
+                               continue;

-                       amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst);
-                       queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
+                       if (!(queue_map & (1 << qidx)))
+                               continue;

-                       /*
-                        * Assumption: queue map encodes following schema: four
-                        * pipes per each micro-engine, with each pipe mapping
-                        * eight queues. This schema is true for GFX9 devices
-                        * and must be verified for newer device families
-                        */
-                       for (qidx = 0; qidx < max_queue_cnt; qidx++) {
-
-                               /* Skip qeueus that are not associated with
-                                * compute functions
-                                */
-                               if (!test_bit(qidx, cp_queue_bitmap))
-                                       continue;
-
-                               if (!(queue_map & (1 << qidx)))
-                                       continue;
-
-                               /* Get number of waves in flight and aggregate them */
-                               get_wave_count(adev, qidx, &wave_cnt, &vmid,
-                                               inst);
-                               if (wave_cnt != 0) {
-                                       pasid_tmp =
-                                         RREG32(SOC15_REG_OFFSET(OSSSYS, inst,
-                                                mmIH_VMID_0_LUT) + vmid);
-                                       if (pasid_tmp == pasid)
-                                               vmid_wave_cnt += wave_cnt;
-                               }
-                       }
+                       /* Get number of waves in flight and aggregate them */
+                       get_wave_count(adev, qidx, &cu_occupancy[qidx],
+                                       inst);
                }
        }

        amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst);
-       soc15_grbm_select(adev, 0, 0, 0, 0, inst);
+       soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst));
        unlock_spi_csq_mutexes(adev);

        /* Update the output parameters and return */
-       *pasid_wave_cnt = vmid_wave_cnt;
        *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
                                adev->gfx.cu_info.max_waves_per_simd;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index 988c50ac3be0..b6a91a552aa4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -52,8 +52,9 @@ bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
                                        uint8_t vmid, uint16_t *p_pasid);
 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
                        uint32_t vmid, uint64_t page_table_base);
-void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
-               int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst);
+void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
+                                struct kfd_cu_occupancy *cu_occupancy,
+                                int *max_waves_per_cu, uint32_t inst);
 void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
                uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
                uint32_t inst);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 71b465f8d83e..784d155d8bcc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3540,6 +3540,26 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
        return debug_map_and_unlock(dqm);
 }

+bool kfd_dqm_find_queue_by_doorbell_off(struct device_queue_manager *dqm,
+                                       struct qcm_process_device *qpd,
+                                       int doorbell_off)
+{
+       struct queue *q;
+       bool r = false;
+
+       dqm_lock(dqm);
+
+       list_for_each_entry(q, &qpd->queues_list, list) {
+               if (q->properties.doorbell_off == doorbell_off) {
+                       r = true;
+                       goto out;
+               }
+       }
+
+out:
+       dqm_unlock(dqm);
+       return r;
+}
 #if defined(CONFIG_DEBUG_FS)

 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 08b40826ad1e..e5951589e5bd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -324,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q,
 int debug_lock_and_unmap(struct device_queue_manager *dqm);
 int debug_map_and_unlock(struct device_queue_manager *dqm);
 int debug_refresh_runlist(struct device_queue_manager *dqm);
+bool kfd_dqm_find_queue_by_doorbell_off(struct device_queue_manager *dqm,
+                                       struct qcm_process_device *qpd,
+                                       int doorbell_off);

 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902950cc060..6720bd30517b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -270,6 +270,10 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
        struct kfd_node *dev = NULL;
        struct kfd_process *proc = NULL;
        struct kfd_process_device *pdd = NULL;
+       int i;
+       struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES];
+

You can just use sizeof(cu_occupancy) as it is an array

+       memset(cu_occupancy, 0x0, sizeof(struct kfd_cu_occupancy) * AMDGPU_MAX_QUEUES);



        pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy);
        dev = pdd->dev;
@@ -287,9 +291,19 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
        /* Collect wave count from device if it supports */
        wave_cnt = 0;
        max_waves_per_cu = 0;
-       dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt,
-                       &max_waves_per_cu, 0);


A comment as to why you are using only the first xcc_mask will be helpful.

+       dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
+                       &max_waves_per_cu, ffs(dev->xcc_mask) - 1);
+
+       for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
+               if (cu_occupancy[i].wave_cnt != 0 &&
+                   kfd_dqm_find_queue_by_doorbell_off(dev->dqm, &pdd->qpd,
+                                               cu_occupancy[i].doorbell_off))
+                       wave_cnt += cu_occupancy[i].wave_cnt;
+       }
+
+       /* Update wave_cnt for the number of XCCs in the partition */
+       wave_cnt *= NUM_XCC(dev->xcc_mask);
        /* Translate wave count to number of compute units */
        cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
        return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 7744ca3ef4b1..e3e635a31b8a 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -71,6 +71,11 @@ enum kgd_memory_pool {
        KGD_POOL_FRAMEBUFFER = 3,
 };

+struct kfd_cu_occupancy {
+       u32 wave_cnt;
+       u32 doorbell_off;
+};
+
 /**
  * enum kfd_sched_policy
  *
@@ -313,8 +318,9 @@ struct kfd2kgd_calls {
                        uint32_t grace_period,
                        uint32_t *reg_offset,
                        uint32_t *reg_data);
-       void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
-                       int *wave_cnt, int *max_waves_per_cu, uint32_t inst);
+       void (*get_cu_occupancy)(struct amdgpu_device *adev,
+                                struct kfd_cu_occupancy *cu_occupancy,
+                                int *max_waves_per_cu, uint32_t inst);
        void (*program_trap_handler_settings)(struct amdgpu_device *adev,
                        uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
                        uint32_t inst);
--
2.35.1