RE: [PATCHv2 1/2] drm/amdkfd: Update logic for CU occupancy calculations

"Kasiviswanathan, Harish" <Harish.Kasiviswanathan@xxxxxxx> · Mon, 23 Sep 2024 14:31:00 +0000

[AMD Official Use Only - AMD Internal Distribution Only]

One very minor comment below. Apart from that Reviewed-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@xxxxxxx>


-----Original Message-----
From: Joshi, Mukul <Mukul.Joshi@xxxxxxx>
Sent: Friday, September 20, 2024 5:07 PM
To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx
Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan@xxxxxxx>; Joshi, Mukul <Mukul.Joshi@xxxxxxx>
Subject: [PATCHv2 1/2] drm/amdkfd: Update logic for CU occupancy calculations

Currently, the code use the IH_VMID_X_LUT register to map
a queue's vmid to the corresponding PASID. This logic is racy
since the CP can update the VMID-PASID mapping anytime especially
when there are more processes than number of vmids. Update the logic
to calculate CU occupancy by matching doorbell offset of the queue
with valid wave counts against the process's queues,

Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx>
---
v1->v2:
- Break into 2 patches, one for the generic change
  and the other for GFX v9.4.3.
- Incorporate Harish's comments.

 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 97 ++++++++-----------
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h |  5 +-
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 ++++
 .../drm/amd/amdkfd/kfd_device_queue_manager.h |  3 +
 drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 14 ++-
 .../gpu/drm/amd/include/kgd_kfd_interface.h   | 10 +-
 6 files changed, 87 insertions(+), 62 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index 1254a43ec96b..fe8a8e7e9a9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct amdgpu_device *adev)
  * @inst: xcc's instance number on a multi-XCC setup
  */
 static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
-               int *wave_cnt, int *vmid, uint32_t inst)
+               struct kfd_cu_occupancy *queue_cnt, uint32_t inst)
 {
        int pipe_idx;
        int queue_slot;
        unsigned int reg_val;
-
+       unsigned int wave_cnt;
        /*
         * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID
         * parameters to read out waves in flight. Get VMID if there are
         * non-zero waves in flight.
         */
-       *vmid = 0xFF;
-       *wave_cnt = 0;
        pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe;
        queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe;
        soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst);
-       reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, mmSPI_CSQ_WF_ACTIVE_COUNT_0) +
-                        queue_slot);
-       *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
-       if (*wave_cnt != 0)
-               *vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) &
-                        CP_HQD_VMID__VMID_MASK) >> CP_HQD_VMID__VMID__SHIFT;
+       reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst,
+                                 mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot);
+       wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK;
+       if (wave_cnt != 0) {
+               queue_cnt->wave_cnt += wave_cnt;
+               queue_cnt->doorbell_off =
+                       (RREG32_SOC15(GC, inst, mmCP_HQD_PQ_DOORBELL_CONTROL) &
+                        CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >>
+                        CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT;
+       }
 }

 /**
@@ -1011,30 +1013,24 @@ static void get_wave_count(struct amdgpu_device *adev, int queue_idx,
  *    number of waves that are in flight for the queue at specified index. The
  *    index ranges from 0 to 7.
  *
- *    If non-zero waves are in flight, read CP_HQD_VMID register to obtain VMID
- *    of the wave(s).
+ *    If non-zero waves are in flight, store the corresponding doorbell offset
+ *    of the queue, along with the wave count.
  *
- *    Determine if VMID from above step maps to pasid provided as parameter. If
- *    it matches agrregate the wave count. That the VMID will not match pasid is
- *    a normal condition i.e. a device is expected to support multiple queues
- *    from multiple proceses.
+ *    Determine if the queue belongs to the process by comparing the doorbell
+ *    offset against the process's queues. If it matches, aggregate the wave
+ *    count for the process.
  *
  *  Reading registers referenced above involves programming GRBM appropriately
  */
-void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
-               int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst)
+void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
+                                struct kfd_cu_occupancy *cu_occupancy,
+                                int *max_waves_per_cu, uint32_t inst)
 {
        int qidx;
-       int vmid;
        int se_idx;
-       int sh_idx;
        int se_cnt;
-       int sh_cnt;
-       int wave_cnt;
        int queue_map;
-       int pasid_tmp;
        int max_queue_cnt;
-       int vmid_wave_cnt = 0;
        DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES);

        lock_spi_csq_mutexes(adev);
@@ -1048,42 +1044,30 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
                          AMDGPU_MAX_QUEUES);
        max_queue_cnt = adev->gfx.mec.num_pipe_per_mec *
                        adev->gfx.mec.num_queue_per_pipe;
-       sh_cnt = adev->gfx.config.max_sh_per_se;
        se_cnt = adev->gfx.config.max_shader_engines;
        for (se_idx = 0; se_idx < se_cnt; se_idx++) {
-               for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) {
+               amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst);
+               queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
+
+               /*
+                * Assumption: queue map encodes following schema: four
+                * pipes per each micro-engine, with each pipe mapping
+                * eight queues. This schema is true for GFX9 devices
+                * and must be verified for newer device families
+                */
+               for (qidx = 0; qidx < max_queue_cnt; qidx++) {
+                       /* Skip qeueus that are not associated with
+                        * compute functions
+                        */
+                       if (!test_bit(qidx, cp_queue_bitmap))
+                               continue;

-                       amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst);
-                       queue_map = RREG32_SOC15(GC, inst, mmSPI_CSQ_WF_ACTIVE_STATUS);
+                       if (!(queue_map & (1 << qidx)))
+                               continue;

-                       /*
-                        * Assumption: queue map encodes following schema: four
-                        * pipes per each micro-engine, with each pipe mapping
-                        * eight queues. This schema is true for GFX9 devices
-                        * and must be verified for newer device families
-                        */
-                       for (qidx = 0; qidx < max_queue_cnt; qidx++) {
-
-                               /* Skip qeueus that are not associated with
-                                * compute functions
-                                */
-                               if (!test_bit(qidx, cp_queue_bitmap))
-                                       continue;
-
-                               if (!(queue_map & (1 << qidx)))
-                                       continue;
-
-                               /* Get number of waves in flight and aggregate them */
-                               get_wave_count(adev, qidx, &wave_cnt, &vmid,
-                                               inst);
-                               if (wave_cnt != 0) {
-                                       pasid_tmp =
-                                         RREG32(SOC15_REG_OFFSET(OSSSYS, inst,
-                                                mmIH_VMID_0_LUT) + vmid);
-                                       if (pasid_tmp == pasid)
-                                               vmid_wave_cnt += wave_cnt;
-                               }
-                       }
+                       /* Get number of waves in flight and aggregate them */
+                       get_wave_count(adev, qidx, &cu_occupancy[qidx],
+                                       inst);
                }
        }

@@ -1092,7 +1076,6 @@ void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
        unlock_spi_csq_mutexes(adev);

        /* Update the output parameters and return */
-       *pasid_wave_cnt = vmid_wave_cnt;

pasid_wave_cnt can be removed the function comments also


        *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu *
                                adev->gfx.cu_info.max_waves_per_simd;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
index 988c50ac3be0..b6a91a552aa4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
@@ -52,8 +52,9 @@ bool kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev,
                                        uint8_t vmid, uint16_t *p_pasid);
 void kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev,
                        uint32_t vmid, uint64_t page_table_base);
-void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid,
-               int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst);
+void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev,
+                                struct kfd_cu_occupancy *cu_occupancy,
+                                int *max_waves_per_cu, uint32_t inst);
 void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device *adev,
                uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
                uint32_t inst);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 71b465f8d83e..29578550b478 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -3540,6 +3540,26 @@ int debug_refresh_runlist(struct device_queue_manager *dqm)
        return debug_map_and_unlock(dqm);
 }

+bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
+                                struct qcm_process_device *qpd,
+                                int doorbell_off)
+{
+       struct queue *q;
+       bool r = false;
+
+       dqm_lock(dqm);
+
+       list_for_each_entry(q, &qpd->queues_list, list) {
+               if (q->properties.doorbell_off == doorbell_off) {
+                       r = true;
+                       goto out;
+               }
+       }
+
+out:
+       dqm_unlock(dqm);
+       return r;
+}
 #if defined(CONFIG_DEBUG_FS)

 static void seq_reg_dump(struct seq_file *m,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
index 08b40826ad1e..80be2036abea 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
@@ -324,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q,
 int debug_lock_and_unmap(struct device_queue_manager *dqm);
 int debug_map_and_unlock(struct device_queue_manager *dqm);
 int debug_refresh_runlist(struct device_queue_manager *dqm);
+bool kfd_dqm_is_queue_in_process(struct device_queue_manager *dqm,
+                                struct qcm_process_device *qpd,
+                                int doorbell_off);

 static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device *pdd)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index a902950cc060..d73841268c9b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -270,6 +270,10 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
        struct kfd_node *dev = NULL;
        struct kfd_process *proc = NULL;
        struct kfd_process_device *pdd = NULL;
+       int i;
+       struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES];
+
+       memset(cu_occupancy, 0x0, sizeof(cu_occupancy));

        pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy);
        dev = pdd->dev;
@@ -287,9 +291,17 @@ static int kfd_get_cu_occupancy(struct attribute *attr, char *buffer)
        /* Collect wave count from device if it supports */
        wave_cnt = 0;
        max_waves_per_cu = 0;
-       dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt,
+
+       dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy,
                        &max_waves_per_cu, 0);

+       for (i = 0; i < AMDGPU_MAX_QUEUES; i++) {
+               if (cu_occupancy[i].wave_cnt != 0 &&
+                   kfd_dqm_is_queue_in_process(dev->dqm, &pdd->qpd,
+                                               cu_occupancy[i].doorbell_off))
+                       wave_cnt += cu_occupancy[i].wave_cnt;
+       }
+
        /* Translate wave count to number of compute units */
        cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu;
        return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
index 7744ca3ef4b1..e3e635a31b8a 100644
--- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
+++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h
@@ -71,6 +71,11 @@ enum kgd_memory_pool {
        KGD_POOL_FRAMEBUFFER = 3,
 };

+struct kfd_cu_occupancy {
+       u32 wave_cnt;
+       u32 doorbell_off;
+};
+
 /**
  * enum kfd_sched_policy
  *
@@ -313,8 +318,9 @@ struct kfd2kgd_calls {
                        uint32_t grace_period,
                        uint32_t *reg_offset,
                        uint32_t *reg_data);
-       void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid,
-                       int *wave_cnt, int *max_waves_per_cu, uint32_t inst);
+       void (*get_cu_occupancy)(struct amdgpu_device *adev,
+                                struct kfd_cu_occupancy *cu_occupancy,
+                                int *max_waves_per_cu, uint32_t inst);
        void (*program_trap_handler_settings)(struct amdgpu_device *adev,
                        uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr,
                        uint32_t inst);
--
2.35.1