[AMD Official Use Only - AMD Internal Distribution Only] > -----Original Message----- > From: Kasiviswanathan, Harish <Harish.Kasiviswanathan@xxxxxxx> > Sent: Thursday, September 19, 2024 3:56 PM > To: Joshi, Mukul <Mukul.Joshi@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Subject: RE: [PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3 > > [AMD Official Use Only - AMD Internal Distribution Only] > > This looks like a more robust way to get this information. Few comments > inline. > > - It might be better to break this into two commits. One specific to 9.4.3 and > other to use doorbell instead of LUT register. The second change affects all gfx9 > ASICs. > To make the changes work for GFX 9.4.3, I will have to add additional changes to access the LUT register correctly, which will then be removed in the subsequent patch for all gfx9 ASICs. So I don't think we gain much from breaking it into 2 commits. What do you think? > -----Original Message----- > From: Joshi, Mukul <Mukul.Joshi@xxxxxxx> > Sent: Thursday, September 19, 2024 1:43 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan@xxxxxxx>; Joshi, > Mukul <Mukul.Joshi@xxxxxxx> > Subject: [PATCH] drm/amdkfd: Fix CU occupancy calculations for GFX 9.4.3 > > Make CU occupancy work on GFX 9.4.3 by updating the code to handle > multiple XCCs correctly. Additionally, update the algorithm to calculate CU > occupancy by matching doorbell offset of the queue against the process's > queues, instead of using the IH_VMID_X_LUT registers since that can be racy > with CP updating the VMID-PASID mapping. 
> > Signed-off-by: Mukul Joshi <mukul.joshi@xxxxxxx> > --- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 92 ++++++++--------- > -- .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 5 +- > .../drm/amd/amdkfd/kfd_device_queue_manager.c | 20 ++++ > .../drm/amd/amdkfd/kfd_device_queue_manager.h | 3 + > drivers/gpu/drm/amd/amdkfd/kfd_process.c | 18 +++- > .../gpu/drm/amd/include/kgd_kfd_interface.h | 10 +- > 6 files changed, 88 insertions(+), 60 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > index 1254a43ec96b..b8c01257b101 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > @@ -950,28 +950,30 @@ static void unlock_spi_csq_mutexes(struct > amdgpu_device *adev) > * @inst: xcc's instance number on a multi-XCC setup > */ > static void get_wave_count(struct amdgpu_device *adev, int queue_idx, > - int *wave_cnt, int *vmid, uint32_t inst) > + struct kfd_cu_occupancy *queue_cnt, uint32_t inst) > { > int pipe_idx; > int queue_slot; > unsigned int reg_val; > - > + unsigned int wave_cnt; > /* > * Program GRBM with appropriate MEID, PIPEID, QUEUEID and VMID > * parameters to read out waves in flight. Get VMID if there are > * non-zero waves in flight. 
> */ > - *vmid = 0xFF; > - *wave_cnt = 0; > pipe_idx = queue_idx / adev->gfx.mec.num_queue_per_pipe; > queue_slot = queue_idx % adev->gfx.mec.num_queue_per_pipe; > - soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, inst); > - reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, inst, > mmSPI_CSQ_WF_ACTIVE_COUNT_0) + > - queue_slot); > - *wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; > - if (*wave_cnt != 0) > - *vmid = (RREG32_SOC15(GC, inst, mmCP_HQD_VMID) & > - CP_HQD_VMID__VMID_MASK) >> > CP_HQD_VMID__VMID__SHIFT; > + soc15_grbm_select(adev, 1, pipe_idx, queue_slot, 0, GET_INST(GC, inst)); > + reg_val = RREG32_SOC15_IP(GC, SOC15_REG_OFFSET(GC, GET_INST(GC, > inst), > + mmSPI_CSQ_WF_ACTIVE_COUNT_0) + queue_slot); > + wave_cnt = reg_val & SPI_CSQ_WF_ACTIVE_COUNT_0__COUNT_MASK; > + if (wave_cnt != 0) { > > Why is it queue_cnt->wave_cnt accumulating instead of direct assignment. The wave_cnt read here is per SE. So, for a queue, we need to accumulate the wave_cnts across all SEs. > There is some inefficiency here. The code is iterating over all queues two times > once in kfd_get_cu_occupancy and once more in > kgd_gfx_v9_get_cu_occupancy(). Is that strictly required? > Yes. The first pass (in kgd_gfx_v9_get_cu_occupancy()) is accumulating all queues (all HQDs) which have wave_cnt !=0. At this step, we store the corresponding doorbell_offset of the queues which have wave_cnt != 0. In the second step in kfd_get_cu_occupancy(), we are checking if queues with wave_cnt != 0 belong to the process. If yes, then we update the process's total wave_cnt. Hope it makes it clear why we need the 2 passes. 
> > + queue_cnt->wave_cnt += wave_cnt; > + queue_cnt->doorbell_off = > + (RREG32_SOC15(GC, GET_INST(GC, inst), > mmCP_HQD_PQ_DOORBELL_CONTROL) & > + > CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET_MASK) >> > + > CP_HQD_PQ_DOORBELL_CONTROL__DOORBELL_OFFSET__SHIFT; > + } > } > > /** > @@ -1021,24 +1023,19 @@ static void get_wave_count(struct > amdgpu_device *adev, int queue_idx, > * > * Reading registers referenced above involves programming GRBM > appropriately > */ > -void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid, > - int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst) > +void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, > + struct kfd_cu_occupancy *cu_occupancy, > + int *max_waves_per_cu, uint32_t inst) > { > int qidx; > - int vmid; > int se_idx; > - int sh_idx; > int se_cnt; > - int sh_cnt; > - int wave_cnt; > int queue_map; > - int pasid_tmp; > int max_queue_cnt; > - int vmid_wave_cnt = 0; > DECLARE_BITMAP(cp_queue_bitmap, AMDGPU_MAX_QUEUES); > > lock_spi_csq_mutexes(adev); > - soc15_grbm_select(adev, 1, 0, 0, 0, inst); > + soc15_grbm_select(adev, 1, 0, 0, 0, GET_INST(GC, inst)); > > /* > * Iterate through the shader engines and arrays of the device @@ - > 1048,51 +1045,38 @@ void kgd_gfx_v9_get_cu_occupancy(struct > amdgpu_device *adev, int pasid, > AMDGPU_MAX_QUEUES); > max_queue_cnt = adev->gfx.mec.num_pipe_per_mec * > adev->gfx.mec.num_queue_per_pipe; > - sh_cnt = adev->gfx.config.max_sh_per_se; > se_cnt = adev->gfx.config.max_shader_engines; > for (se_idx = 0; se_idx < se_cnt; se_idx++) { > - for (sh_idx = 0; sh_idx < sh_cnt; sh_idx++) { > + amdgpu_gfx_select_se_sh(adev, se_idx, 0, 0xffffffff, inst); > + queue_map = RREG32_SOC15(GC, GET_INST(GC, inst), > + mmSPI_CSQ_WF_ACTIVE_STATUS); > + > + /* > + * Assumption: queue map encodes following schema: four > + * pipes per each micro-engine, with each pipe mapping > + * eight queues. 
This schema is true for GFX9 devices > + * and must be verified for newer device families > + */ > + for (qidx = 0; qidx < max_queue_cnt; qidx++) { > + /* Skip qeueus that are not associated with > + * compute functions > + */ > + if (!test_bit(qidx, cp_queue_bitmap)) > + continue; > > - amdgpu_gfx_select_se_sh(adev, se_idx, sh_idx, 0xffffffff, inst); > - queue_map = RREG32_SOC15(GC, inst, > mmSPI_CSQ_WF_ACTIVE_STATUS); > + if (!(queue_map & (1 << qidx))) > + continue; > > - /* > - * Assumption: queue map encodes following schema: four > - * pipes per each micro-engine, with each pipe mapping > - * eight queues. This schema is true for GFX9 devices > - * and must be verified for newer device families > - */ > - for (qidx = 0; qidx < max_queue_cnt; qidx++) { > - > - /* Skip qeueus that are not associated with > - * compute functions > - */ > - if (!test_bit(qidx, cp_queue_bitmap)) > - continue; > - > - if (!(queue_map & (1 << qidx))) > - continue; > - > - /* Get number of waves in flight and aggregate them */ > - get_wave_count(adev, qidx, &wave_cnt, &vmid, > - inst); > - if (wave_cnt != 0) { > - pasid_tmp = > - RREG32(SOC15_REG_OFFSET(OSSSYS, inst, > - mmIH_VMID_0_LUT) + vmid); > - if (pasid_tmp == pasid) > - vmid_wave_cnt += wave_cnt; > - } > - } > + /* Get number of waves in flight and aggregate them */ > + get_wave_count(adev, qidx, &cu_occupancy[qidx], > + inst); > } > } > > amdgpu_gfx_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff, inst); > - soc15_grbm_select(adev, 0, 0, 0, 0, inst); > + soc15_grbm_select(adev, 0, 0, 0, 0, GET_INST(GC, inst)); > unlock_spi_csq_mutexes(adev); > > /* Update the output parameters and return */ > - *pasid_wave_cnt = vmid_wave_cnt; > *max_waves_per_cu = adev->gfx.cu_info.simd_per_cu * > adev->gfx.cu_info.max_waves_per_simd; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > index 988c50ac3be0..b6a91a552aa4 100644 > --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > @@ -52,8 +52,9 @@ bool > kgd_gfx_v9_get_atc_vmid_pasid_mapping_info(struct amdgpu_device *adev, > uint8_t vmid, uint16_t *p_pasid); void > kgd_gfx_v9_set_vm_context_page_table_base(struct amdgpu_device *adev, > uint32_t vmid, uint64_t page_table_base); -void > kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, int pasid, > - int *pasid_wave_cnt, int *max_waves_per_cu, uint32_t inst); > +void kgd_gfx_v9_get_cu_occupancy(struct amdgpu_device *adev, > + struct kfd_cu_occupancy *cu_occupancy, > + int *max_waves_per_cu, uint32_t inst); > void kgd_gfx_v9_program_trap_handler_settings(struct amdgpu_device > *adev, > uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, > uint32_t inst); > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index 71b465f8d83e..784d155d8bcc 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -3540,6 +3540,26 @@ int debug_refresh_runlist(struct > device_queue_manager *dqm) > return debug_map_and_unlock(dqm); } > > +bool kfd_dqm_find_queue_by_doorbell_off(struct device_queue_manager > *dqm, > + struct qcm_process_device *qpd, > + int doorbell_off) { > + struct queue *q; > + bool r = false; > + > + dqm_lock(dqm); > + > + list_for_each_entry(q, &qpd->queues_list, list) { > + if (q->properties.doorbell_off == doorbell_off) { > + r = true; > + goto out; > + } > + } > + > +out: > + dqm_unlock(dqm); > + return r; > +} > #if defined(CONFIG_DEBUG_FS) > > static void seq_reg_dump(struct seq_file *m, diff --git > a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > index 08b40826ad1e..e5951589e5bd 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > +++ 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h > @@ -324,6 +324,9 @@ void set_queue_snapshot_entry(struct queue *q, int > debug_lock_and_unmap(struct device_queue_manager *dqm); int > debug_map_and_unlock(struct device_queue_manager *dqm); int > debug_refresh_runlist(struct device_queue_manager *dqm); > +bool kfd_dqm_find_queue_by_doorbell_off(struct device_queue_manager > *dqm, > + struct qcm_process_device *qpd, > + int doorbell_off); > > static inline unsigned int get_sh_mem_bases_32(struct kfd_process_device > *pdd) { diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > index a902950cc060..6720bd30517b 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c > @@ -270,6 +270,10 @@ static int kfd_get_cu_occupancy(struct attribute > *attr, char *buffer) > struct kfd_node *dev = NULL; > struct kfd_process *proc = NULL; > struct kfd_process_device *pdd = NULL; > + int i; > + struct kfd_cu_occupancy cu_occupancy[AMDGPU_MAX_QUEUES]; > + > > You can just use sizeof(cu_occupancy) as it is an array > > + memset(cu_occupancy, 0x0, sizeof(struct kfd_cu_occupancy) * > + AMDGPU_MAX_QUEUES); > > > > pdd = container_of(attr, struct kfd_process_device, attr_cu_occupancy); > dev = pdd->dev; > @@ -287,9 +291,19 @@ static int kfd_get_cu_occupancy(struct attribute > *attr, char *buffer) > /* Collect wave count from device if it supports */ > wave_cnt = 0; > max_waves_per_cu = 0; > - dev->kfd2kgd->get_cu_occupancy(dev->adev, proc->pasid, &wave_cnt, > - &max_waves_per_cu, 0); > > > A comment as to why you are using only the first xcc_mask will be helpful. > Sure will do. 
Thanks, Mukul > + dev->kfd2kgd->get_cu_occupancy(dev->adev, cu_occupancy, > + &max_waves_per_cu, ffs(dev->xcc_mask) - 1); > + > + for (i = 0; i < AMDGPU_MAX_QUEUES; i++) { > + if (cu_occupancy[i].wave_cnt != 0 && > + kfd_dqm_find_queue_by_doorbell_off(dev->dqm, &pdd->qpd, > + cu_occupancy[i].doorbell_off)) > + wave_cnt += cu_occupancy[i].wave_cnt; > + } > + > + /* Update wave_cnt for the number of XCCs in the partition */ > + wave_cnt *= NUM_XCC(dev->xcc_mask); > /* Translate wave count to number of compute units */ > cu_cnt = (wave_cnt + (max_waves_per_cu - 1)) / max_waves_per_cu; > return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt); diff --git > a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > index 7744ca3ef4b1..e3e635a31b8a 100644 > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > @@ -71,6 +71,11 @@ enum kgd_memory_pool { > KGD_POOL_FRAMEBUFFER = 3, > }; > > +struct kfd_cu_occupancy { > + u32 wave_cnt; > + u32 doorbell_off; > +}; > + > /** > * enum kfd_sched_policy > * > @@ -313,8 +318,9 @@ struct kfd2kgd_calls { > uint32_t grace_period, > uint32_t *reg_offset, > uint32_t *reg_data); > - void (*get_cu_occupancy)(struct amdgpu_device *adev, int pasid, > - int *wave_cnt, int *max_waves_per_cu, uint32_t inst); > + void (*get_cu_occupancy)(struct amdgpu_device *adev, > + struct kfd_cu_occupancy *cu_occupancy, > + int *max_waves_per_cu, uint32_t inst); > void (*program_trap_handler_settings)(struct amdgpu_device *adev, > uint32_t vmid, uint64_t tba_addr, uint64_t tma_addr, > uint32_t inst); > -- > 2.35.1 >