[Public] -----Original Message----- From: Kim, Jonathan <Jonathan.Kim@xxxxxxx> Sent: Monday, March 3, 2025 3:54 PM To: Kasiviswanathan, Harish <Harish.Kasiviswanathan@xxxxxxx>; amd-gfx@xxxxxxxxxxxxxxxxxxxxx Subject: RE: [PATCH v2 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 family [Public] > -----Original Message----- > From: Kasiviswanathan, Harish <Harish.Kasiviswanathan@xxxxxxx> > Sent: Monday, March 3, 2025 3:44 PM > To: amd-gfx@xxxxxxxxxxxxxxxxxxxxx > Cc: Kasiviswanathan, Harish <Harish.Kasiviswanathan@xxxxxxx>; Kim, Jonathan > <Jonathan.Kim@xxxxxxx> > Subject: [PATCH v2 2/2] drm/amdgpu: Reduce dequeue retry timeout for gfx9 family > > Dequeue retry timeout controls the interval between checks for unmet > conditions. On MI series, reduce this from 0x40 to 0x1 (~ 1 uS). The > cost of additional bandwidth consumed by CP when polling memory > shouldn't be substantial. > > Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@xxxxxxx> > Reviewed-by: : Jonathan Kim <jonathan.kim@xxxxxxx> > --- > .../drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c | 2 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c | 4 +- > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 28 ++++++------- > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h | 5 ++- > .../drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c | 2 +- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 28 ++++++------- > .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 5 ++- > .../drm/amd/amdkfd/kfd_packet_manager_v9.c | 40 +++++++++++++------ > .../gpu/drm/amd/include/kgd_kfd_interface.h | 5 ++- > 10 files changed, 70 insertions(+), 51 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > index 8dfdb18197c4..53f5f1885870 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c > @@ -189,7 +189,7 @@ const struct kfd2kgd_calls aldebaran_kfd2kgd = { > .set_address_watch = kgd_gfx_aldebaran_set_address_watch, > .clear_address_watch = kgd_gfx_v9_clear_address_watch, > .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, > - .build_grace_period_packet_info = > kgd_gfx_v9_build_grace_period_packet_info, > + .build_dequeue_wait_counts_packet_info = > kgd_gfx_v9_build_dequeue_wait_counts_packet_info, > .program_trap_handler_settings = > kgd_gfx_v9_program_trap_handler_settings, > .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, > .hqd_reset = kgd_gfx_v9_hqd_reset, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > index 9abf29b58ac7..6fd41aece7e1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > @@ -415,7 +415,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = { > .set_address_watch = kgd_gfx_v9_set_address_watch, > .clear_address_watch = kgd_gfx_v9_clear_address_watch, > .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, > - .build_grace_period_packet_info = > kgd_gfx_v9_build_grace_period_packet_info, > + .build_dequeue_wait_counts_packet_info = > kgd_gfx_v9_build_dequeue_wait_counts_packet_info, > .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, > .program_trap_handler_settings = > kgd_gfx_v9_program_trap_handler_settings, > .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > index e2ae714a700f..95f249896275 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c > @@ -530,8 +530,8 @@ const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { > .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, > .program_trap_handler_settings = > kgd_gfx_v9_program_trap_handler_settings, > - .build_grace_period_packet_info = > - kgd_gfx_v9_build_grace_period_packet_info, > + .build_dequeue_wait_counts_packet_info = > + kgd_gfx_v9_build_dequeue_wait_counts_packet_info, > .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, > .enable_debug_trap = kgd_aldebaran_enable_debug_trap, > .disable_debug_trap = kgd_gfx_v9_4_3_disable_debug_trap, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > index 62176d607bef..0b03f2e9a858 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c > @@ -1021,25 +1021,25 @@ void kgd_gfx_v10_get_iq_wait_times(struct > amdgpu_device *adev, > *wait_times = RREG32(SOC15_REG_OFFSET(GC, 0, > mmCP_IQ_WAIT_TIME2)); > } > > -void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev, > +void kgd_gfx_v10_build_dequeue_wait_counts_packet_info(struct amdgpu_device > *adev, > uint32_t wait_times, > - uint32_t grace_period, > + uint32_t sch_wave, > + uint32_t que_sleep, > uint32_t *reg_offset, > uint32_t *reg_data) > { > *reg_data = wait_times; > > - /* > - * The CP cannont handle a 0 grace period input and will result in > - * an infinite grace period being set so set to 1 to prevent this. > - */ > - if (grace_period == 0) > - grace_period = 1; > - > - *reg_data = REG_SET_FIELD(*reg_data, > - CP_IQ_WAIT_TIME2, > - SCH_WAVE, > - grace_period); > + if (sch_wave) > + *reg_data = REG_SET_FIELD(*reg_data, > + CP_IQ_WAIT_TIME2, > + SCH_WAVE, > + sch_wave); > + if (que_sleep) > + *reg_data = REG_SET_FIELD(*reg_data, > + CP_IQ_WAIT_TIME2, > + QUE_SLEEP, > + que_sleep); > > *reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2); > } > @@ -1109,7 +1109,7 @@ const struct kfd2kgd_calls gfx_v10_kfd2kgd = { > .set_address_watch = kgd_gfx_v10_set_address_watch, > .clear_address_watch = kgd_gfx_v10_clear_address_watch, > .get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times, > - .build_grace_period_packet_info = > kgd_gfx_v10_build_grace_period_packet_info, > + .build_dequeue_wait_counts_packet_info = > kgd_gfx_v10_build_dequeue_wait_counts_packet_info, > .program_trap_handler_settings = program_trap_handler_settings, > .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr, > .hqd_reset = kgd_gfx_v10_hqd_reset > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > index 9efd2dd4fdd7..89ae07288b10 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h > @@ -51,9 +51,10 @@ uint32_t kgd_gfx_v10_clear_address_watch(struct > amdgpu_device *adev, > void kgd_gfx_v10_get_iq_wait_times(struct amdgpu_device *adev, > uint32_t *wait_times, > uint32_t inst); > -void kgd_gfx_v10_build_grace_period_packet_info(struct amdgpu_device *adev, > +void kgd_gfx_v10_build_dequeue_wait_counts_packet_info(struct amdgpu_device > *adev, > uint32_t wait_times, > - uint32_t grace_period, > + uint32_t sch_wave, > + uint32_t que_sleep, > uint32_t *reg_offset, > uint32_t *reg_data); > uint64_t kgd_gfx_v10_hqd_get_pq_addr(struct amdgpu_device *adev, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > index c718bedda0ca..2c5f22838fe0 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c > @@ -673,7 +673,7 @@ const struct kfd2kgd_calls gfx_v10_3_kfd2kgd = { > .set_vm_context_page_table_base = > set_vm_context_page_table_base_v10_3, > .program_trap_handler_settings = program_trap_handler_settings_v10_3, > .get_iq_wait_times = kgd_gfx_v10_get_iq_wait_times, > - .build_grace_period_packet_info = > kgd_gfx_v10_build_grace_period_packet_info, > + .build_dequeue_wait_counts_packet_info = > kgd_gfx_v10_build_dequeue_wait_counts_packet_info, > .enable_debug_trap = kgd_gfx_v10_enable_debug_trap, > .disable_debug_trap = kgd_gfx_v10_disable_debug_trap, > .validate_trap_override_request = > kgd_gfx_v10_validate_trap_override_request, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > index 441568163e20..d2bbe9973c93 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > @@ -1077,25 +1077,25 @@ void kgd_gfx_v9_get_cu_occupancy(struct > amdgpu_device *adev, > adev->gfx.cu_info.max_waves_per_simd; > } > > -void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev, > +void kgd_gfx_v9_build_dequeue_wait_counts_packet_info(struct amdgpu_device > *adev, > uint32_t wait_times, > - uint32_t grace_period, > + uint32_t sch_wave, > + uint32_t que_sleep, > uint32_t *reg_offset, > uint32_t *reg_data) > { > *reg_data = wait_times; > > - /* > - * The CP cannot handle a 0 grace period input and will result in > - * an infinite grace period being set so set to 1 to prevent this. > - */ > - if (grace_period == 0) > - grace_period = 1; > - > - *reg_data = REG_SET_FIELD(*reg_data, > - CP_IQ_WAIT_TIME2, > - SCH_WAVE, > - grace_period); > + if (sch_wave) > + *reg_data = REG_SET_FIELD(*reg_data, > + CP_IQ_WAIT_TIME2, > + SCH_WAVE, > + sch_wave); > + if (que_sleep) > + *reg_data = REG_SET_FIELD(*reg_data, > + CP_IQ_WAIT_TIME2, > + QUE_SLEEP, > + que_sleep); > > *reg_offset = SOC15_REG_OFFSET(GC, 0, mmCP_IQ_WAIT_TIME2); > } > @@ -1254,7 +1254,7 @@ const struct kfd2kgd_calls gfx_v9_kfd2kgd = { > .set_address_watch = kgd_gfx_v9_set_address_watch, > .clear_address_watch = kgd_gfx_v9_clear_address_watch, > .get_iq_wait_times = kgd_gfx_v9_get_iq_wait_times, > - .build_grace_period_packet_info = > kgd_gfx_v9_build_grace_period_packet_info, > + .build_dequeue_wait_counts_packet_info = > kgd_gfx_v9_build_dequeue_wait_counts_packet_info, > .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, > .program_trap_handler_settings = > kgd_gfx_v9_program_trap_handler_settings, > .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > index b6a91a552aa4..54ee8992be5c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > @@ -97,9 +97,10 @@ uint32_t kgd_gfx_v9_clear_address_watch(struct > amdgpu_device *adev, > void kgd_gfx_v9_get_iq_wait_times(struct amdgpu_device *adev, > uint32_t *wait_times, > uint32_t inst); > -void kgd_gfx_v9_build_grace_period_packet_info(struct amdgpu_device *adev, > +void kgd_gfx_v9_build_dequeue_wait_counts_packet_info(struct amdgpu_device > *adev, > uint32_t wait_times, > - uint32_t grace_period, > + uint32_t sch_wave, > + uint32_t que_sleep, > uint32_t *reg_offset, > uint32_t *reg_data); > uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev, > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c > b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c > index b9c611b249e6..6861e8f7a2f7 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c > @@ -298,13 +298,14 @@ static int pm_map_queues_v9(struct packet_manager > *pm, uint32_t *buffer, > } > > static inline void pm_build_dequeue_wait_counts_packet_info(struct > packet_manager *pm, > - uint32_t sch_value, uint32_t *reg_offset, > + uint32_t sch_value, uint32_t que_sleep, uint32_t *reg_offset, > uint32_t *reg_data) > { > - pm->dqm->dev->kfd2kgd->build_grace_period_packet_info( > + pm->dqm->dev->kfd2kgd->build_dequeue_wait_counts_packet_info( > pm->dqm->dev->adev, > pm->dqm->wait_times, > sch_value, > + que_sleep, > reg_offset, > reg_data); > } > @@ -320,26 +321,41 @@ static int pm_config_dequeue_wait_counts_v9(struct > packet_manager *pm, > > switch (cmd) { > case KFD_DEQUEUE_WAIT_INIT: > - /* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU */ > - if (amdgpu_emu_mode == 0 && pm->dqm->dev->adev- > >gmc.is_app_apu && > - (KFD_GC_VERSION(pm->dqm->dev) == IP_VERSION(9, 4, 3))) > - pm_build_dequeue_wait_counts_packet_info(pm, 1, > ®_offset, ®_data); > - else > + uint32_t sch_wave = 0, que_sleep = 0; > + /* Reduce CP_IQ_WAIT_TIME2.QUE_SLEEP to 0x1 from default > 0x40. > + * On a 1GHz machine this is roughly 1 microsecond, which is > + * about how long it takes to load data out of memory during > + * queue connect > + * QUE_SLEEP: Wait Count for Dequeue Retry. > + */ > + if (KFD_GC_VERSION(pm->dqm->dev) >= IP_VERSION(9, 4, 1) && > + KFD_GC_VERSION(pm->dqm->dev) < IP_VERSION(10, 0, 0)) { > + que_sleep = 1; > + > + /* Set CWSR grace period to 1x1000 cycle for GFX9.4.3 APU > */ > + if (amdgpu_emu_mode == 0 && pm->dqm->dev->adev- > >gmc.is_app_apu && > + (KFD_GC_VERSION(pm->dqm->dev) == IP_VERSION(9, 4, > 3))) > + sch_wave = 1; > + } else /* non gfx9 */ > return 0; Please close the else statement braces to balance the if statement braces. Also the /* non gfx9 */ comment isn't correct and should be removed because you can still have GFX9 devices that are pre v9.4.1 that do not need optimized wait times on dequeue. [HK]: Yes correct. Thanks. Thanks, Jon > + > + pm_build_dequeue_wait_counts_packet_info(pm, sch_wave, > que_sleep, > + ®_offset, ®_data); > + > break; > case KFD_DEQUEUE_WAIT_RESET: > - /* function called only to get reg_offset */ > - pm_build_dequeue_wait_counts_packet_info(pm, 0, ®_offset, > ®_data); > - reg_data = pm->dqm->wait_times; > + /* reg_data would be set to dqm->wait_times */ > + pm_build_dequeue_wait_counts_packet_info(pm, 0, 0, ®_offset, > ®_data); > break; > > case KFD_DEQUEUE_WAIT_SET_SCH_WAVE: > /* The CP cannot handle value 0 and it will result in > - * an infinite grace period being set so set to 1 to prevent this. > + * an infinite grace period being set so set to 1 to prevent this. Also > + * avoid debugger API breakage as it sets 0 and expects a low value. > */ > if (!value) > value = 1; > - pm_build_dequeue_wait_counts_packet_info(pm, value, ®_offset, > ®_data); > + pm_build_dequeue_wait_counts_packet_info(pm, value, 0, > ®_offset, ®_data); > break; > default: > pr_err("Invalid dequeue wait cmd\n"); > diff --git a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > index e3e635a31b8a..4f79e707bc80 100644 > --- a/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > +++ b/drivers/gpu/drm/amd/include/kgd_kfd_interface.h > @@ -313,9 +313,10 @@ struct kfd2kgd_calls { > void (*get_iq_wait_times)(struct amdgpu_device *adev, > uint32_t *wait_times, > uint32_t inst); > - void (*build_grace_period_packet_info)(struct amdgpu_device *adev, > + void (*build_dequeue_wait_counts_packet_info)(struct amdgpu_device > *adev, > uint32_t wait_times, > - uint32_t grace_period, > + uint32_t sch_wave, > + uint32_t que_sleep, > uint32_t *reg_offset, > uint32_t *reg_data); > void (*get_cu_occupancy)(struct amdgpu_device *adev, > -- > 2.34.1