On Thu, Feb 29, 2024 at 9:58 AM Christian König <ckoenig.leichtzumerken@xxxxxxxxx> wrote: > > It turned out that executing the SET_Q_MODE packet on every submission > creates too much overhead. > > Implement a workaround which allows skipping the SET_Q_MODE packet if > subsequent submissions all use the same parameters. > > Signed-off-by: Christian König <christian.koenig@xxxxxxx> Series is: Reviewed-by: Alex Deucher <alexander.deucher@xxxxxxx> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 3 + > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 104 +++++++++++++++++++---- > 2 files changed, 92 insertions(+), 15 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > index 756330767909..582053f1cd56 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > @@ -285,6 +285,9 @@ struct amdgpu_ring { > unsigned cond_exe_offs; > u64 cond_exe_gpu_addr; > volatile u32 *cond_exe_cpu_addr; > + unsigned int set_q_mode_offs; > + volatile u32 *set_q_mode_ptr; > + u64 set_q_mode_token; > unsigned vm_hub; > unsigned vm_inv_eng; > struct dma_fence *vmid_wait; > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > index 2ccbdee570cf..6e6b6eff48e2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c > @@ -5461,6 +5461,11 @@ static void gfx_v11_0_ring_emit_vm_flush(struct amdgpu_ring *ring, > amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0)); > amdgpu_ring_write(ring, 0x0); > } > + > + /* Make sure that we can't skip the SET_Q_MODE packets when the VM > + * changed in any way. 
> + */ > + ring->set_q_mode_ptr = NULL; > } > > static void gfx_v11_0_ring_emit_fence_kiq(struct amdgpu_ring *ring, u64 addr, > @@ -5510,16 +5515,81 @@ static void gfx_v11_0_ring_emit_cntxcntl(struct amdgpu_ring *ring, > amdgpu_ring_write(ring, 0); > } > > +static unsigned gfx_v11_0_ring_emit_init_cond_exec(struct amdgpu_ring *ring, > + uint64_t addr) > +{ > + unsigned ret; > + > + amdgpu_ring_write(ring, PACKET3(PACKET3_COND_EXEC, 3)); > + amdgpu_ring_write(ring, lower_32_bits(addr)); > + amdgpu_ring_write(ring, upper_32_bits(addr)); > + /* discard following DWs if *cond_exec_gpu_addr==0 */ > + amdgpu_ring_write(ring, 0); > + ret = ring->wptr & ring->buf_mask; > + /* patch dummy value later */ > + amdgpu_ring_write(ring, 0); > + > + return ret; > +} > + > static void gfx_v11_0_ring_emit_gfx_shadow(struct amdgpu_ring *ring, > u64 shadow_va, u64 csa_va, > u64 gds_va, bool init_shadow, > int vmid) > { > struct amdgpu_device *adev = ring->adev; > + unsigned int offs, end; > > if (!adev->gfx.cp_gfx_shadow) > return; > > + /* > + * The logic here isn't easy to understand because we need to keep state > + * accross multiple executions of the function as well as between the > + * CPU and GPU. The general idea is that the newly written GPU command > + * has a condition on the previous one and only executed if really > + * necessary. > + */ > + > + /* > + * The dw in the NOP controls if the next SET_Q_MODE packet should be > + * executed or not. Reserve 64bits just to be on the save side. > + */ > + amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, 1)); > + offs = ring->wptr & ring->buf_mask; > + > + /* > + * We start with skipping the prefix SET_Q_MODE and always executing > + * the postfix SET_Q_MODE packet. This is changed below with a > + * WRITE_DATA command when the postfix executed. > + */ > + amdgpu_ring_write(ring, shadow_va ? 
1 : 0); > + amdgpu_ring_write(ring, 0); > + > + if (ring->set_q_mode_offs) { > + uint64_t addr; > + > + addr = amdgpu_bo_gpu_offset(ring->ring_obj); > + addr += ring->set_q_mode_offs << 2; > + end = gfx_v11_0_ring_emit_init_cond_exec(ring, addr); > + } > + > + /* > + * When the postfix SET_Q_MODE packet executes we need to make sure that the > + * next prefix SET_Q_MODE packet executes as well. > + */ > + if (!shadow_va) { > + uint64_t addr; > + > + addr = amdgpu_bo_gpu_offset(ring->ring_obj); > + addr += offs << 2; > + amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3)); > + amdgpu_ring_write(ring, WRITE_DATA_DST_SEL(5) | WR_CONFIRM); > + amdgpu_ring_write(ring, lower_32_bits(addr)); > + amdgpu_ring_write(ring, upper_32_bits(addr)); > + amdgpu_ring_write(ring, 0x1); > + } > + > amdgpu_ring_write(ring, PACKET3(PACKET3_SET_Q_PREEMPTION_MODE, 7)); > amdgpu_ring_write(ring, lower_32_bits(shadow_va)); > amdgpu_ring_write(ring, upper_32_bits(shadow_va)); > @@ -5531,23 +5601,26 @@ static void gfx_v11_0_ring_emit_gfx_shadow(struct amdgpu_ring *ring, > PACKET3_SET_Q_PREEMPTION_MODE_IB_VMID(vmid) : 0); > amdgpu_ring_write(ring, init_shadow ? 
> PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM : 0); > -} > > -static unsigned gfx_v11_0_ring_emit_init_cond_exec(struct amdgpu_ring *ring, > - uint64_t addr) > -{ > - unsigned ret; > + if (ring->set_q_mode_offs) > + amdgpu_ring_patch_cond_exec(ring, end); > > - amdgpu_ring_write(ring, PACKET3(PACKET3_COND_EXEC, 3)); > - amdgpu_ring_write(ring, lower_32_bits(addr)); > - amdgpu_ring_write(ring, upper_32_bits(addr)); > - /* discard following DWs if *cond_exec_gpu_addr==0 */ > - amdgpu_ring_write(ring, 0); > - ret = ring->wptr & ring->buf_mask; > - /* patch dummy value later */ > - amdgpu_ring_write(ring, 0); > + if (shadow_va) { > + uint64_t token = shadow_va ^ csa_va ^ gds_va ^ vmid; > > - return ret; > + /* > + * If the tokens match try to skip the last postfix SET_Q_MODE > + * packet to avoid saving/restoring the state all the time. > + */ > + if (ring->set_q_mode_ptr && ring->set_q_mode_token == token) > + *ring->set_q_mode_ptr = 0; > + > + ring->set_q_mode_token = token; > + } else { > + ring->set_q_mode_ptr = &ring->ring[ring->set_q_mode_offs]; > + } > + > + ring->set_q_mode_offs = offs; > } > > static int gfx_v11_0_ring_preempt_ib(struct amdgpu_ring *ring) > @@ -6114,7 +6187,7 @@ static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_gfx = { > .emit_frame_size = /* totally 247 maximum if 16 IBs */ > 5 + /* update_spm_vmid */ > 5 + /* COND_EXEC */ > - 9 + /* SET_Q_PREEMPTION_MODE */ > + 22 + /* SET_Q_PREEMPTION_MODE */ > 7 + /* PIPELINE_SYNC */ > SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 + > SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 + > @@ -6127,6 +6200,7 @@ static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_gfx = { > 31 + /* DE_META */ > 3 + /* CNTX_CTRL */ > 5 + /* HDP_INVL */ > + 22 + /* SET_Q_PREEMPTION_MODE */ > 8 + 8 + /* FENCE x2 */ > 8, /* gfx_v11_0_emit_mem_sync */ > .emit_ib_size = 4, /* gfx_v11_0_ring_emit_ib_gfx */ > -- > 2.34.1 >