On Wed, Jul 31, 2024 at 9:13 AM Sunil Khatri <sunil.khatri@xxxxxxx> wrote:
>
> Adding NOP packets one by one in the ring
> does not use the CP efficiently.
>
> Solution:
> Use CP optimization while adding NOP packet's so PFP
> can discard NOP packets based on information of count
> from the Header instead of fetching all NOP packets
> one by one.
>
> Cc: Christian König <christian.koenig@xxxxxxx>
> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@xxxxxxx>
> Cc: Tvrtko Ursulin <tursulin@xxxxxxxxxx>
> Cc: Marek Olšák <marek.olsak@xxxxxxx>
> Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 675a1a8e2515..991f7c2fc1a2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -7100,6 +7100,24 @@ static void gfx_v9_0_emit_wave_limit(struct amdgpu_ring *ring, bool enable)
>  	}
>  }
>
> +static void gfx_v9_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
> +{
> +	int i;
> +
> +	/* Header itself is a NOP packet */
> +	if (num_nop == 1) {
> +		amdgpu_ring_write(ring, ring->funcs->nop);
> +		return;
> +	}
> +
> +	/* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/
> +	amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
> +
> +	/* Header is at index 0, followed by num_nops - 1 NOP packet's */
> +	for (i = 1; i < num_nop; i++)
> +		amdgpu_ring_write(ring, ring->funcs->nop);

This loop should be removed. I explained the reason in the gfx10 commit.

Marek

> +}
> +
>  static void gfx_v9_ip_print(void *handle, struct drm_printer *p)
>  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> @@ -7240,7 +7258,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>  	.emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
>  	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
>  	.test_ring = gfx_v9_0_ring_test_ring,
> -	.insert_nop = amdgpu_ring_insert_nop,
> +	.insert_nop = gfx_v9_ring_insert_nop,
>  	.pad_ib = amdgpu_ring_generic_pad_ib,
>  	.emit_switch_buffer = gfx_v9_ring_emit_sb,
>  	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
> @@ -7294,7 +7312,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_sw_ring_funcs_gfx = {
>  	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
>  	.test_ring = gfx_v9_0_ring_test_ring,
>  	.test_ib = gfx_v9_0_ring_test_ib,
> -	.insert_nop = amdgpu_sw_ring_insert_nop,
> +	.insert_nop = gfx_v9_ring_insert_nop,
>  	.pad_ib = amdgpu_ring_generic_pad_ib,
>  	.emit_switch_buffer = gfx_v9_ring_emit_sb,
>  	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
> @@ -7338,7 +7356,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>  	.emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
>  	.test_ring = gfx_v9_0_ring_test_ring,
>  	.test_ib = gfx_v9_0_ring_test_ib,
> -	.insert_nop = amdgpu_ring_insert_nop,
> +	.insert_nop = gfx_v9_ring_insert_nop,
>  	.pad_ib = amdgpu_ring_generic_pad_ib,
>  	.emit_wreg = gfx_v9_0_ring_emit_wreg,
>  	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
> --
> 2.34.1
>