On Wed, Jul 31, 2024 at 11:19 PM Marek Olšák <maraeo@xxxxxxxxx> wrote: > > On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri@xxxxxxx> wrote: > > > > Adding NOP packets one by one in the ring > > does not use the CP efficiently. > > > > Solution: > > Use CP optimization while adding NOP packet's so PFP > > can discard NOP packets based on information of count > > from the Header instead of fetching all NOP packets > > one by one. > > > > Cc: Christian König <christian.koenig@xxxxxxx> > > Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@xxxxxxx> > > Cc: Tvrtko Ursulin <tursulin@xxxxxxxxxx> > > Cc: Marek Olšák <marek.olsak@xxxxxxx> > > Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx> > > --- > > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++--- > > 1 file changed, 21 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > > index 853084a2ce7f..edf5b5c4d185 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > > @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring) > > amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */ > > } > > > > +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop) > > +{ > > + int i; > > + > > + /* Header itself is a NOP packet */ > > + if (num_nop == 1) { > > + amdgpu_ring_write(ring, ring->funcs->nop); > > + return; > > + } > > + > > + /* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/ > > + amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe))); > > + > > + /* Header is at index 0, followed by num_nops - 1 NOP packet's */ > > + for (i = 1; i < num_nop; i++) > > + amdgpu_ring_write(ring, ring->funcs->nop); > > This loop should be removed. It's unnecessary CPU overhead and we > should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the > whole packet body uninitialized is the fastest option. If you remove amdgpu_ring_write, you still need to move wptr somehow. amdgpu_ring_write_multiple gives a hint about how to do it: ring->wptr += count_dw; ring->wptr &= ring->ptr_mask; ring->count_dw -= count_dw; Marek > > Marek > > > +} > > + > > static void gfx_v10_ip_print(void *handle, struct drm_printer *p) > > { > > struct amdgpu_device *adev = (struct amdgpu_device *)handle; > > @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = { > > .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush, > > .test_ring = gfx_v10_0_ring_test_ring, > > .test_ib = gfx_v10_0_ring_test_ib, > > - .insert_nop = amdgpu_ring_insert_nop, > > + .insert_nop = amdgpu_gfx10_ring_insert_nop, > > .pad_ib = amdgpu_ring_generic_pad_ib, > > .emit_switch_buffer = gfx_v10_0_ring_emit_sb, > > .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl, > > @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = { > > .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush, > > .test_ring = gfx_v10_0_ring_test_ring, > > .test_ib = gfx_v10_0_ring_test_ib, > > - .insert_nop = amdgpu_ring_insert_nop, > > + .insert_nop = amdgpu_gfx10_ring_insert_nop, > > .pad_ib = amdgpu_ring_generic_pad_ib, > > .emit_wreg = gfx_v10_0_ring_emit_wreg, > > .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, > > @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = { > > .emit_fence = gfx_v10_0_ring_emit_fence_kiq, > > .test_ring = gfx_v10_0_ring_test_ring, > > .test_ib = gfx_v10_0_ring_test_ib, > > - .insert_nop = amdgpu_ring_insert_nop, > > + .insert_nop = amdgpu_gfx10_ring_insert_nop, > > .pad_ib = amdgpu_ring_generic_pad_ib, > > .emit_rreg = gfx_v10_0_ring_emit_rreg, > > .emit_wreg = gfx_v10_0_ring_emit_wreg, > > -- > > 2.34.1 > >