Re: [PATCH] drm/amdgpu: optimize the padding with hw optimization

Marek Olšák <maraeo@xxxxxxxxx> · Thu, 1 Aug 2024 02:53:23 -0400

On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri@xxxxxxx> wrote:

On 8/1/2024 8:49 AM, Marek Olšák wrote:

> On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri@xxxxxxx> wrote:

>> Adding NOP packets one by one in the ring

>> does not use the CP efficiently.

>>

>> Solution:

>> Use CP optimization while adding NOP packet's so PFP

>> can discard NOP packets based on information of count

>> from the Header instead of fetching all NOP packets

>> one by one.

>>

>> Cc: Christian König <christian.koenig@xxxxxxx>

>> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@xxxxxxx>

>> Cc: Tvrtko Ursulin <tursulin@xxxxxxxxxx>

>> Cc: Marek Olšák <marek.olsak@xxxxxxx>

>> Signed-off-by: Sunil Khatri <sunil.khatri@xxxxxxx>

>> ---

>>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---

>>   1 file changed, 21 insertions(+), 3 deletions(-)

>>

>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

>> index 853084a2ce7f..edf5b5c4d185 100644

>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

>> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)

>>          amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */

>>   }

>>

>> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)

>> +{

>> +       int i;

>> +

>> +       /* Header itself is a NOP packet */

>> +       if (num_nop == 1) {

>> +               amdgpu_ring_write(ring, ring->funcs->nop);

>> +               return;

>> +       }

>> +

>> +       /* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time*/

>> +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));

>> +

>> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's */

>> +       for (i = 1; i < num_nop; i++)

>> +               amdgpu_ring_write(ring, ring->funcs->nop);

> This loop should be removed. It's unnecessary CPU overhead and we

> should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the

> whole packet body uninitialized is the fastest option.

That was the original intent to just move the WPTR for the no of nops 

and tried too. Based on Christian inputs we should not let the nops packet

as garbage or whatever was there originally as a threat/safety measure.

It doesn't help safety. It can only be read by the GPU with kernel-level permissions.

Initializing the packet body is useless and adds CPU overhead, especially with the 256 NOPs or so that we use for no reason.

Marek

So CPU side there isnt any optimization but just CP will skip all these 

so GPU side should see the benefit.

Regards

Sunil Khatri

>

> Marek

>

>> +}

>> +

>>   static void gfx_v10_ip_print(void *handle, struct drm_printer *p)

>>   {

>>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;

>> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {

>>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,

>>          .test_ring = gfx_v10_0_ring_test_ring,

>>          .test_ib = gfx_v10_0_ring_test_ib,

>> -       .insert_nop = amdgpu_ring_insert_nop,

>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,

>>          .pad_ib = amdgpu_ring_generic_pad_ib,

>>          .emit_switch_buffer = gfx_v10_0_ring_emit_sb,

>>          .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,

>> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {

>>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,

>>          .test_ring = gfx_v10_0_ring_test_ring,

>>          .test_ib = gfx_v10_0_ring_test_ib,

>> -       .insert_nop = amdgpu_ring_insert_nop,

>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,

>>          .pad_ib = amdgpu_ring_generic_pad_ib,

>>          .emit_wreg = gfx_v10_0_ring_emit_wreg,

>>          .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,

>> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {

>>          .emit_fence = gfx_v10_0_ring_emit_fence_kiq,

>>          .test_ring = gfx_v10_0_ring_test_ring,

>>          .test_ib = gfx_v10_0_ring_test_ib,

>> -       .insert_nop = amdgpu_ring_insert_nop,

>> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,

>>          .pad_ib = amdgpu_ring_generic_pad_ib,

>>          .emit_rreg = gfx_v10_0_ring_emit_rreg,

>>          .emit_wreg = gfx_v10_0_ring_emit_wreg,

>> --

>> 2.34.1

>>