On 2019-10-26 08:05, Koenig, Christian wrote: > Am 25.10.19 um 23:51 schrieb Tuikov, Luben: >> On 2019-10-25 2:54 a.m., Koenig, Christian wrote: >>> Am 25.10.19 um 01:44 schrieb Tuikov, Luben: >>>> Simplify padding calculations. >>>> >>>> Signed-off-by: Luben Tuikov <luben.tuikov@xxxxxxx> >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 4 ++-- >>>> drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c | 4 ++-- >>>> drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 4 ++-- >>>> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 4 ++-- >>>> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 14 +++++++++----- >>>> 5 files changed, 17 insertions(+), 13 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c >>>> index c45304f1047c..1ea9e18d6f08 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c >>>> @@ -228,7 +228,7 @@ static void cik_sdma_ring_emit_ib(struct amdgpu_ring *ring, >>>> u32 extra_bits = vmid & 0xf; >>>> >>>> /* IB packet must end on a 8 DW boundary */ >>>> - cik_sdma_ring_insert_nop(ring, (12 - (lower_32_bits(ring->wptr) & 7)) % 8); >>>> + cik_sdma_ring_insert_nop(ring, (4-lower_32_bits(ring->wptr)) & 7); >>>> >>>> amdgpu_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_INDIRECT_BUFFER, 0, extra_bits)); >>>> amdgpu_ring_write(ring, ib->gpu_addr & 0xffffffe0); /* base must be 32 byte aligned */ >>>> @@ -811,7 +811,7 @@ static void cik_sdma_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib) >>>> u32 pad_count; >>>> int i; >>>> >>>> - pad_count = (8 - (ib->length_dw & 0x7)) % 8; >>>> + pad_count = (-ib->length_dw) & 7; >>>> for (i = 0; i < pad_count; i++) >>>> if (sdma && sdma->burst_nop && (i == 0)) >>>> ib->ptr[ib->length_dw++] = >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c >>>> index a10175838013..d340f179401a 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c >>>> @@ -255,7 +255,7 @@ static void sdma_v2_4_ring_emit_ib(struct amdgpu_ring *ring, >>>> unsigned vmid = AMDGPU_JOB_GET_VMID(job); >>>> >>>> /* IB packet must end on a 8 DW boundary */ >>>> - sdma_v2_4_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8); >>>> + sdma_v2_4_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7); >>>> >>>> amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) | >>>> SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf)); >>>> @@ -750,7 +750,7 @@ static void sdma_v2_4_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib >>>> u32 pad_count; >>>> int i; >>>> >>>> - pad_count = (8 - (ib->length_dw & 0x7)) % 8; >>>> + pad_count = (-ib->length_dw) & 7; >>>> for (i = 0; i < pad_count; i++) >>>> if (sdma && sdma->burst_nop && (i == 0)) >>>> ib->ptr[ib->length_dw++] = >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c >>>> index 5f4e2c616241..5c3c310188b6 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c >>>> @@ -429,7 +429,7 @@ static void sdma_v3_0_ring_emit_ib(struct amdgpu_ring *ring, >>>> unsigned vmid = AMDGPU_JOB_GET_VMID(job); >>>> >>>> /* IB packet must end on a 8 DW boundary */ >>>> - sdma_v3_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8); >>>> + sdma_v3_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7); >>>> >>>> amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) | >>>> SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf)); >>>> @@ -1021,7 +1021,7 @@ static void sdma_v3_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib >>>> u32 pad_count; >>>> int i; >>>> >>>> - pad_count = (8 - (ib->length_dw & 0x7)) % 8; >>>> + pad_count = (-ib->length_dw) & 7; >>>> for (i = 0; i < pad_count; i++) >>>> if (sdma && sdma->burst_nop && (i == 0)) >>>> ib->ptr[ib->length_dw++] = >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >>>> index 45bd538ba97e..7c71c88e38a4 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c >>>> @@ -698,7 +698,7 @@ static void sdma_v4_0_ring_emit_ib(struct amdgpu_ring *ring, >>>> unsigned vmid = AMDGPU_JOB_GET_VMID(job); >>>> >>>> /* IB packet must end on a 8 DW boundary */ >>>> - sdma_v4_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8); >>>> + sdma_v4_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7); >>>> >>>> amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) | >>>> SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf)); >>>> @@ -1580,7 +1580,7 @@ static void sdma_v4_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib >>>> u32 pad_count; >>>> int i; >>>> >>>> - pad_count = (8 - (ib->length_dw & 0x7)) % 8; >>>> + pad_count = (-ib->length_dw) & 7; >>>> for (i = 0; i < pad_count; i++) >>>> if (sdma && sdma->burst_nop && (i == 0)) >>>> ib->ptr[ib->length_dw++] = >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c >>>> index 0c41b4fdc58b..67ede9e4df01 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c >>>> @@ -382,8 +382,12 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring, >>>> unsigned vmid = AMDGPU_JOB_GET_VMID(job); >>>> uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr(ring, vmid); >>>> >>>> - /* IB packet must end on a 8 DW boundary */ >>>> - sdma_v5_0_ring_insert_nop(ring, (10 - (lower_32_bits(ring->wptr) & 7)) % 8); >>>> + /* An IB packet must end on a 8 DW boundary--the next dword must >>>> + * be on a 8-dword boundary. Our IB below is 6 dwords long, >>> That should probably read "our IB command" or "our IB packet". >> Done. >> >>>> + * thus add x number of NOPs, such that >>>> + * wptr + 6 + x = 8k, k >= 0. >>> Maybe write (wrptr + 6 + x) % 8 = 0 to make that more obvious. >> That's less obvious. In my mind I always translate it to >> a congruence expression. It's what I used to derive the algebraic >> simplifications of this patch. >> >> I'll add the highly cryptic "(wrptr + 6 + x) % 8 = 0" to make you happy, >> below the algebraic expression already there. > > The problem I see here is that people who are used to GPUs will think > that k is a float and not immediately get what's meant here. > >>>> + */ >>>> + sdma_v5_0_ring_insert_nop(ring, (2-lower_32_bits(ring->wptr)) & 7); >>> Spaces between operators please. >> Yeah, I wanted to put an emphasis on the AND-op, but I'll add them. > > What would be really nice to have is to keep the 10 as number of DW in > the IB packet around. Why? Why 10? As far as I can see, THE IB packet is 6 dwords, we want to pad with some number of dwords, in order to make sure that the wptr + 6, ends up on a boundary multiple of 8. I.e. we want to solve exactly this, for x: wptr + 6 + x = 8k, k >= 0. x = -(wtpr + 6) = -6 - wptr = 2 - wptr (mod 8) Also note that, 10 = 2 = -6 (mod 8). I believe the patch is good as is. Did you see v2 of the patch? Regards, Luben > > Maybe add an "static inline amdgpu_ring_unalign(ring, 10, 8)" helper or > something like that, the insert_nop is a ring callback anyway IIRC. > > Regards, > Christian. > >> >> Regards, >> Luben >> >> >>> Apart form that looks good to me, >>> Christian. >>> >>>> >>>> amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_INDIRECT) | >>>> SDMA_PKT_INDIRECT_HEADER_VMID(vmid & 0xf)); >>>> @@ -1086,10 +1090,10 @@ static void sdma_v5_0_vm_set_pte_pde(struct amdgpu_ib *ib, >>>> } >>>> >>>> /** >>>> - * sdma_v5_0_ring_pad_ib - pad the IB to the required number of dw >>>> - * >>>> + * sdma_v5_0_ring_pad_ib - pad the IB >>>> * @ib: indirect buffer to fill with padding >>>> * >>>> + * Pad the IB with NOPs to a boundary multiple of 8. >>>> */ >>>> static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib) >>>> { >>>> @@ -1097,7 +1101,7 @@ static void sdma_v5_0_ring_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib >>>> u32 pad_count; >>>> int i; >>>> >>>> - pad_count = (8 - (ib->length_dw & 0x7)) % 8; >>>> + pad_count = (-ib->length_dw) & 0x7; >>>> for (i = 0; i < pad_count; i++) >>>> if (sdma && sdma->burst_nop && (i == 0)) >>>> ib->ptr[ib->length_dw++] = > _______________________________________________ amd-gfx mailing list amd-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/amd-gfx