If we then convert some call sites to use this macro, we can look at
amdgpu_vce_ring_emit_ib() as an example of the results it brings.

Before (note this is already with wptr cached in a local variable;
without that change it was even worse):
173c39: 48 89 f8 mov %rdi,%rax
173c3c: 48 89 d1 mov %rdx,%rcx
173c3f: 48 8b 97 c8 01 00 00 mov 0x1c8(%rdi),%rdx
173c46: 48 8b b0 a8 01 00 00 mov 0x1a8(%rax),%rsi
173c4d: 89 d7 mov %edx,%edi
173c4f: 23 b8 f8 01 00 00 and 0x1f8(%rax),%edi
173c55: 48 83 c2 01 add $0x1,%rdx
173c59: c7 04 be 02 00 00 00 movl $0x2,(%rsi,%rdi,4)
173c60: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
173c67: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
173c6e: 48 8b b0 a8 01 00 00 mov 0x1a8(%rax),%rsi
173c75: 89 d7 mov %edx,%edi
173c77: 48 83 c2 01 add $0x1,%rdx
173c7b: 83 a8 e0 01 00 00 01 subl $0x1,0x1e0(%rax)
173c82: 4c 8b 41 10 mov 0x10(%rcx),%r8
173c86: 23 b8 f8 01 00 00 and 0x1f8(%rax),%edi
173c8c: 44 89 04 be mov %r8d,(%rsi,%rdi,4)
173c90: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
173c97: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
173c9e: 48 8b b0 a8 01 00 00 mov 0x1a8(%rax),%rsi
173ca5: 89 d7 mov %edx,%edi
173ca7: 48 83 c2 01 add $0x1,%rdx
173cab: 83 a8 e0 01 00 00 01 subl $0x1,0x1e0(%rax)
173cb2: 44 8b 41 14 mov 0x14(%rcx),%r8d
173cb6: 23 b8 f8 01 00 00 and 0x1f8(%rax),%edi
173cbc: 44 89 04 be mov %r8d,(%rsi,%rdi,4)
173cc0: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
173cc7: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
173cce: 89 d6 mov %edx,%esi
173cd0: 23 b0 f8 01 00 00 and 0x1f8(%rax),%esi
173cd6: 48 83 c2 01 add $0x1,%rdx
173cda: 83 a8 e0 01 00 00 01 subl $0x1,0x1e0(%rax)
173ce1: 8b 79 08 mov 0x8(%rcx),%edi
173ce4: 48 8b 88 a8 01 00 00 mov 0x1a8(%rax),%rcx
173ceb: 89 3c b1 mov %edi,(%rcx,%rsi,4)
173cee: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
173cf5: 83 a8 e0 01 00 00 01 subl $0x1,0x1e0(%rax)
173cfc: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
173d03: c3 ret
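The code above is generated from a call site which is, roughly, a
sequence of individual dword writes, along these lines (a sketch of the
current code, not part of this patch; the exact command macro and
struct amdgpu_ib field names are quoted from memory):

	amdgpu_ring_write(ring, VCE_CMD_IB);
	amdgpu_ring_write(ring, lower_32_bits(ib->gpu_addr));
	amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
	amdgpu_ring_write(ring, ib->length_dw);

Every call re-loads the ring buffer pointer and both masks, and stores
wptr and count_dw back, which is where the repeated accesses to the
same struct amdgpu_ring offsets in the listing come from.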
And after:
1579: 48 89 f8 mov %rdi,%rax
157c: 44 8b 4a 08 mov 0x8(%rdx),%r9d
1580: 48 8b 7a 10 mov 0x10(%rdx),%rdi
1584: 48 8b 90 c8 01 00 00 mov 0x1c8(%rax),%rdx
158b: 8b b0 f8 01 00 00 mov 0x1f8(%rax),%esi
1591: 48 8b 88 a8 01 00 00 mov 0x1a8(%rax),%rcx
1598: 49 89 d0 mov %rdx,%r8
159b: 49 21 f0 and %rsi,%r8
159e: 42 c7 04 81 02 00 00 movl $0x2,(%rcx,%r8,4)
15a5: 00
15a6: 4c 8d 42 01 lea 0x1(%rdx),%r8
15aa: 49 21 f0 and %rsi,%r8
15ad: 42 89 3c 81 mov %edi,(%rcx,%r8,4)
15b1: 4c 8d 42 02 lea 0x2(%rdx),%r8
15b5: 48 c1 ef 20 shr $0x20,%rdi
15b9: 49 21 f0 and %rsi,%r8
15bc: 42 89 3c 81 mov %edi,(%rcx,%r8,4)
15c0: 48 8d 7a 03 lea 0x3(%rdx),%rdi
15c4: 48 83 c2 04 add $0x4,%rdx
15c8: 48 21 fe and %rdi,%rsi
15cb: 44 89 0c b1 mov %r9d,(%rcx,%rsi,4)
15cf: 48 23 90 f0 01 00 00 and 0x1f0(%rax),%rdx
15d6: 83 a8 e0 01 00 00 04 subl $0x4,0x1e0(%rax)
15dd: 48 89 90 c8 01 00 00 mov %rdx,0x1c8(%rax)
15e4: c3 ret
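The same call site converted to the variadic form becomes a single call
(again only a sketch):

	amdgpu_ring_write(ring, VCE_CMD_IB,
			  lower_32_bits(ib->gpu_addr),
			  upper_32_bits(ib->gpu_addr),
			  ib->length_dw);

The macro selects amdgpu_ring_write4() from the argument count, so the
ring pointer and masks are loaded once and wptr and count_dw are
written back once, as seen in the shorter listing.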
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxx>
Cc: Christian König <christian.koenig@xxxxxxx>
Cc: Sunil Khatri <sunil.khatri@xxxxxxx>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 222 ++++++++++++++++++++++-
1 file changed, 220 insertions(+), 2 deletions(-)
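A note on the mechanism for reviewers: amdgpu_ring_write() below uses
the usual argument-counting trick. The caller's arguments shift the
list of helper names passed to AMDGPU_RING_WRITE(), so its NAME
parameter lands on the helper matching the argument count. A sketch of
the expansion for two values:

	amdgpu_ring_write(ring, a, b)
	  -> AMDGPU_RING_WRITE(ring, a, b,
			       amdgpu_ring_write11, ...,
			       amdgpu_ring_write1, NULL)(ring, a, b)
	  -> amdgpu_ring_write2(ring, a, b)

Here ring, a and b occupy _1 to _3, the helper names fill _4 to _12,
and NAME resolves to amdgpu_ring_write2.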
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index b57951d8c997..4f467864ed09 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -383,15 +383,233 @@ static inline void amdgpu_ring_clear_ring(struct amdgpu_ring *ring)
 	memset32(ring->ring, ring->funcs->nop, ring->buf_mask + 1);
 }
 
-static inline void amdgpu_ring_write(struct amdgpu_ring *ring, uint32_t v)
+static inline void
+amdgpu_ring_write1(struct amdgpu_ring *ring,
+		   u32 v1)
 {
+	const u32 buf_mask = ring->buf_mask;
 	u64 wptr = ring->wptr;
 
-	ring->ring[wptr++ & ring->buf_mask] = v;
+	ring->ring[wptr++ & buf_mask] = v1;
 	ring->wptr = wptr & ring->ptr_mask;
 	ring->count_dw--;
 }
 
+static inline void
+amdgpu_ring_write2(struct amdgpu_ring *ring,
+		   u32 v1, u32 v2)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 2;
+}
+
+static inline void
+amdgpu_ring_write3(struct amdgpu_ring *ring,
+		   u32 v1, u32 v2, u32 v3)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+	r[wptr++ & buf_mask] = v3;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 3;
+}
+
+static inline void
+amdgpu_ring_write4(struct amdgpu_ring *ring,
+		   u32 v1, u32 v2, u32 v3, u32 v4)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+	r[wptr++ & buf_mask] = v3;
+	r[wptr++ & buf_mask] = v4;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 4;
+}
+
+static inline void
+amdgpu_ring_write5(struct amdgpu_ring *ring,
+		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+	r[wptr++ & buf_mask] = v3;
+	r[wptr++ & buf_mask] = v4;
+	r[wptr++ & buf_mask] = v5;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 5;
+}
+
+static inline void
+amdgpu_ring_write6(struct amdgpu_ring *ring,
+		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+	r[wptr++ & buf_mask] = v3;
+	r[wptr++ & buf_mask] = v4;
+	r[wptr++ & buf_mask] = v5;
+	r[wptr++ & buf_mask] = v6;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 6;
+}
+
+static inline void
+amdgpu_ring_write7(struct amdgpu_ring *ring,
+		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+	r[wptr++ & buf_mask] = v3;
+	r[wptr++ & buf_mask] = v4;
+	r[wptr++ & buf_mask] = v5;
+	r[wptr++ & buf_mask] = v6;
+	r[wptr++ & buf_mask] = v7;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 7;
+}
+
+static inline void
+amdgpu_ring_write8(struct amdgpu_ring *ring,
+		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
+		   u32 v8)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+	r[wptr++ & buf_mask] = v3;
+	r[wptr++ & buf_mask] = v4;
+	r[wptr++ & buf_mask] = v5;
+	r[wptr++ & buf_mask] = v6;
+	r[wptr++ & buf_mask] = v7;
+	r[wptr++ & buf_mask] = v8;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 8;
+}
+
+static inline void
+amdgpu_ring_write9(struct amdgpu_ring *ring,
+		   u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
+		   u32 v8, u32 v9)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+	r[wptr++ & buf_mask] = v3;
+	r[wptr++ & buf_mask] = v4;
+	r[wptr++ & buf_mask] = v5;
+	r[wptr++ & buf_mask] = v6;
+	r[wptr++ & buf_mask] = v7;
+	r[wptr++ & buf_mask] = v8;
+	r[wptr++ & buf_mask] = v9;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 9;
+}
+
+static inline void
+amdgpu_ring_write10(struct amdgpu_ring *ring,
+		    u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
+		    u32 v8, u32 v9, u32 v10)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+	r[wptr++ & buf_mask] = v3;
+	r[wptr++ & buf_mask] = v4;
+	r[wptr++ & buf_mask] = v5;
+	r[wptr++ & buf_mask] = v6;
+	r[wptr++ & buf_mask] = v7;
+	r[wptr++ & buf_mask] = v8;
+	r[wptr++ & buf_mask] = v9;
+	r[wptr++ & buf_mask] = v10;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 10;
+}
+
+static inline void
+amdgpu_ring_write11(struct amdgpu_ring *ring,
+		    u32 v1, u32 v2, u32 v3, u32 v4, u32 v5, u32 v6, u32 v7,
+		    u32 v8, u32 v9, u32 v10, u32 v11)
+{
+	const u32 buf_mask = ring->buf_mask;
+	u64 wptr = ring->wptr;
+	u32 *r = ring->ring;
+
+	r[wptr++ & buf_mask] = v1;
+	r[wptr++ & buf_mask] = v2;
+	r[wptr++ & buf_mask] = v3;
+	r[wptr++ & buf_mask] = v4;
+	r[wptr++ & buf_mask] = v5;
+	r[wptr++ & buf_mask] = v6;
+	r[wptr++ & buf_mask] = v7;
+	r[wptr++ & buf_mask] = v8;
+	r[wptr++ & buf_mask] = v9;
+	r[wptr++ & buf_mask] = v10;
+	r[wptr++ & buf_mask] = v11;
+
+	ring->wptr = wptr & ring->ptr_mask;
+	ring->count_dw -= 11;
+}
+
+#define AMDGPU_RING_WRITE(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, NAME, ...) NAME
+#define amdgpu_ring_write(...) \
+	AMDGPU_RING_WRITE(__VA_ARGS__, \
+			  amdgpu_ring_write11, \
+			  amdgpu_ring_write10, \
+			  amdgpu_ring_write9, \
+			  amdgpu_ring_write8, \
+			  amdgpu_ring_write7, \
+			  amdgpu_ring_write6, \
+			  amdgpu_ring_write5, \
+			  amdgpu_ring_write4, \
+			  amdgpu_ring_write3, \
+			  amdgpu_ring_write2, \
+			  amdgpu_ring_write1, \
+			  NULL)(__VA_ARGS__)
+
 static inline void amdgpu_ring_write_multiple(struct amdgpu_ring *ring,
 					      void *src, int count_dw)
 {