[PATCH 3/3] drm/amdpug: More more efficient ring padding

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxx>

Similarly as in the previous patch, we add a new amdgpu_ring_fill64()
helper which can write out the nops more efficiently using memset64().

This should have a lesser effect than the previous patch, given how the
affected rings have at most 64 dword alignment restriction, in contrast to
many with 256 dwords back there. Nevertheless, it sounds plausible to
utilise the identical approach and still achieve that little bit more
efficiency.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxx>
Cc: Christian König <ckoenig.leichtzumerken@xxxxxxxxx>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 31 ++++++++++++++++++++++++
 drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c   |  7 +-----
 drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c   |  7 +-----
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c |  7 +-----
 drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c    |  7 +-----
 drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c    |  7 +-----
 drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c    |  7 +-----
 drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c    |  7 +-----
 drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c    |  8 +++---
 drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c    |  8 +++---
 drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c    |  6 +----
 11 files changed, 45 insertions(+), 57 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 2264d7502eb6..4757bc56df0d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -439,6 +439,37 @@ static inline void amdgpu_ring_fill(struct amdgpu_ring *ring, u32 v,
 	ring->count_dw -= count_dw;
 }
 
+static inline void amdgpu_ring_fill64(struct amdgpu_ring *ring, u64 v,
+				      unsigned int count_qw)
+{
+	unsigned int occupied, chunk1, chunk2;
+	unsigned int count_dw = count_qw * 2;
+
+	if (unlikely(ring->count_dw < count_dw))
+		DRM_ERROR("amdgpu: writing more qwords to the ring than expected!\n");
+
+	occupied = ring->wptr & ring->buf_mask;
+	chunk1 = ring->buf_mask + 1 - occupied;
+	chunk1 = (chunk1 >= count_dw) ? count_dw : chunk1;
+	chunk2 = count_dw - chunk1;
+
+	if (WARN_ON_ONCE((chunk1 | chunk2) & 1))
+		return;
+
+	chunk1 >>= 1;
+	chunk2 >>= 1;
+
+	if (chunk1)
+		memset64((u64 *)&ring->ring[occupied], v, chunk1);
+
+	if (chunk2)
+		memset64((u64 *)&ring->ring[0], v, chunk2);
+
+	ring->wptr += count_dw;
+	ring->wptr &= ring->ptr_mask;
+	ring->count_dw -= count_dw;
+}
+
 /**
  * amdgpu_ring_patch_cond_exec - patch dw count of conditional execute
  * @ring: amdgpu_ring structure
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c
index 71f43a5c7f72..9aba7002cac6 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v1_0.c
@@ -411,14 +411,9 @@ static void jpeg_v1_0_decode_ring_emit_wreg(struct amdgpu_ring *ring,
 
 static void jpeg_v1_0_decode_ring_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6), count / 2);
 }
 
 static int jpeg_v1_0_set_interrupt_state(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c
index 99adf3625657..83a10968a4a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v2_0.c
@@ -642,14 +642,9 @@ void jpeg_v2_0_dec_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint32
 
 void jpeg_v2_0_dec_ring_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6), count / 2);
 }
 
 static bool jpeg_v2_0_is_idle(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
index 04d8966423de..62e19b1c918b 100644
--- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
@@ -876,14 +876,9 @@ void jpeg_v4_0_3_dec_ring_emit_wreg(struct amdgpu_ring *ring, uint32_t reg, uint
 
 void jpeg_v4_0_3_dec_ring_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring, PACKETJ(0, 0, 0, PACKETJ_TYPE6), count / 2);
 }
 
 static bool jpeg_v4_0_3_is_idle(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
index 805d6662c88b..8e3ac9aa9209 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v3_1.c
@@ -166,14 +166,9 @@ static int uvd_v3_1_ring_test_ring(struct amdgpu_ring *ring)
 
 static void uvd_v3_1_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(mmUVD_NO_OP, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring, PACKET0(mmUVD_NO_OP, 0), count / 2);
 }
 
 static const struct amdgpu_ring_funcs uvd_v3_1_ring_funcs = {
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
index 3f19c606f4de..cbd495a92afe 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v4_2.c
@@ -554,14 +554,9 @@ static void uvd_v4_2_ring_emit_ib(struct amdgpu_ring *ring,
 
 static void uvd_v4_2_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(mmUVD_NO_OP, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring, PACKET0(mmUVD_NO_OP, 0), count / 2);
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
index efd903c21d48..18ce4cece85e 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v5_0.c
@@ -571,14 +571,9 @@ static void uvd_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
 
 static void uvd_v5_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(mmUVD_NO_OP, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring, PACKET0(mmUVD_NO_OP, 0), count / 2);
 }
 
 static bool uvd_v5_0_is_idle(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
index 495de5068455..a65696b58488 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v6_0.c
@@ -1107,14 +1107,9 @@ static void uvd_v6_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
 
 static void uvd_v6_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
-	int i;
-
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(mmUVD_NO_OP, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring, PACKET0(mmUVD_NO_OP, 0), count / 2);
 }
 
 static void uvd_v6_0_enc_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
diff --git a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
index 6068b784dc69..75a3a4a38a6a 100644
--- a/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/uvd_v7_0.c
@@ -1417,14 +1417,12 @@ static void uvd_v7_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 static void uvd_v7_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
 	struct amdgpu_device *adev = ring->adev;
-	int i;
 
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(SOC15_REG_OFFSET(UVD, ring->me, mmUVD_NO_OP), 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring,
+			   PACKET0(SOC15_REG_OFFSET(UVD, ring->me, mmUVD_NO_OP), 0),
+			   count / 2);
 }
 
 static void uvd_v7_0_enc_ring_insert_end(struct amdgpu_ring *ring)
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
index a280b9fecb77..f0ba65639e5f 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v1_0.c
@@ -1742,14 +1742,12 @@ static int vcn_v1_0_process_interrupt(struct amdgpu_device *adev,
 static void vcn_v1_0_dec_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
 	struct amdgpu_device *adev = ring->adev;
-	int i;
 
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(SOC15_REG_OFFSET(UVD, 0, mmUVD_NO_OP), 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring,
+			   PACKET0(SOC15_REG_OFFSET(UVD, 0, mmUVD_NO_OP), 0),
+			   count / 2);
 }
 
 static int vcn_v1_0_set_powergating_state(void *handle,
diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
index d3d096909a7f..f5af5ceecc50 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v2_0.c
@@ -1406,14 +1406,10 @@ void vcn_v2_0_dec_ring_insert_end(struct amdgpu_ring *ring)
 void vcn_v2_0_dec_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
 {
 	struct amdgpu_device *adev = ring->adev;
-	int i;
 
 	WARN_ON(ring->wptr % 2 || count % 2);
 
-	for (i = 0; i < count / 2; i++) {
-		amdgpu_ring_write(ring, PACKET0(adev->vcn.internal.nop, 0));
-		amdgpu_ring_write(ring, 0);
-	}
+	amdgpu_ring_fill64(ring, PACKET0(adev->vcn.internal.nop, 0), count / 2);
 }
 
 /**
-- 
2.44.0




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux