Having noticed that typically 200+ nops per submission are filled into the ring, using a rather verbose one-nop-at-a-time-plus-ring-buffer-arithmetic approach as done in amdgpu_ring_write(), while right next to it there is amdgpu_ring_write_multiple(), I thought why not pre-cache a block of nops and write them out more efficiently. The patch is rather quick and dirty, does not deal with all insert_nops vfuncs, and the cache should probably go one level up so it is not replicated per amdgpu_ring instance. And performance gains are not that amazing for normal workloads. For instance, with a game which results in two submissions per frame, each padded with 222 nops, the submission worker thread profile changes from: + 90.78% 0.67% kworker/u32:3-e [kernel.kallsyms] [k] process_one_work + 48.92% 0.12% kworker/u32:3-e [kernel.kallsyms] [k] commit_tail + 41.18% 1.73% kworker/u32:3-e [kernel.kallsyms] [k] amdgpu_dm_atomic_commit_tail - 30.31% 0.67% kworker/u32:3-e [kernel.kallsyms] [k] drm_sched_run_job_work - 29.63% drm_sched_run_job_work + 8.55% dma_fence_add_callback - 7.50% amdgpu_job_run - 7.43% amdgpu_ib_schedule - 2.46% amdgpu_ring_commit 1.44% amdgpu_ring_insert_nop To: + 89.83% 0.51% kworker/u32:6-g [kernel.kallsyms] [k] process_one_work + 47.65% 0.30% kworker/u32:6-g [kernel.kallsyms] [k] commit_tail + 39.42% 1.97% kworker/u32:6-g [kernel.kallsyms] [k] amdgpu_dm_atomic_commit_tail - 29.57% 1.10% kworker/u32:6-g [kernel.kallsyms] [k] drm_sched_run_job_work - 28.47% drm_sched_run_job_work + 8.52% dma_fence_add_callback - 6.33% amdgpu_job_run - 6.19% amdgpu_ib_schedule - 1.85% amdgpu_ring_commit 0.53% amdgpu_ring_insert_nop Or if we run a more "spammy" workload, which does several orders of magnitude more submissions per second, we go from: + 79.38% 1.66% kworker/u32:1+e [kernel.kallsyms] [k] process_one_work - 63.13% 6.66% kworker/u32:1+e [kernel.kallsyms] [k] drm_sched_run_job_work - 56.47% drm_sched_run_job_work - 25.67% amdgpu_job_run - 24.40% amdgpu_ib_schedule - 15.29% 
amdgpu_ring_commit 12.06% amdgpu_ring_insert_nop To: + 77.76% 1.97% kworker/u32:6-f [kernel.kallsyms] [k] process_one_work - 60.15% 7.04% kworker/u32:6-f [kernel.kallsyms] [k] drm_sched_run_job_work - 53.11% drm_sched_run_job_work - 19.35% amdgpu_job_run - 17.85% amdgpu_ib_schedule - 7.75% amdgpu_ring_commit 3.27% amdgpu_ring_insert_nop Not bad and "every little helps", or flame-throwers at ready? Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 20 +++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c index ad49cecb20b8..22ec9de62b06 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c @@ -108,10 +108,14 @@ int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned int ndw) */ void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count) { - int i; + if (count > 1 && count <= ARRAY_SIZE(ring->nop_cache)) { + amdgpu_ring_write_multiple(ring, ring->nop_cache, count); + } else { + int i; - for (i = 0; i < count; i++) - amdgpu_ring_write(ring, ring->funcs->nop); + for (i = 0; i < count; i++) + amdgpu_ring_write(ring, ring->funcs->nop); + } } /** @@ -124,8 +128,11 @@ void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count) */ void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib) { - while (ib->length_dw & ring->funcs->align_mask) - ib->ptr[ib->length_dw++] = ring->funcs->nop; + u32 count = ib->length_dw & ring->funcs->align_mask; + + memcpy(&ib->ptr[ib->length_dw], ring->nop_cache, count * sizeof(u32)); + + ib->length_dw += count; } /** @@ -359,6 +366,9 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring, &ring->sched; } + for (r = 0; r < ARRAY_SIZE(ring->nop_cache); r++) + ring->nop_cache[r] = ring->funcs->nop; + return 0; } diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 582053f1cd56..74ce95b4666a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -246,6 +246,7 @@ struct amdgpu_ring { struct amdgpu_bo *ring_obj; volatile uint32_t *ring; unsigned rptr_offs; + u32 nop_cache[256]; u64 rptr_gpu_addr; volatile u32 *rptr_cpu_addr; u64 wptr; -- 2.44.0