By moving the booleans to flags and shrinking some fields we can stop spilling job allocation into the 1k SLAB even with two appended indirect buffers. End result for struct amdgpu_job: /* size: 448, cachelines: 7, members: 24 */ /* forced alignments: 1 */ So appending two IB buffers, which seems to be a typical case when running a game such as Cyberpunk 2077, makes every submission perfectly fit into the 512 byte SLAB with no wastage. Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxx> --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 19 ++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 7 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 29 +++++++++++------ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 6 +++- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 36 +++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 +++++----- 11 files changed, 72 insertions(+), 52 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 5cc5f59e3018..181296ea9df7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -108,13 +108,15 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p, struct drm_amdgpu_cs_chunk_ib *chunk_ib, unsigned int *num_ibs) { + struct amdgpu_job *job; int r; r = amdgpu_cs_job_idx(p, chunk_ib); if (r < 0) return r; - if (num_ibs[r] >= amdgpu_ring_max_ibs(chunk_ib->ip_type)) + if (num_ibs[r] >= amdgpu_ring_max_ibs(chunk_ib->ip_type) || + WARN_ON_ONCE(overflows_type(num_ibs[r], typeof(job->num_ibs)))) return -EINVAL; ++(num_ibs[r]); @@ -296,7 +298,8 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p, num_ibs[i], &p->jobs[i]); if (ret) goto free_all_kdata; - p->jobs[i]->enforce_isolation = p->adev->enforce_isolation[fpriv->xcp_id]; + if (p->adev->enforce_isolation[fpriv->xcp_id]) + p->jobs[i]->flags |= AMDGPU_ENFORCE_ISOLATION; } p->gang_leader = p->jobs[p->gang_leader_idx]; @@ -367,7 +370,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p, } if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE) - job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT; + job->flags |= AMDGPU_PREAMBLE_IB_PRESENT; r = amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ? chunk_ib->ib_bytes : 0, @@ -583,8 +586,8 @@ static int amdgpu_cs_p2_shadow(struct amdgpu_cs_parser *p, p->jobs[i]->shadow_va = shadow->shadow_va; p->jobs[i]->csa_va = shadow->csa_va; p->jobs[i]->gds_va = shadow->gds_va; - p->jobs[i]->init_shadow = - shadow->flags & AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW; + if (shadow->flags & AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW) + p->jobs[i]->flags |= AMDGPU_INIT_SHADOW; } return 0; @@ -1004,7 +1007,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p) { - int i, j; + unsigned int i, j; if (!trace_amdgpu_cs_enabled()) return; @@ -1352,9 +1355,9 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, p->fence); amdgpu_cs_post_dependencies(p); - if ((leader->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) && + if ((leader->flags & AMDGPU_PREAMBLE_IB_PRESENT) && !p->ctx->preamble_presented) { - leader->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST; + leader->flags |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST; p->ctx->preamble_presented = true; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 49ca8c814455..447345cb94db 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -1904,7 +1904,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring) job = to_amdgpu_job(s_job); if (preempted && (&job->hw_fence) == fence) /* mark the job as preempted */ - job->preemption_status |= AMDGPU_IB_PREEMPTED; + job->flags |= AMDGPU_IB_PREEMPTED; } spin_unlock(&sched->job_list_lock); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 784b03abb3a4..f224547b6863 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1430,7 +1430,7 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct amdgpu_ring *ring) if (r) goto err; - job->enforce_isolation = true; + job->flags |= AMDGPU_ENFORCE_ISOLATION; ib = &job->ibs[0]; for (i = 0; i <= ring->funcs->align_mask; ++i) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c index 1c19a65e6553..56058f18e0ab 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c @@ -658,7 +658,7 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, goto error_alloc; job->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gart.bo); - job->vm_needs_flush = true; + job->flags |= AMDGPU_VM_NEEDS_FLUSH; job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop; amdgpu_ring_pad_ib(ring, &job->ibs[0]); fence = amdgpu_job_submit(job); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index 2ea98ec60220..c6d11311dcd4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -153,7 +153,7 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned int num_ibs, shadow_va = job->shadow_va; csa_va = job->csa_va; gds_va = job->gds_va; - init_shadow = job->init_shadow; + init_shadow = job->flags & AMDGPU_INIT_SHADOW; } else { vm = NULL; fence_ctx = 0; @@ -235,8 +235,9 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned int num_ibs, status |= AMDGPU_HAVE_CTX_SWITCH; if (job && ring->funcs->emit_cntxcntl) { - status |= job->preamble_status; - status |= job->preemption_status; + status |= job->flags & (AMDGPU_PREAMBLE_IB_PRESENT | + AMDGPU_PREAMBLE_IB_PRESENT_FIRST | + AMDGPU_IB_PREEMPTED); amdgpu_ring_emit_cntxcntl(ring, status); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c index 8e712a11aba5..b9e18a806b5f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c @@ -276,8 +276,8 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, struct amdgpu_device *adev = ring->adev; unsigned vmhub = ring->vm_hub; struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; + uint32_t flags = vm->use_cpu_for_update ? AMDGPU_VM_NEEDS_FLUSH : 0; uint64_t fence_context = adev->fence_context + ring->idx; - bool needs_flush = vm->use_cpu_for_update; uint64_t updates = amdgpu_vm_tlb_seq(vm); int r; @@ -320,7 +320,7 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, return 0; } } - needs_flush = true; + flags = AMDGPU_VM_NEEDS_FLUSH; } /* Good we can use this VMID. Remember this submission as @@ -330,8 +330,7 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm, if (r) return r; - job->vm_needs_flush = needs_flush; - job->spm_update_needed = true; + job->flags = AMDGPU_SPM_UPDATE_NEEDED | flags; return 0; } @@ -357,7 +356,8 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm, uint64_t updates = amdgpu_vm_tlb_seq(vm); int r; - job->vm_needs_flush = vm->use_cpu_for_update; + if (vm->use_cpu_for_update) + job->flags |= AMDGPU_VM_NEEDS_FLUSH; /* Check if we can use a VMID already assigned to this VM */ list_for_each_entry_reverse((*id), &id_mgr->ids_lru, list) { @@ -389,7 +389,8 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm, if (r) return r; - job->vm_needs_flush |= needs_flush; + if (needs_flush) + job->flags |= AMDGPU_VM_NEEDS_FLUSH; return 0; } @@ -415,6 +416,7 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring, struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; struct amdgpu_vmid *idle = NULL; struct amdgpu_vmid *id = NULL; + unsigned int vmid; int r = 0; mutex_lock(&id_mgr->lock); @@ -441,19 +443,26 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring, if (r) goto error; - job->vm_needs_flush = true; + job->flags |= AMDGPU_VM_NEEDS_FLUSH; } list_move_tail(&id->list, &id_mgr->ids_lru); } - job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job); - if (job->vm_needs_flush) { + if (amdgpu_vmid_gds_switch_needed(id, job)) + job->flags |= AMDGPU_GDS_SWITCH_NEEDED; + if (job->flags & AMDGPU_VM_NEEDS_FLUSH) { id->flushed_updates = amdgpu_vm_tlb_seq(vm); dma_fence_put(id->last_flush); id->last_flush = NULL; } - job->vmid = id - id_mgr->ids; + + vmid = id - id_mgr->ids; + if (WARN_ON_ONCE(overflows_type(vmid, typeof(job->vmid)))) { + r = -ERANGE; + goto error; + } + job->vmid = vmid; job->pasid = vm->pasid; id->gds_base = job->gds_base; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index 1899c601c95c..dca54a47dfff 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -231,13 +231,17 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, void amdgpu_job_set_resources(struct amdgpu_job *job, struct amdgpu_bo *gds, struct amdgpu_bo *gws, struct amdgpu_bo *oa) { + uint32_t size; + if (gds) { job->gds_base = amdgpu_bo_gpu_offset(gds) >> PAGE_SHIFT; job->gds_size = amdgpu_bo_size(gds) >> PAGE_SHIFT; } if (gws) { job->gws_base = amdgpu_bo_gpu_offset(gws) >> PAGE_SHIFT; - job->gws_size = amdgpu_bo_size(gws) >> PAGE_SHIFT; + size = amdgpu_bo_size(gws) >> PAGE_SHIFT; + WARN_ON_ONCE(overflows_type(size, job->gws_size)); + job->gws_size = size; } if (oa) { job->oa_base = amdgpu_bo_gpu_offset(oa) >> PAGE_SHIFT; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 7e5fe3d73a06..edb18df42816 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -28,13 +28,18 @@ #include "amdgpu_ring.h" /* bit set means command submit involves a preamble IB */ -#define AMDGPU_PREAMBLE_IB_PRESENT (1 << 0) +#define AMDGPU_PREAMBLE_IB_PRESENT (1 << 0) /* bit set means preamble IB is first presented in belonging context */ -#define AMDGPU_PREAMBLE_IB_PRESENT_FIRST (1 << 1) +#define AMDGPU_PREAMBLE_IB_PRESENT_FIRST (1 << 1) /* bit set means context switch occured */ -#define AMDGPU_HAVE_CTX_SWITCH (1 << 2) +#define AMDGPU_HAVE_CTX_SWITCH (1 << 2) /* bit set means IB is preempted */ -#define AMDGPU_IB_PREEMPTED (1 << 3) +#define AMDGPU_IB_PREEMPTED (1 << 3) +#define AMDGPU_VM_NEEDS_FLUSH (1 << 4) +#define AMDGPU_GDS_SWITCH_NEEDED (1 << 5) +#define AMDGPU_SPM_UPDATE_NEEDED (1 << 6) +#define AMDGPU_ENFORCE_ISOLATION (1 << 7) +#define AMDGPU_INIT_SHADOW (1 << 8) #define to_amdgpu_job(sched_job) \ container_of((sched_job), struct amdgpu_job, base) @@ -50,21 +55,18 @@ struct amdgpu_job { struct amdgpu_sync explicit_sync; struct dma_fence hw_fence; struct dma_fence *gang_submit; - bool vm_needs_flush; - bool gds_switch_needed; - bool spm_update_needed; - bool enforce_isolation; - bool init_shadow; - unsigned vmid; - unsigned pasid; - uint32_t num_ibs; - uint32_t preamble_status; - uint32_t preemption_status; + + uint32_t pasid; + uint16_t flags; + uint16_t job_run_counter; /* >= 1 means a resubmit job */ + uint8_t vmid; + uint8_t num_ibs; + + uint16_t gws_size; /* in pages so enough and prevents holes */ + uint32_t gws_base; + uint32_t gds_base, gds_size; - uint32_t gws_base, gws_size; uint32_t oa_base, oa_size; - /* job_run_counter >= 1 means a resubmit job */ - uint32_t job_run_counter; uint64_t vm_pd_addr; uint64_t generation; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h index 383fce40d4dd..913e142564bd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h @@ -235,7 +235,7 @@ TRACE_EVENT(amdgpu_vm_grab_id, __entry->vmid = job->vmid; __entry->vm_hub = ring->vm_hub, __entry->pd_addr = job->vm_pd_addr; - __entry->needs_flush = job->vm_needs_flush; + __entry->needs_flush = job->flags & AMDGPU_VM_NEEDS_FLUSH; ), TP_printk("pasid=%d, ring=%s, id=%u, hub=%u, pd_addr=%010Lx needs_flush=%u", __entry->pasid, __get_str(ring), __entry->vmid, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 01ae2f88dec8..1f2d252802a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -2163,7 +2163,7 @@ static int amdgpu_ttm_prepare_job(struct amdgpu_device *adev, (*job)->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gmc.pdb0_bo ? adev->gmc.pdb0_bo : adev->gart.bo); - (*job)->vm_needs_flush = true; + (*job)->flags |= AMDGPU_VM_NEEDS_FLUSH; } if (!resv) return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c index 5c07777d3239..04aa1c75cbc5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c @@ -726,10 +726,11 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring, if (job->vmid == 0) return false; - if (job->vm_needs_flush || ring->has_compute_vm_bug) + if ((job->flags & AMDGPU_VM_NEEDS_FLUSH) || ring->has_compute_vm_bug) return true; - if (ring->funcs->emit_gds_switch && job->gds_switch_needed) + if (ring->funcs->emit_gds_switch && + (job->flags & AMDGPU_GDS_SWITCH_NEEDED)) return true; if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid])) @@ -756,11 +757,11 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, struct amdgpu_device *adev = ring->adev; unsigned vmhub = ring->vm_hub; struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub]; - struct amdgpu_vmid *id = &id_mgr->ids[job->vmid]; - bool spm_update_needed = job->spm_update_needed; bool gds_switch_needed = ring->funcs->emit_gds_switch && - job->gds_switch_needed; - bool vm_flush_needed = job->vm_needs_flush; + (job->flags & AMDGPU_GDS_SWITCH_NEEDED); + bool spm_update_needed = job->flags & AMDGPU_SPM_UPDATE_NEEDED; + bool vm_flush_needed = job->flags & AMDGPU_VM_NEEDS_FLUSH; + struct amdgpu_vmid *id = &id_mgr->ids[job->vmid]; struct dma_fence *fence = NULL; bool pasid_mapping_needed = false; unsigned int patch; @@ -786,7 +787,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, ring->funcs->emit_wreg; if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync && - !(job->enforce_isolation && !job->vmid)) + !((job->flags & AMDGPU_ENFORCE_ISOLATION) && !job->vmid)) return 0; amdgpu_ring_ib_begin(ring); @@ -799,7 +800,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job, if (adev->gfx.enable_cleaner_shader && ring->funcs->emit_cleaner_shader && - job->enforce_isolation) + (job->flags & AMDGPU_ENFORCE_ISOLATION)) ring->funcs->emit_cleaner_shader(ring); if (vm_flush_needed) { -- 2.48.0