On Fri, Sep 16, 2022 at 5:09 AM Christian König
<ckoenig.leichtzumerken@xxxxxxxxx> wrote:
>
> Allows submitting jobs as gang which needs to run on multiple engines at the
> same time.
>
> All members of the gang get the same implicit, explicit and VM dependencies. So
> no gang member will start running until everything else is ready.
>
> The last job is considered the gang leader (usually a submission to the GFX
> ring) and used for signaling output dependencies.
>
> Each job is remembered individually as user of a buffer object, so there is no
> joining of work at the end.
>
> v2: rebase and fix review comments from Andrey and Yogesh
> v3: use READ instead of BOOKKEEP for now because of VM unmaps, set gang
>     leader only when necessary
> v4: fix order of pushing jobs and adding fences found by Trigger.
> v5: fix job index calculation and adding IBs to jobs
>
> Signed-off-by: Christian König <christian.koenig@xxxxxxx>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     | 269 ++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h     |  10 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c    |   2 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h  |  12 +-
>  5 files changed, 195 insertions(+), 99 deletions(-)
>
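Just to double check my understanding of the uAPI side: a gang is
expressed purely by putting IBs for more than one entity into a single
CS ioctl, with the GFX IB last so that its job becomes the leader?
So roughly like this from userspace (untested sketch against libdrm's
amdgpu_cs_submit_raw2(); the context/BO-list setup and the two IB VAs
are placeholders):

    #include <stdint.h>
    #include <string.h>
    #include <amdgpu.h>
    #include <amdgpu_drm.h>

    /* Sketch: submit a two member gang (compute + gfx) in one CS ioctl. */
    static int submit_gang(amdgpu_device_handle dev, amdgpu_context_handle ctx,
                           uint32_t bo_list, uint64_t compute_ib_va,
                           uint64_t gfx_ib_va, uint32_t ib_bytes, uint64_t *seq)
    {
            struct drm_amdgpu_cs_chunk_ib ib[2];
            struct drm_amdgpu_cs_chunk chunk[2];
            int i;

            memset(ib, 0, sizeof(ib));
            ib[0].ip_type = AMDGPU_HW_IP_COMPUTE;
            ib[0].va_start = compute_ib_va;
            ib[0].ib_bytes = ib_bytes;

            /* last IB -> gang leader, signals the output dependencies */
            ib[1].ip_type = AMDGPU_HW_IP_GFX;
            ib[1].va_start = gfx_ib_va;
            ib[1].ib_bytes = ib_bytes;

            for (i = 0; i < 2; ++i) {
                    chunk[i].chunk_id = AMDGPU_CHUNK_ID_IB;
                    chunk[i].length_dw = sizeof(ib[i]) / 4;
                    chunk[i].chunk_data = (uintptr_t)&ib[i];
            }

            return amdgpu_cs_submit_raw2(dev, ctx, bo_list, 2, chunk, seq);
    }
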
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 4f5bd96000ec..c7b1a2dfde13 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -686,6 +686,7 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device *adev,
>         ib->length_dw = ib_len;
>         /* This works for NO_HWS. TODO: need to handle without knowing VMID */
>         job->vmid = vmid;
> +       job->num_ibs = 1;
>
>         ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 58088c663125..964052377991 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -64,11 +64,11 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p,
>         return 0;
>  }
>
> -static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
> -                          struct drm_amdgpu_cs_chunk_ib *chunk_ib,
> -                          unsigned int *num_ibs)
> +static int amdgpu_cs_job_idx(struct amdgpu_cs_parser *p,
> +                            struct drm_amdgpu_cs_chunk_ib *chunk_ib)
>  {
>         struct drm_sched_entity *entity;
> +       unsigned int i;
>         int r;
>
>         r = amdgpu_ctx_get_entity(p->ctx, chunk_ib->ip_type,
> @@ -77,17 +77,38 @@ static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
>         if (r)
>                 return r;
>
> -       /* Abort if there is no run queue associated with this entity.
> -        * Possibly because of disabled HW IP*/
> +       /*
> +        * Abort if there is no run queue associated with this entity.
> +        * Possibly because of disabled HW IP.
> +        */
>         if (entity->rq == NULL)
>                 return -EINVAL;
>
> -       /* Currently we don't support submitting to multiple entities */
> -       if (p->entity && p->entity != entity)
> +       /* Check if we can add this IB to some existing job */
> +       for (i = 0; i < p->gang_size; ++i)
> +               if (p->entities[i] == entity)
> +                       return i;
> +
> +       /* If not increase the gang size if possible */
> +       if (i == AMDGPU_CS_GANG_SIZE)
>                 return -EINVAL;
>
> -       p->entity = entity;
> -       ++(*num_ibs);
> +       p->entities[i] = entity;
> +       p->gang_size = i + 1;
> +       return i;
> +}
> +
> +static int amdgpu_cs_p1_ib(struct amdgpu_cs_parser *p,
> +                          struct drm_amdgpu_cs_chunk_ib *chunk_ib,
> +                          unsigned int *num_ibs)
> +{
> +       int r;
> +
> +       r = amdgpu_cs_job_idx(p, chunk_ib);
> +       if (r < 0)
> +               return r;
> +
> +       ++(num_ibs[r]);
>         return 0;
>  }
>
> @@ -161,11 +182,12 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
>                            union drm_amdgpu_cs *cs)
>  {
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> +       unsigned int num_ibs[AMDGPU_CS_GANG_SIZE] = { };
>         struct amdgpu_vm *vm = &fpriv->vm;
>         uint64_t *chunk_array_user;
>         uint64_t *chunk_array;
> -       unsigned size, num_ibs = 0;
>         uint32_t uf_offset = 0;
> +       unsigned int size;
>         int ret;
>         int i;
>
> @@ -228,7 +250,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
>                         if (size < sizeof(struct drm_amdgpu_cs_chunk_ib))
>                                 goto free_partial_kdata;
>
> -                       ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, &num_ibs);
> +                       ret = amdgpu_cs_p1_ib(p, p->chunks[i].kdata, num_ibs);
>                         if (ret)
>                                 goto free_partial_kdata;
>                         break;
> @@ -265,21 +287,28 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
>                 }
>         }
>
> -       ret = amdgpu_job_alloc(p->adev, num_ibs, &p->job, vm);
> -       if (ret)
> -               goto free_all_kdata;
> +       if (!p->gang_size)
> +               return -EINVAL;
>
> -       ret = drm_sched_job_init(&p->job->base, p->entity, &fpriv->vm);
> -       if (ret)
> -               goto free_all_kdata;
> +       for (i = 0; i < p->gang_size; ++i) {
> +               ret = amdgpu_job_alloc(p->adev, num_ibs[i], &p->jobs[i], vm);
> +               if (ret)
> +                       goto free_all_kdata;
>
> -       if (p->ctx->vram_lost_counter != p->job->vram_lost_counter) {
> +               ret = drm_sched_job_init(&p->jobs[i]->base, p->entities[i],
> +                                        &fpriv->vm);
> +               if (ret)
> +                       goto free_all_kdata;
> +       }
> +       p->gang_leader = p->jobs[p->gang_size - 1];
> +
> +       if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) {
>                 ret = -ECANCELED;
>                 goto free_all_kdata;
>         }
>
>         if (p->uf_entry.tv.bo)
> -               p->job->uf_addr = uf_offset;
> +               p->gang_leader->uf_addr = uf_offset;
>         kvfree(chunk_array);
>
>         /* Use this opportunity to fill in task info for the vm */
> @@ -303,17 +332,25 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
>
>  static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
>                            struct amdgpu_cs_chunk *chunk,
> -                          unsigned int *num_ibs,
>                            unsigned int *ce_preempt,
>                            unsigned int *de_preempt)
>  {
>         struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> -       struct amdgpu_ring *ring = amdgpu_job_ring(p->job);
> -       struct amdgpu_ib *ib = &p->job->ibs[*num_ibs];
>         struct amdgpu_vm *vm = &fpriv->vm;
> +       struct amdgpu_ring *ring;
> +       struct amdgpu_job *job;
> +       struct amdgpu_ib *ib;
>         int r;
>
> +       r = amdgpu_cs_job_idx(p, chunk_ib);
> +       if (r < 0)
> +               return r;
> +
> +       job = p->jobs[r];
> +       ring = amdgpu_job_ring(job);
> +       ib = &job->ibs[job->num_ibs++];
> +
>         /* MM engine doesn't support user fences */
>         if (p->uf_entry.tv.bo && ring->funcs->no_user_fence)
>                 return -EINVAL;
> @@ -332,7 +369,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
>         }
>
>         if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
> -               p->job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
> +               job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
>
>         r = amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ?
>                           chunk_ib->ib_bytes : 0,
> @@ -345,8 +382,6 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
>         ib->gpu_addr = chunk_ib->va_start;
>         ib->length_dw = chunk_ib->ib_bytes / 4;
>         ib->flags = chunk_ib->flags;
> -
> -       (*num_ibs)++;
>         return 0;
>  }
>
> @@ -395,7 +430,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
>                 dma_fence_put(old);
>         }
>
> -       r = amdgpu_sync_fence(&p->job->sync, fence);
> +       r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
>         dma_fence_put(fence);
>         if (r)
>                 return r;
> @@ -417,7 +452,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
>                 return r;
>         }
>
> -       r = amdgpu_sync_fence(&p->job->sync, fence);
> +       r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
>         dma_fence_put(fence);
>
>         return r;
> @@ -540,7 +575,7 @@ static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p,
>
>  static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
>  {
> -       unsigned int num_ibs = 0, ce_preempt = 0, de_preempt = 0;
> +       unsigned int ce_preempt = 0, de_preempt = 0;
>         int i, r;
>
>         for (i = 0; i < p->nchunks; ++i) {
> @@ -550,8 +585,7 @@ static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
>
>                 switch (chunk->chunk_id) {
>                 case AMDGPU_CHUNK_ID_IB:
> -                       r = amdgpu_cs_p2_ib(p, chunk, &num_ibs,
> -                                           &ce_preempt, &de_preempt);
> +                       r = amdgpu_cs_p2_ib(p, chunk, &ce_preempt, &de_preempt);
>                         if (r)
>                                 return r;
>                         break;
> @@ -822,6 +856,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>         struct amdgpu_vm *vm = &fpriv->vm;
>         struct amdgpu_bo_list_entry *e;
>         struct list_head duplicates;
> +       unsigned int i;
>         int r;
>
>         INIT_LIST_HEAD(&p->validated);
> @@ -905,16 +940,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>                 e->bo_va = amdgpu_vm_bo_find(vm, bo);
>         }
>
> -       /* Move fence waiting after getting reservation lock of
> -        * PD root. Then there is no need on a ctx mutex lock.
> -        */
> -       r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
> -       if (unlikely(r != 0)) {
> -               if (r != -ERESTARTSYS)
> -                       DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> -               goto error_validate;
> -       }
> -
>         amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
>                                           &p->bytes_moved_vis_threshold);
>         p->bytes_moved = 0;
> @@ -942,13 +967,16 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>                 if (r)
>                         goto error_validate;
>
> -               p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
> +               p->gang_leader->uf_addr += amdgpu_bo_gpu_offset(uf);
>         }
>
>         amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
>                                      p->bytes_moved_vis);
> -       amdgpu_job_set_resources(p->job, p->bo_list->gds_obj,
> -                                p->bo_list->gws_obj, p->bo_list->oa_obj);
> +
> +       for (i = 0; i < p->gang_size; ++i)
> +               amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
> +                                        p->bo_list->gws_obj,
> +                                        p->bo_list->oa_obj);
>         return 0;
>
>  error_validate:
> @@ -967,20 +995,24 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>         return r;
>  }
>
> -static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
> +static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p)
>  {
> -       int i;
> +       int i, j;
>
>         if (!trace_amdgpu_cs_enabled())
>                 return;
>
> -       for (i = 0; i < parser->job->num_ibs; i++)
> -               trace_amdgpu_cs(parser, i);
> +       for (i = 0; i < p->gang_size; ++i) {
> +               struct amdgpu_job *job = p->jobs[i];
> +
> +               for (j = 0; j < job->num_ibs; ++j)
> +                       trace_amdgpu_cs(p, job, &job->ibs[j]);
> +       }
>  }
>
> -static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
> +static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p,
> +                              struct amdgpu_job *job)
>  {
> -       struct amdgpu_job *job = p->job;
>         struct amdgpu_ring *ring = amdgpu_job_ring(job);
>         unsigned int i;
>         int r;
> @@ -1021,12 +1053,12 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
>                 memcpy(ib->ptr, kptr, ib->length_dw * 4);
>                 amdgpu_bo_kunmap(aobj);
>
> -               r = amdgpu_ring_parse_cs(ring, p, p->job, ib);
> +               r = amdgpu_ring_parse_cs(ring, p, job, ib);
>                 if (r)
>                         return r;
>         } else {
>                 ib->ptr = (uint32_t *)kptr;
> -               r = amdgpu_ring_patch_cs_in_place(ring, p, p->job, ib);
> +               r = amdgpu_ring_patch_cs_in_place(ring, p, job, ib);
>                 amdgpu_bo_kunmap(aobj);
>                 if (r)
>                         return r;
> @@ -1036,19 +1068,31 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
>         return 0;
>  }
>
> +static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p)
> +{
> +       unsigned int i;
> +       int r;
> +
> +       for (i = 0; i < p->gang_size; ++i) {
> +               r = amdgpu_cs_patch_ibs(p, p->jobs[i]);
> +               if (r)
> +                       return r;
> +       }
> +       return 0;
> +}
> +
>  static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  {
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> +       struct amdgpu_job *job = p->gang_leader;
>         struct amdgpu_device *adev = p->adev;
>         struct amdgpu_vm *vm = &fpriv->vm;
>         struct amdgpu_bo_list_entry *e;
>         struct amdgpu_bo_va *bo_va;
>         struct amdgpu_bo *bo;
> +       unsigned int i;
>         int r;
>
> -       if (!p->job->vm)
> -               return 0;
> -
>         r = amdgpu_vm_clear_freed(adev, vm, NULL);
>         if (r)
>                 return r;
> @@ -1057,7 +1101,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>         if (r)
>                 return r;
>
> -       r = amdgpu_sync_fence(&p->job->sync, fpriv->prt_va->last_pt_update);
> +       r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
>         if (r)
>                 return r;
>
> @@ -1068,7 +1112,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>                 if (r)
>                         return r;
>
> -               r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
> +               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
>                 if (r)
>                         return r;
>         }
> @@ -1087,7 +1131,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>                 if (r)
>                         return r;
>
> -               r = amdgpu_sync_fence(&p->job->sync, bo_va->last_pt_update);
> +               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
>                 if (r)
>                         return r;
>         }
> @@ -1100,11 +1144,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>         if (r)
>                 return r;
>
> -       r = amdgpu_sync_fence(&p->job->sync, vm->last_update);
> +       r = amdgpu_sync_fence(&job->sync, vm->last_update);
>         if (r)
>                 return r;
>
> -       p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
> +       for (i = 0; i < p->gang_size; ++i) {
> +               job = p->jobs[i];
> +
> +               if (!job->vm)
> +                       continue;
> +
> +               job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
> +       }
>
>         if (amdgpu_vm_debug) {
>                 /* Invalidate all BOs to test for userspace bugs */
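If I read the VM handling right, all page table update fences still go
into the gang leader's sync only, and the other members pick them up
through the amdgpu_sync_clone() in amdgpu_cs_sync_rings() below, so
"all members get the same VM dependencies" from the commit message
still holds. Condensed to the essentials as I understand this hunk
(error handling dropped, not standalone code):

    struct amdgpu_job *job = p->gang_leader;

    /* every PT update fence lands in the leader's sync ... */
    amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
    /* ... same for the per-BO bo_va->last_pt_update fences ... */
    amdgpu_sync_fence(&job->sync, vm->last_update);

    /* ... but the PD address is set on every member that uses a VM */
    for (i = 0; i < p->gang_size; ++i)
            if (p->jobs[i]->vm)
                    p->jobs[i]->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
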
> @@ -1125,7 +1176,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>  {
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> +       struct amdgpu_job *leader = p->gang_leader;
>         struct amdgpu_bo_list_entry *e;
> +       unsigned int i;
>         int r;
>
>         list_for_each_entry(e, &p->validated, tv.head) {
> @@ -1135,12 +1188,23 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>
>                 sync_mode = amdgpu_bo_explicit_sync(bo) ?
>                         AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
> -               r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode,
> +               r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
>                                      &fpriv->vm);
>                 if (r)
>                         return r;
>         }
> -       return 0;
> +
> +       for (i = 0; i < p->gang_size - 1; ++i) {
> +               r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
> +               if (r)
> +                       return r;
> +       }
> +
> +       r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_size - 1]);
> +       if (r && r != -ERESTARTSYS)
> +               DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> +
> +       return r;
>  }
>
>  static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
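So the per-BO implicit/explicit dependencies are gathered once into the
leader's sync and then fanned out to every other member, and the wait
for the context's previous fence moved here from amdgpu_cs_parser_bos()
and is done against the leader's entity. In short (again just my
condensed reading, not standalone code):

    list_for_each_entry(e, &p->validated, tv.head)
            amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
                             &fpriv->vm);   /* leader collects everything */

    for (i = 0; i < p->gang_size - 1; ++i)  /* members get a copy */
            amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);

    amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_size - 1]);
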
> @@ -1164,16 +1228,28 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>                            union drm_amdgpu_cs *cs)
>  {
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> -       struct drm_sched_entity *entity = p->entity;
> +       struct amdgpu_job *leader = p->gang_leader;
>         struct amdgpu_bo_list_entry *e;
> -       struct amdgpu_job *job;
> +       unsigned int i;
>         uint64_t seq;
>         int r;
>
> -       job = p->job;
> -       p->job = NULL;
> +       for (i = 0; i < p->gang_size; ++i)
> +               drm_sched_job_arm(&p->jobs[i]->base);
>
> -       drm_sched_job_arm(&job->base);
> +       for (i = 0; i < (p->gang_size - 1); ++i) {
> +               struct dma_fence *fence;
> +
> +               fence = &p->jobs[i]->base.s_fence->scheduled;
> +               r = amdgpu_sync_fence(&leader->sync, fence);
> +               if (r)
> +                       goto error_cleanup;
> +       }
> +
> +       if (p->gang_size > 1) {
> +               for (i = 0; i < p->gang_size; ++i)
> +                       amdgpu_job_set_gang_leader(p->jobs[i], leader);
> +       }
>
>         /* No memory allocation is allowed while holding the notifier lock.
>          * The lock is held until amdgpu_cs_submit is finished and fence is
> @@ -1191,45 +1267,57 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>         }
>         if (r) {
>                 r = -EAGAIN;
> -               goto error_abort;
> +               goto error_unlock;
>         }
>
> -       p->fence = dma_fence_get(&job->base.s_fence->finished);
> +       p->fence = dma_fence_get(&leader->base.s_fence->finished);
> +       list_for_each_entry(e, &p->validated, tv.head) {
> +
> +               /* Everybody except for the gang leader uses READ */
> +               for (i = 0; i < (p->gang_size - 1); ++i) {
> +                       dma_resv_add_fence(e->tv.bo->base.resv,
> +                                          &p->jobs[i]->base.s_fence->finished,
> +                                          DMA_RESV_USAGE_READ);
> +               }
> +
> +               /* The gang leader as remembered as writer */

typo: The gang leader IS remembered as writer.

> +               e->tv.num_shared = 0;
> +       }
>
> -       seq = amdgpu_ctx_add_fence(p->ctx, entity, p->fence);
> +       seq = amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_size - 1],
> +                                  p->fence);
>         amdgpu_cs_post_dependencies(p);
>
> -       if ((job->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
> +       if ((leader->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
>             !p->ctx->preamble_presented) {
> -               job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
> +               leader->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
>                 p->ctx->preamble_presented = true;
>         }
>
>         cs->out.handle = seq;
> -       job->uf_sequence = seq;
> +       leader->uf_sequence = seq;
>
> -       amdgpu_job_free_resources(job);
> -
> -       trace_amdgpu_cs_ioctl(job);
>         amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->ticket);
> -       drm_sched_entity_push_job(&job->base);
> +       for (i = 0; i < p->gang_size; ++i) {
> +               amdgpu_job_free_resources(p->jobs[i]);
> +               trace_amdgpu_cs_ioctl(p->jobs[i]);
> +               drm_sched_entity_push_job(&p->jobs[i]->base);
> +               p->jobs[i] = NULL;
> +       }
>
>         amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
> -
> -       /* Make sure all BOs are remembered as writers */
> -       amdgpu_bo_list_for_each_entry(e, p->bo_list)
> -               e->tv.num_shared = 0;
> -
>         ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
> +
>         mutex_unlock(&p->adev->notifier_lock);
>         mutex_unlock(&p->bo_list->bo_list_mutex);
> -
>         return 0;
>
> -error_abort:
> -       drm_sched_job_cleanup(&job->base);
> +error_unlock:
>         mutex_unlock(&p->adev->notifier_lock);
> -       amdgpu_job_free(job);
> +
> +error_cleanup:
> +       for (i = 0; i < p->gang_size; ++i)
> +               drm_sched_job_cleanup(&p->jobs[i]->base);
>         return r;
>  }
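The ordering here took me a moment, so spelling it out to confirm I got
it: arm everything first, then make each member's *scheduled* fence a
dependency of the leader, so the leader cannot reach the hardware before
every member has at least been picked up by its scheduler, and only the
leader's *finished* fence is exposed as the writer / CS fence. Condensed
(error paths dropped, not standalone code):

    for (i = 0; i < p->gang_size; ++i)
            drm_sched_job_arm(&p->jobs[i]->base);   /* scheduled fences exist now */

    for (i = 0; i < p->gang_size - 1; ++i)          /* members gate the leader */
            amdgpu_sync_fence(&leader->sync,
                              &p->jobs[i]->base.s_fence->scheduled);

    p->fence = dma_fence_get(&leader->base.s_fence->finished);
    /* members end up as READ fences on each resv, the leader as writer */
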
> @@ -1246,17 +1334,18 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)
>
>         dma_fence_put(parser->fence);
>
> -       if (parser->ctx) {
> +       if (parser->ctx)
>                 amdgpu_ctx_put(parser->ctx);
> -       }
>         if (parser->bo_list)
>                 amdgpu_bo_list_put(parser->bo_list);
>
>         for (i = 0; i < parser->nchunks; i++)
>                 kvfree(parser->chunks[i].kdata);
>         kvfree(parser->chunks);
> -       if (parser->job)
> -               amdgpu_job_free(parser->job);
> +       for (i = 0; i < parser->gang_size; ++i) {
> +               if (parser->jobs[i])
> +                       amdgpu_job_free(parser->jobs[i]);
> +       }
>         if (parser->uf_entry.tv.bo) {
>                 struct amdgpu_bo *uf = ttm_to_amdgpu_bo(parser->uf_entry.tv.bo);
>
> @@ -1300,7 +1389,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>                 goto error_fini;
>         }
>
> -       r = amdgpu_cs_patch_ibs(&parser);
> +       r = amdgpu_cs_patch_jobs(&parser);
>         if (r)
>                 goto error_backoff;
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> index 30ecc4917f81..cbaa19b2b8a3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> @@ -27,6 +27,8 @@
>  #include "amdgpu_bo_list.h"
>  #include "amdgpu_ring.h"
>
> +#define AMDGPU_CS_GANG_SIZE    4
> +
>  struct amdgpu_bo_va_mapping;
>
>  struct amdgpu_cs_chunk {
> @@ -50,9 +52,11 @@ struct amdgpu_cs_parser {
>         unsigned nchunks;
>         struct amdgpu_cs_chunk *chunks;
>
> -       /* scheduler job object */
> -       struct amdgpu_job *job;
> -       struct drm_sched_entity *entity;
> +       /* scheduler job objects */
> +       unsigned int gang_size;
> +       struct drm_sched_entity *entities[AMDGPU_CS_GANG_SIZE];
> +       struct amdgpu_job *jobs[AMDGPU_CS_GANG_SIZE];
> +       struct amdgpu_job *gang_leader;
>
>         /* buffer objects */
>         struct ww_acquire_ctx ticket;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index cfbe19cfe9af..46c99331d7f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -105,7 +105,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, unsigned num_ibs,
>          */
>         (*job)->base.sched = &adev->rings[0]->sched;
>         (*job)->vm = vm;
> -       (*job)->num_ibs = num_ibs;
>
>         amdgpu_sync_create(&(*job)->sync);
>         amdgpu_sync_create(&(*job)->sched_sync);
> @@ -125,6 +124,7 @@ int amdgpu_job_alloc_with_ib(struct amdgpu_device *adev, unsigned size,
>         if (r)
>                 return r;
>
> +       (*job)->num_ibs = 1;
>         r = amdgpu_ib_get(adev, NULL, size, pool_type, &(*job)->ibs[0]);
>         if (r)
>                 kfree(*job);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> index 06dfcf297a8d..5e6ddc7e101c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
> @@ -140,8 +140,10 @@ TRACE_EVENT(amdgpu_bo_create,
>  );
>
>  TRACE_EVENT(amdgpu_cs,
> -           TP_PROTO(struct amdgpu_cs_parser *p, int i),
> -           TP_ARGS(p, i),
> +           TP_PROTO(struct amdgpu_cs_parser *p,
> +                    struct amdgpu_job *job,
> +                    struct amdgpu_ib *ib),
> +           TP_ARGS(p, job, ib),
>             TP_STRUCT__entry(
>                             __field(struct amdgpu_bo_list *, bo_list)
>                             __field(u32, ring)
> @@ -151,10 +153,10 @@ TRACE_EVENT(amdgpu_cs,
>
>             TP_fast_assign(
>                            __entry->bo_list = p->bo_list;
> -                          __entry->ring = to_amdgpu_ring(p->entity->rq->sched)->idx;
> -                          __entry->dw = p->job->ibs[i].length_dw;
> +                          __entry->ring = to_amdgpu_ring(job->base.sched)->idx;
> +                          __entry->dw = ib->length_dw;
>                            __entry->fences = amdgpu_fence_count_emitted(
> -                                          to_amdgpu_ring(p->entity->rq->sched));
> +                                          to_amdgpu_ring(job->base.sched));
>                            ),
>             TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u",
>                       __entry->bo_list, __entry->ring, __entry->dw,
> --
> 2.25.1
>