kvfree(chunk_array);
/* Use this opportunity to fill in task info for the vm */
@@ -301,22 +317,18 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
return ret;
}
-static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
- struct amdgpu_cs_chunk *chunk,
- unsigned int *num_ibs,
- unsigned int *ce_preempt,
- unsigned int *de_preempt)
+static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p, struct amdgpu_job *job,
+ struct amdgpu_ib *ib, struct amdgpu_cs_chunk *chunk,
+ unsigned int *ce_preempt, unsigned int *de_preempt)
{
- struct amdgpu_ring *ring = to_amdgpu_ring(p->job->base.sched);
+ struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
struct drm_amdgpu_cs_chunk_ib *chunk_ib = chunk->kdata;
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
- struct amdgpu_ib *ib = &p->job->ibs[*num_ibs];
struct amdgpu_vm *vm = &fpriv->vm;
int r;
-
/* MM engine doesn't support user fences */
- if (p->job->uf_addr && ring->funcs->no_user_fence)
+ if (job->uf_addr && ring->funcs->no_user_fence)
return -EINVAL;
if (chunk_ib->ip_type == AMDGPU_HW_IP_GFX &&
@@ -333,7 +345,7 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
}
if (chunk_ib->flags & AMDGPU_IB_FLAG_PREAMBLE)
- p->job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
+ job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT;
r = amdgpu_ib_get(p->adev, vm, ring->funcs->parse_cs ?
chunk_ib->ib_bytes : 0,
@@ -346,8 +358,6 @@ static int amdgpu_cs_p2_ib(struct amdgpu_cs_parser *p,
ib->gpu_addr = chunk_ib->va_start;
ib->length_dw = chunk_ib->ib_bytes / 4;
ib->flags = chunk_ib->flags;
-
- (*num_ibs)++;
return 0;
}
@@ -396,7 +406,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
dma_fence_put(old);
}
- r = amdgpu_sync_fence(&p->job->sync, fence);
+ r = amdgpu_sync_fence(&p->jobs[0]->sync, fence);
dma_fence_put(fence);
if (r)
return r;
@@ -418,7 +428,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
return r;
}
- r = amdgpu_sync_fence(&p->job->sync, fence);
+ r = amdgpu_sync_fence(&p->jobs[0]->sync, fence);
dma_fence_put(fence);
return r;
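
Note that every explicit dependency — the fence chunks above and the syncobj waits here — is recorded only on p->jobs[0]; amdgpu_cs_sync_rings() later clones that sync object to the rest of the gang (see its hunk below). A sketch of the resulting flow, as an illustrative comment rather than kernel code:

/*
 * CHUNK_ID_DEPENDENCIES ---\
 * CHUNK_ID_SYNCOBJ_IN  -----+--> p->jobs[0]->sync
 * implicit BO fences   ----/          |
 *                                     | amdgpu_sync_clone() in
 *                                     | amdgpu_cs_sync_rings()
 *                                     v
 *                       p->jobs[1 .. gang_size-1]->sync
 */
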
@@ -541,20 +551,30 @@ static int amdgpu_cs_p2_syncobj_timeline_signal(struct amdgpu_cs_parser *p,
static int amdgpu_cs_pass2(struct amdgpu_cs_parser *p)
{
- unsigned int num_ibs = 0, ce_preempt = 0, de_preempt = 0;
+ unsigned int ce_preempt = 0, de_preempt = 0;
+ unsigned int job_idx = 0, ib_idx = 0;
int i, r;
for (i = 0; i < p->nchunks; ++i) {
struct amdgpu_cs_chunk *chunk;
+ struct amdgpu_job *job;
chunk = &p->chunks[i];
switch (chunk->chunk_id) {
case AMDGPU_CHUNK_ID_IB:
- r = amdgpu_cs_p2_ib(p, chunk, &num_ibs,
+ job = p->jobs[job_idx];
+ r = amdgpu_cs_p2_ib(p, job, &job->ibs[ib_idx], chunk,
&ce_preempt, &de_preempt);
if (r)
return r;
+
+ if (++ib_idx == job->num_ibs) {
+ ++job_idx;
+ ib_idx = 0;
+ ce_preempt = 0;
+ de_preempt = 0;
+ }
break;
case AMDGPU_CHUNK_ID_DEPENDENCIES:
case AMDGPU_CHUNK_ID_SCHEDULED_DEPENDENCIES:
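
The cursor walk above deserves a spelled-out model: pass1 (earlier in the patch, not shown in this excerpt) is assumed to have sized each job's IB array, so pass2 can stream consecutive IB chunks into consecutive gang members, resetting the CE/DE preemption counters at every job boundary. A self-contained sketch with illustrative names, where num_ibs[] stands in for p->jobs[i]->num_ibs:

static void gang_ib_walk_sketch(const unsigned int *num_ibs,
				unsigned int total_ibs)
{
	unsigned int job_idx = 0, ib_idx = 0, n;

	for (n = 0; n < total_ibs; ++n) {
		/* IB chunk n fills jobs[job_idx]->ibs[ib_idx] */
		if (++ib_idx == num_ibs[job_idx]) {
			/* job is full: move to the next gang member and
			 * restart the per-job preemption accounting
			 * (ce_preempt = de_preempt = 0 in the real code)
			 */
			++job_idx;
			ib_idx = 0;
		}
	}
}
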
@@ -825,6 +845,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_list_entry *e;
struct list_head duplicates;
+ unsigned int i;
int r;
INIT_LIST_HEAD(&p->validated);
@@ -905,16 +926,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
e->bo_va = amdgpu_vm_bo_find(vm, bo);
}
- /* Move fence waiting after getting reservation lock of
- * PD root. Then there is no need on a ctx mutex lock.
- */
- r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
- if (unlikely(r != 0)) {
- if (r != -ERESTARTSYS)
- DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
- goto error_validate;
- }
-
amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
&p->bytes_moved_vis_threshold);
p->bytes_moved = 0;
@@ -938,14 +949,16 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
amdgpu_cs_report_moved_bytes(p->adev, p->bytes_moved,
p->bytes_moved_vis);
- amdgpu_job_set_resources(p->job, p->bo_list->gds_obj,
- p->bo_list->gws_obj, p->bo_list->oa_obj);
+ for (i = 0; i < p->gang_size; ++i)
+ amdgpu_job_set_resources(p->jobs[i], p->bo_list->gds_obj,
+ p->bo_list->gws_obj,
+ p->bo_list->oa_obj);
if (!r && p->uf_entry.tv.bo) {
struct amdgpu_bo *uf = ttm_to_amdgpu_bo(p->uf_entry.tv.bo);
r = amdgpu_ttm_alloc_gart(&uf->tbo);
- p->job->uf_addr += amdgpu_bo_gpu_offset(uf);
+ p->jobs[p->gang_size - 1]->uf_addr += amdgpu_bo_gpu_offset(uf);
}
error_validate:
@@ -955,20 +968,24 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
return r;
}
-static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *parser)
+static void trace_amdgpu_cs_ibs(struct amdgpu_cs_parser *p)
{
- int i;
+ int i, j;
if (!trace_amdgpu_cs_enabled())
return;
- for (i = 0; i < parser->job->num_ibs; i++)
- trace_amdgpu_cs(parser, i);
+ for (i = 0; i < p->gang_size; ++i) {
+ struct amdgpu_job *job = p->jobs[i];
+
+ for (j = 0; j < job->num_ibs; ++j)
+ trace_amdgpu_cs(p, job, &job->ibs[j]);
+ }
}
-static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
+static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p,
+ struct amdgpu_job *job)
{
- struct amdgpu_job *job = p->job;
struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
unsigned int i;
int r;
@@ -1007,14 +1024,13 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
memcpy(ib->ptr, kptr, job->ibs[i].length_dw * 4);
amdgpu_bo_kunmap(aobj);
- r = amdgpu_ring_parse_cs(ring, p, p->job,
- &p->job->ibs[i]);
+ r = amdgpu_ring_parse_cs(ring, p, job, &job->ibs[i]);
if (r)
return r;
} else {
ib->ptr = (uint32_t *)kptr;
- r = amdgpu_ring_patch_cs_in_place(ring, p, p->job,
- &p->job->ibs[i]);
+ r = amdgpu_ring_patch_cs_in_place(ring, p, job,
+ &job->ibs[i]);
amdgpu_bo_kunmap(aobj);
if (r)
return r;
@@ -1024,14 +1040,29 @@ static int amdgpu_cs_patch_ibs(struct amdgpu_cs_parser *p)
return 0;
}
+static int amdgpu_cs_patch_jobs(struct amdgpu_cs_parser *p)
+{
+ unsigned int i;
+ int r;
+
+ for (i = 0; i < p->gang_size; ++i) {
+ r = amdgpu_cs_patch_ibs(p, p->jobs[i]);
+ if (r)
+ return r;
+ }
+ return 0;
+}
+
static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
struct amdgpu_device *adev = p->adev;
+ struct amdgpu_job *job = p->jobs[0];
struct amdgpu_vm *vm = &fpriv->vm;
struct amdgpu_bo_list_entry *e;
struct amdgpu_bo_va *bo_va;
struct amdgpu_bo *bo;
+ unsigned int i;
int r;
r = amdgpu_vm_clear_freed(adev, vm, NULL);
@@ -1042,7 +1073,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
- r = amdgpu_sync_vm_fence(&p->job->sync, fpriv->prt_va->last_pt_update);
+ r = amdgpu_sync_vm_fence(&job->sync, fpriv->prt_va->last_pt_update);
if (r)
return r;
@@ -1052,7 +1083,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
- r = amdgpu_sync_vm_fence(&p->job->sync, bo_va->last_pt_update);
+ r = amdgpu_sync_vm_fence(&job->sync, bo_va->last_pt_update);
if (r)
return r;
}
@@ -1071,7 +1102,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
- r = amdgpu_sync_vm_fence(&p->job->sync, bo_va->last_pt_update);
+ r = amdgpu_sync_vm_fence(&job->sync, bo_va->last_pt_update);
if (r)
return r;
}
@@ -1084,11 +1115,18 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
if (r)
return r;
- r = amdgpu_sync_vm_fence(&p->job->sync, vm->last_update);
+ r = amdgpu_sync_vm_fence(&job->sync, vm->last_update);
if (r)
return r;
- p->job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
+ for (i = 0; i < p->gang_size; ++i) {
+ job = p->jobs[i];
+
+ if (!job->vm)
+ continue;
+
+ job->vm_pd_addr = amdgpu_gmc_pd_addr(vm->root.bo);
+ }
if (amdgpu_vm_debug) {
/* Invalidate all BOs to test for userspace bugs */
@@ -1109,7 +1147,9 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
{
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
+ struct amdgpu_job *job = p->jobs[0];
struct amdgpu_bo_list_entry *e;
+ unsigned int i;
int r;
list_for_each_entry(e, &p->validated, tv.head) {
@@ -1119,12 +1159,23 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
sync_mode = amdgpu_bo_explicit_sync(bo) ?
AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
- r = amdgpu_sync_resv(p->adev, &p->job->sync, resv, sync_mode,
+ r = amdgpu_sync_resv(p->adev, &job->sync, resv, sync_mode,
&fpriv->vm);
if (r)
return r;
}
- return 0;
+
+ for (i = 1; i < p->gang_size; ++i) {
+ r = amdgpu_sync_clone(&job->sync, &p->jobs[i]->sync);
+ if (r)
+ return r;
+ }
+
+ r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entities[p->gang_size - 1]);
+ if (r && r != -ERESTARTSYS)
+ DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
+
+ return r;
}
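
Two things happen at the end of the rewritten amdgpu_cs_sync_rings(): the dependency set gathered on the first job is cloned to every other gang member, and the wait for the previous context fence — deleted from amdgpu_cs_parser_bos() above — reappears here against the last entity. A generic sketch of the clone pattern, under the assumption that amdgpu_sync_clone(src, dst) copies all fences from src into dst (matching its use above):

static int gang_clone_deps_sketch(struct amdgpu_sync *first,
				  struct amdgpu_job **jobs,
				  unsigned int gang_size)
{
	unsigned int i;
	int r;

	/* every follower blocks on exactly the same fences as jobs[0] */
	for (i = 1; i < gang_size; ++i) {
		r = amdgpu_sync_clone(first, &jobs[i]->sync);
		if (r)
			return r;
	}
	return 0;
}
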
static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
@@ -1147,17 +1198,27 @@ static void amdgpu_cs_post_dependencies(struct amdgpu_cs_parser *p)
static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
union drm_amdgpu_cs *cs)
{
+ struct amdgpu_job *last = p->jobs[p->gang_size - 1];
struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
- struct drm_sched_entity *entity = p->entity;
struct amdgpu_bo_list_entry *e;
- struct amdgpu_job *job;
+ unsigned int i;
uint64_t seq;
int r;
- job = p->job;
- p->job = NULL;
+ for (i = 0; i < p->gang_size; ++i)
+ drm_sched_job_arm(&p->jobs[i]->base);
- drm_sched_job_arm(&job->base);
+ for (i = 0; i < (p->gang_size - 1); ++i) {
+ struct dma_fence *fence;
+
+ fence = &p->jobs[i]->base.s_fence->scheduled;
+ r = amdgpu_sync_fence(&last->sync, fence);
+ if (r)
+ goto error_cleanup;
+ }
+
+ for (i = 0; i < p->gang_size; ++i)
+ amdgpu_job_set_gang_leader(p->jobs[i], last);
/* No memory allocation is allowed while holding the notifier lock.
* The lock is held until amdgpu_cs_submit is finished and fence is
@@ -1175,44 +1236,58 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
}
if (r) {
r = -EAGAIN;
- goto error_abort;
+ goto error_unlock;
}
- p->fence = dma_fence_get(&job->base.s_fence->finished);
+ p->fence = dma_fence_get(&last->base.s_fence->finished);
- amdgpu_ctx_add_fence(p->ctx, entity, p->fence, &seq);
+ amdgpu_ctx_add_fence(p->ctx, p->entities[p->gang_size - 1], p->fence,
+ &seq);
amdgpu_cs_post_dependencies(p);
- if ((job->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
+ if ((last->preamble_status & AMDGPU_PREAMBLE_IB_PRESENT) &&
!p->ctx->preamble_presented) {
- job->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
+ last->preamble_status |= AMDGPU_PREAMBLE_IB_PRESENT_FIRST;
p->ctx->preamble_presented = true;
}
cs->out.handle = seq;
- job->uf_sequence = seq;
-
- amdgpu_job_free_resources(job);
+ last->uf_sequence = seq;
- trace_amdgpu_cs_ioctl(job);
amdgpu_vm_bo_trace_cs(&fpriv->vm, &p->ticket);
- drm_sched_entity_push_job(&job->base);
+ for (i = 0; i < p->gang_size; ++i) {
+ amdgpu_job_free_resources(p->jobs[i]);
+ trace_amdgpu_cs_ioctl(p->jobs[i]);
+ drm_sched_entity_push_job(&p->jobs[i]->base);
+ p->jobs[i] = NULL;
+ }
amdgpu_vm_move_to_lru_tail(p->adev, &fpriv->vm);
- /* Make sure all BOs are remembered as writers */
- amdgpu_bo_list_for_each_entry(e, p->bo_list)
+ list_for_each_entry(e, &p->validated, tv.head) {
+
+ /* Everybody except for the gang leader uses BOOKKEEP */
+ for (i = 0; i < (p->gang_size - 1); ++i) {
+ dma_resv_add_fence(e->tv.bo->base.resv,
+ &p->jobs[i]->base.s_fence->finished,
+ DMA_RESV_USAGE_BOOKKEEP);
+ }
+
+ /* The gang leader is remembered as writer */
e->tv.num_shared = 0;
+ }
ttm_eu_fence_buffer_objects(&p->ticket, &p->validated, p->fence);
mutex_unlock(&p->adev->notifier_lock);
return 0;
-error_abort:
- drm_sched_job_cleanup(&job->base);
+error_unlock:
mutex_unlock(&p->adev->notifier_lock);
- amdgpu_job_free(job);
+
+error_cleanup:
+ for (i = 0; i < p->gang_size; ++i)
+ drm_sched_job_cleanup(&p->jobs[i]->base);
return r;
}
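
To summarize the submission path: the leader is the last gang member, it waits on every follower's scheduled fence, and only its finished fence becomes externally visible. The resulting fence graph for a gang of size n, sketched as a comment:

/*
 * jobs[0..n-2]  --scheduled fence-->  last->sync
 *     (the leader only starts once every follower is on its ring)
 *
 * jobs[0..n-2]  --finished fence--->  each BO, DMA_RESV_USAGE_BOOKKEEP
 * jobs[n-1]     --finished fence--->  p->fence, the BO writer slot,
 *                                     uf_sequence / cs->out.handle
 */
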
@@ -1229,17 +1304,18 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser)
dma_fence_put(parser->fence);
- if (parser->ctx) {
+ if (parser->ctx)
amdgpu_ctx_put(parser->ctx);
- }
if (parser->bo_list)
amdgpu_bo_list_put(parser->bo_list);
for (i = 0; i < parser->nchunks; i++)
kvfree(parser->chunks[i].kdata);
kvfree(parser->chunks);
- if (parser->job)
- amdgpu_job_free(parser->job);
+ for (i = 0; i < parser->gang_size; ++i) {
+ if (parser->jobs[i])
+ amdgpu_job_free(parser->jobs[i]);
+ }
if (parser->uf_entry.tv.bo) {
struct amdgpu_bo *uf = ttm_to_amdgpu_bo(parser->uf_entry.tv.bo);
@@ -1283,7 +1359,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
goto error_fini;
}
- r = amdgpu_cs_patch_ibs(&parser);
+ r = amdgpu_cs_patch_jobs(&parser);
if (r)
goto error_backoff;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
index 652b5593499f..ba5860c08270 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
@@ -27,6 +27,8 @@
#include "amdgpu_bo_list.h"
#include "amdgpu_ring.h"
+#define AMDGPU_CS_GANG_SIZE 4
+
struct amdgpu_bo_va_mapping;
struct amdgpu_cs_chunk {
@@ -50,9 +52,10 @@ struct amdgpu_cs_parser {
unsigned nchunks;
struct amdgpu_cs_chunk *chunks;
- /* scheduler job object */
- struct drm_sched_entity *entity;
- struct amdgpu_job *job;
+ /* scheduler job objects */
+ unsigned int gang_size;
+ struct drm_sched_entity *entities[AMDGPU_CS_GANG_SIZE];
+ struct amdgpu_job *jobs[AMDGPU_CS_GANG_SIZE];
/* buffer objects */
struct ww_acquire_ctx ticket;
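
With this header change, a submission carries up to AMDGPU_CS_GANG_SIZE entity/job pairs, indexed in lockstep by gang_size. A hypothetical accessor (not part of the patch) that makes the invariant explicit:

/* Hypothetical helper, for illustration only. */
static inline struct amdgpu_job *
amdgpu_cs_gang_member(struct amdgpu_cs_parser *p, unsigned int i)
{
	if (WARN_ON(i >= p->gang_size))
		return NULL;
	return p->jobs[i];
}
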
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
index d855cb53c7e0..a5167cb91ba5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_trace.h
@@ -140,8 +140,10 @@ TRACE_EVENT(amdgpu_bo_create,
);
TRACE_EVENT(amdgpu_cs,
- TP_PROTO(struct amdgpu_cs_parser *p, int i),
- TP_ARGS(p, i),
+ TP_PROTO(struct amdgpu_cs_parser *p,
+ struct amdgpu_job *job,
+ struct amdgpu_ib *ib),
+ TP_ARGS(p, job, ib),
TP_STRUCT__entry(
__field(struct amdgpu_bo_list *, bo_list)
__field(u32, ring)
@@ -151,10 +153,10 @@ TRACE_EVENT(amdgpu_cs,
TP_fast_assign(
__entry->bo_list = p->bo_list;
- __entry->ring = to_amdgpu_ring(p->entity->rq->sched)->idx;
- __entry->dw = p->job->ibs[i].length_dw;
+ __entry->ring = to_amdgpu_ring(job->base.sched)->idx;
+ __entry->dw = ib->length_dw;
__entry->fences = amdgpu_fence_count_emitted(
- to_amdgpu_ring(p->entity->rq->sched));
+ to_amdgpu_ring(job->base.sched));
),
TP_printk("bo_list=%p, ring=%u, dw=%u, fences=%u",
__entry->bo_list, __entry->ring, __entry->dw,