Other than dramatically simplifying the submission code (requests ftw), we can reduce the execlist spinlock duration and importantly avoid having to hold it across the context switch register reads. Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> --- drivers/gpu/drm/i915/i915_debugfs.c | 20 +- drivers/gpu/drm/i915/i915_gem.c | 8 +- drivers/gpu/drm/i915/i915_gem_request.h | 21 +- drivers/gpu/drm/i915/i915_guc_submission.c | 31 +- drivers/gpu/drm/i915/intel_lrc.c | 505 +++++++++++------------------ drivers/gpu/drm/i915/intel_lrc.h | 3 - drivers/gpu/drm/i915/intel_ringbuffer.h | 8 +- 7 files changed, 209 insertions(+), 387 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 15a6fddfb79b..a5ea90944bbb 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -2005,8 +2005,7 @@ static void i915_dump_lrc_obj(struct seq_file *m, return; } - seq_printf(m, "CONTEXT: %s %u\n", ring->name, - intel_execlists_ctx_id(ctx_obj)); + seq_printf(m, "CONTEXT: %s\n", ring->name); if (!i915_gem_obj_ggtt_bound(ctx_obj)) seq_puts(m, "\tNot bound in GGTT\n"); @@ -2092,7 +2091,6 @@ static int i915_execlists(struct seq_file *m, void *data) intel_runtime_pm_get(dev_priv); for_each_ring(ring, dev_priv, ring_id) { - struct drm_i915_gem_request *head_req = NULL; int count = 0; seq_printf(m, "%s\n", ring->name); @@ -2105,8 +2103,8 @@ static int i915_execlists(struct seq_file *m, void *data) status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring)); seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer); - read_pointer = ring->next_context_status_buffer; - write_pointer = GEN8_CSB_WRITE_PTR(status_pointer); + read_pointer = (status_pointer >> 8) & GEN8_CSB_PTR_MASK; + write_pointer = status_pointer & GEN8_CSB_PTR_MASK; if (read_pointer > write_pointer) write_pointer += GEN8_CSB_ENTRIES; seq_printf(m, "\tRead pointer: 0x%08X, write pointer 0x%08X\n", @@ -2123,21 +2121,9 @@ static int i915_execlists(struct seq_file *m, void *data) spin_lock(&ring->execlist_lock); list_for_each(cursor, &ring->execlist_queue) count++; - head_req = list_first_entry_or_null(&ring->execlist_queue, - struct drm_i915_gem_request, execlist_link); spin_unlock(&ring->execlist_lock); seq_printf(m, "\t%d requests in queue\n", count); - if (head_req) { - struct drm_i915_gem_object *ctx_obj; - - ctx_obj = head_req->ctx->engine[ring_id].state; - seq_printf(m, "\tHead request id: %u\n", - intel_execlists_ctx_id(ctx_obj)); - seq_printf(m, "\tHead request tail: %u\n", - head_req->tail); - } - seq_putc(m, '\n'); } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index eb875ecd7907..054e11cff00f 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -2193,12 +2193,12 @@ static void i915_gem_reset_ring_cleanup(struct intel_engine_cs *engine) if (i915.enable_execlists) { spin_lock(&engine->execlist_lock); - - /* list_splice_tail_init checks for empty lists */ list_splice_tail_init(&engine->execlist_queue, - &engine->execlist_retired_req_list); - + &engine->execlist_completed); + memset(&engine->execlist_port, 0, + sizeof(engine->execlist_port)); spin_unlock(&engine->execlist_lock); + intel_execlists_retire_requests(engine); } diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h index 59957d5edfdb..c2e83584f8a2 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.h +++ b/drivers/gpu/drm/i915/i915_gem_request.h @@ -63,10 +63,11 @@ struct drm_i915_gem_request { * 
This is required to calculate the maximum available ringbuffer * space without overwriting the postfix. */ - u32 postfix; + u32 postfix; /** Position in the ringbuffer of the end of the whole request */ u32 tail; + u32 wa_tail; /** * Context and ring buffer related to this request @@ -99,24 +100,8 @@ struct drm_i915_gem_request { /** process identifier submitting this request */ struct pid *pid; - /** - * The ELSP only accepts two elements at a time, so we queue - * context/tail pairs on a given queue (ring->execlist_queue) until the - * hardware is available. The queue serves a double purpose: we also use - * it to keep track of the up to 2 contexts currently in the hardware - * (usually one in execution and the other queued up by the GPU): We - * only remove elements from the head of the queue when the hardware - * informs us that an element has been completed. - * - * All accesses to the queue are mediated by a spinlock - * (ring->execlist_lock). - */ - /** Execlist link in the submission queue.*/ - struct list_head execlist_link; - - /** Execlists no. of times this request has been sent to the ELSP */ - int elsp_submitted; + struct list_head execlist_link; /* guarded by engine->execlist_lock */ }; struct drm_i915_gem_request * diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index 5a6251926367..f4e09952d52c 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -393,7 +393,6 @@ static void guc_init_ctx_desc(struct intel_guc *guc, struct intel_ring *ring = ctx->engine[i].ring; struct intel_engine_cs *engine; struct drm_i915_gem_object *obj; - uint64_t ctx_desc; /* TODO: We have a design issue to be solved here. Only when we * receive the first batch, we know which engine is used by the @@ -407,8 +406,7 @@ static void guc_init_ctx_desc(struct intel_guc *guc, break; /* XXX: continue? 
*/ engine = ring->engine; - ctx_desc = intel_lr_context_descriptor(ctx, engine); - lrc->context_desc = (u32)ctx_desc; + lrc->context_desc = engine->execlist_context_descriptor; /* The state page is after PPHWSP */ lrc->ring_lcra = i915_gem_obj_ggtt_offset(obj) + @@ -548,7 +546,7 @@ static int guc_add_workqueue_item(struct i915_guc_client *gc, WQ_NO_WCFLUSH_WAIT; /* The GuC wants only the low-order word of the context descriptor */ - wqi->context_desc = (u32)intel_lr_context_descriptor(rq->ctx, rq->engine); + wqi->context_desc = rq->engine->execlist_context_descriptor; /* The GuC firmware wants the tail index in QWords, not bytes */ tail = rq->ring->tail >> 3; @@ -562,27 +560,6 @@ static int guc_add_workqueue_item(struct i915_guc_client *gc, #define CTX_RING_BUFFER_START 0x08 -/* Update the ringbuffer pointer in a saved context image */ -static void lr_context_update(struct drm_i915_gem_request *rq) -{ - enum intel_engine_id ring_id = rq->engine->id; - struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring_id].state; - struct drm_i915_gem_object *rb_obj = rq->ring->obj; - struct page *page; - uint32_t *reg_state; - - BUG_ON(!ctx_obj); - WARN_ON(!i915_gem_obj_is_pinned(ctx_obj)); - WARN_ON(!i915_gem_obj_is_pinned(rb_obj)); - - page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN); - reg_state = kmap_atomic(page); - - reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj); - - kunmap_atomic(reg_state); -} - /** * i915_guc_submit() - Submit commands through GuC * @client: the guc client where commands will go through @@ -597,10 +574,6 @@ int i915_guc_submit(struct i915_guc_client *client, enum intel_engine_id ring_id = rq->engine->id; int q_ret, b_ret; - /* Need this because of the deferred pin ctx and ring */ - /* Shall we move this right after ring is pinned? */ - lr_context_update(rq); - q_ret = guc_add_workqueue_item(client, rq); if (q_ret == 0) b_ret = guc_ring_doorbell(client); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index de5889e95d6d..80b346a3fd8a 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -265,233 +265,133 @@ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists return 0; } -/** - * intel_execlists_ctx_id() - get the Execlists Context ID - * @ctx_obj: Logical Ring Context backing object. - * - * Do not confuse with ctx->id! Unfortunately we have a name overload - * here: the old context ID we pass to userspace as a handler so that - * they can refer to a context, and the new context ID we pass to the - * ELSP so that the GPU can inform us of the context status via - * interrupts. - * - * Return: 20-bits globally unique context ID. 
- */ -u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj) -{ - u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) + - LRC_PPHWSP_PN * PAGE_SIZE; - - /* LRCA is required to be 4K aligned so the more significant 20 bits - * are globally unique */ - return lrca >> 12; -} - -static bool disable_lite_restore_wa(struct intel_engine_cs *ring) -{ - return (IS_SKL_REVID(ring->dev, 0, SKL_REVID_B0) || - IS_BXT_REVID(ring->dev, 0, BXT_REVID_A1)) && - (ring->id == VCS || ring->id == VCS2); -} - -uint64_t intel_lr_context_descriptor(struct intel_context *ctx, - struct intel_engine_cs *ring) +static u32 execlists_request_write_tail(struct drm_i915_gem_request *req) { - struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state; - uint64_t desc; - uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) + - LRC_PPHWSP_PN * PAGE_SIZE; - - WARN_ON(lrca & 0xFFFFFFFF00000FFFULL); - - desc = GEN8_CTX_VALID; - desc |= GEN8_CTX_ADDRESSING_MODE(ring->i915) << GEN8_CTX_ADDRESSING_MODE_SHIFT; - if (IS_GEN8(ring->i915)) - desc |= GEN8_CTX_L3LLC_COHERENT; - desc |= GEN8_CTX_PRIVILEGE; - desc |= lrca; - desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT; - - /* TODO: WaDisableLiteRestore when we start using semaphore - * signalling between Command Streamers */ - /* desc |= GEN8_CTX_FORCE_RESTORE; */ + struct intel_ring *ring = req->ring; + struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt; - /* WaEnableForceRestoreInCtxtDescForVCS:skl */ - /* WaEnableForceRestoreInCtxtDescForVCS:bxt */ - if (disable_lite_restore_wa(ring)) - desc |= GEN8_CTX_FORCE_RESTORE; + if (ppgtt && !USES_FULL_48BIT_PPGTT(req->i915)) { + /* True 32b PPGTT with dynamic page allocation: update PDP + * registers and point the unallocated PDPs to scratch page. + * PML4 is allocated during ppgtt init, so this is not needed + * in 48-bit mode. + */ + if (ppgtt->pd_dirty_rings & intel_engine_flag(req->engine)) { + ASSIGN_CTX_PDP(ppgtt, ring->registers, 3); + ASSIGN_CTX_PDP(ppgtt, ring->registers, 2); + ASSIGN_CTX_PDP(ppgtt, ring->registers, 1); + ASSIGN_CTX_PDP(ppgtt, ring->registers, 0); + ppgtt->pd_dirty_rings &= ~intel_engine_flag(req->engine); + } + } - return desc; + ring->registers[CTX_RING_TAIL+1] = req->tail; + return ring->context_descriptor; } -static void execlists_elsp_write(struct drm_i915_gem_request *rq0, - struct drm_i915_gem_request *rq1) +static void execlists_submit_pair(struct intel_engine_cs *ring) { + struct drm_i915_private *dev_priv = ring->i915; + uint32_t desc[4]; - struct intel_engine_cs *engine = rq0->engine; - struct drm_i915_private *dev_priv = rq0->i915; - uint64_t desc[2]; - - if (rq1) { - desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine); - rq1->elsp_submitted++; - } else { - desc[1] = 0; - } + if (ring->execlist_port[1]) { + desc[0] = execlists_request_write_tail(ring->execlist_port[1]); + desc[1] = ring->execlist_port[1]->fence.seqno; + } else + desc[1] = desc[0] = 0; - desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine); - rq0->elsp_submitted++; + desc[2] = execlists_request_write_tail(ring->execlist_port[0]); + desc[3] = ring->execlist_port[0]->fence.seqno; - /* You must always write both descriptors in the order below. */ - spin_lock_irq(&dev_priv->uncore.lock); - intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[1])); - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[1])); + /* Note: You must always write both descriptors in the order below. 
*/ + I915_WRITE_FW(RING_ELSP(ring), desc[1]); + I915_WRITE_FW(RING_ELSP(ring), desc[0]); + I915_WRITE_FW(RING_ELSP(ring), desc[3]); - I915_WRITE_FW(RING_ELSP(engine), upper_32_bits(desc[0])); /* The context is automatically loaded after the following */ - I915_WRITE_FW(RING_ELSP(engine), lower_32_bits(desc[0])); - - /* ELSP is a wo register, use another nearby reg for posting */ - POSTING_READ_FW(RING_EXECLIST_STATUS_LO(engine)); - intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); - spin_unlock_irq(&dev_priv->uncore.lock); + I915_WRITE_FW(RING_ELSP(ring), desc[2]); } -static int execlists_update_context(struct drm_i915_gem_request *rq) +static void execlists_context_unqueue(struct intel_engine_cs *engine) { - struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt; - struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[rq->engine->id].state; - struct drm_i915_gem_object *rb_obj = rq->ring->obj; - struct page *page; - uint32_t *reg_state; - - BUG_ON(!ctx_obj); - WARN_ON(!i915_gem_obj_is_pinned(ctx_obj)); - WARN_ON(!i915_gem_obj_is_pinned(rb_obj)); - - page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN); - reg_state = kmap_atomic(page); + struct drm_i915_gem_request *cursor; + bool submit = false; + int port = 0; - reg_state[CTX_RING_TAIL+1] = rq->tail; - reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj); + assert_spin_locked(&engine->execlist_lock); - if (ppgtt && !USES_FULL_48BIT_PPGTT(rq->i915)) { - /* True 32b PPGTT with dynamic page allocation: update PDP - * registers and point the unallocated PDPs to scratch page. - * PML4 is allocated during ppgtt init, so this is not needed - * in 48-bit mode. + /* Try to read in pairs and fill both submission ports */ + cursor = engine->execlist_port[port]; + if (cursor != NULL) { + /* WaIdleLiteRestore:bdw,skl + * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL + * as we resubmit the request. See gen8_emit_request() + * for where we prepare the padding after the end of the + * request. */ - ASSIGN_CTX_PDP(ppgtt, reg_state, 3); - ASSIGN_CTX_PDP(ppgtt, reg_state, 2); - ASSIGN_CTX_PDP(ppgtt, reg_state, 1); - ASSIGN_CTX_PDP(ppgtt, reg_state, 0); - } - - kunmap_atomic(reg_state); - - return 0; -} + cursor->tail = cursor->wa_tail; + cursor = list_next_entry(cursor, execlist_link); + } else + cursor = list_first_entry(&engine->execlist_queue, + typeof(*cursor), + execlist_link); + while (&cursor->execlist_link != &engine->execlist_queue) { + /* Same ctx: ignore earlier request, as the + * second request extends the first. + */ + if (engine->execlist_port[port] && + cursor->ctx != engine->execlist_port[port]->ctx) { + if (++port == ARRAY_SIZE(engine->execlist_port)) + break; + } -static void execlists_submit_requests(struct drm_i915_gem_request *rq0, - struct drm_i915_gem_request *rq1) -{ - execlists_update_context(rq0); + engine->execlist_port[port] = cursor; + submit = true; - if (rq1) - execlists_update_context(rq1); + cursor = list_next_entry(cursor, execlist_link); + } - execlists_elsp_write(rq0, rq1); + if (submit) + execlists_submit_pair(engine); } -static void execlists_context_unqueue(struct intel_engine_cs *engine) +static bool execlists_complete_requests(struct intel_engine_cs *engine, + u32 seqno) { - struct drm_i915_gem_request *req0 = NULL, *req1 = NULL; - struct drm_i915_gem_request *cursor = NULL, *tmp = NULL; - assert_spin_locked(&engine->execlist_lock); - /* - * If irqs are not active generate a warning as batches that finish - * without the irqs may get lost and a GPU Hang may occur. 
- */ - WARN_ON(!intel_irqs_enabled(engine->dev->dev_private)); + do { + struct drm_i915_gem_request *req; - if (list_empty(&engine->execlist_queue)) - return; + req = engine->execlist_port[0]; + if (req == NULL) + break; - /* Try to read in pairs */ - list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue, - execlist_link) { - if (!req0) { - req0 = cursor; - } else if (req0->ctx == cursor->ctx) { - /* Same ctx: ignore first request, as second request - * will update tail past first request's workload */ - cursor->elsp_submitted = req0->elsp_submitted; - list_del(&req0->execlist_link); - list_add_tail(&req0->execlist_link, - &engine->execlist_retired_req_list); - req0 = cursor; - } else { - req1 = cursor; + if (!i915_seqno_passed(seqno, req->fence.seqno)) break; - } - } - if (IS_GEN8(engine->dev) || IS_GEN9(engine->dev)) { - /* - * WaIdleLiteRestore: make sure we never cause a lite - * restore with HEAD==TAIL + /* Move the completed set of requests from the start of the + * execlist_queue over to the tail of the execlist_completed. */ - if (req0->elsp_submitted) { - /* - * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL - * as we resubmit the request. See gen8_add_request() - * for where we prepare the padding after the end of the - * request. - */ - struct intel_ring *ring; - - ring = req0->ctx->engine[engine->id].ring; - req0->tail += 8; - req0->tail &= ring->size - 1; - } - } - - WARN_ON(req1 && req1->elsp_submitted); + engine->execlist_completed.prev->next = engine->execlist_queue.next; + engine->execlist_completed.prev = &req->execlist_link; - execlists_submit_requests(req0, req1); -} - -static bool execlists_check_remove_request(struct intel_engine_cs *ring, - u32 request_id) -{ - struct drm_i915_gem_request *head_req; + engine->execlist_queue.next = req->execlist_link.next; + req->execlist_link.next->prev = &engine->execlist_queue; - assert_spin_locked(&ring->execlist_lock); + req->execlist_link.next = &engine->execlist_completed; - head_req = list_first_entry_or_null(&ring->execlist_queue, - struct drm_i915_gem_request, - execlist_link); - - if (head_req != NULL) { - struct drm_i915_gem_object *ctx_obj = - head_req->ctx->engine[ring->id].state; - if (intel_execlists_ctx_id(ctx_obj) == request_id) { - WARN(head_req->elsp_submitted == 0, - "Never submitted head request\n"); - - if (--head_req->elsp_submitted <= 0) { - list_del(&head_req->execlist_link); - list_add_tail(&head_req->execlist_link, - &ring->execlist_retired_req_list); - return true; - } - } - } + /* The hardware has completed the request on this port, it + * will switch to the next. 
+ */ + engine->execlist_port[0] = engine->execlist_port[1]; + engine->execlist_port[1] = NULL; + } while (1); - return false; + if (engine->execlist_context_descriptor & GEN8_CTX_FORCE_RESTORE) + return engine->execlist_port[0] == NULL; + else + return engine->execlist_port[1] == NULL; } static void set_rtpriority(void) @@ -504,23 +404,29 @@ static int intel_execlists_submit(void *arg) { struct intel_engine_cs *ring = arg; struct drm_i915_private *dev_priv = ring->i915; + const i915_reg_t ptrs = RING_CONTEXT_STATUS_PTR(ring); set_rtpriority(); + intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); do { - u32 status; - u32 status_id; - u32 submit_contexts; u8 head, tail; + u32 seqno; set_current_state(TASK_INTERRUPTIBLE); - head = ring->next_context_status_buffer; - tail = I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK; + head = tail = 0; + if (READ_ONCE(ring->execlist_port[0])) { + u32 x = I915_READ_FW(ptrs); + head = x >> 8; + tail = x; + } if (head == tail) { + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); if (kthread_should_stop()) return 0; schedule(); + intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); continue; } __set_current_state(TASK_RUNNING); @@ -528,86 +434,46 @@ static int intel_execlists_submit(void *arg) if (head > tail) tail += GEN8_CSB_ENTRIES; - status = 0; - submit_contexts = 0; - - spin_lock(&ring->execlist_lock); - + seqno = 0; while (head++ < tail) { - status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, head % GEN8_CSB_ENTRIES)); - status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES)); - - if (status & GEN8_CTX_STATUS_IDLE_ACTIVE) - continue; - - if (status & GEN8_CTX_STATUS_PREEMPTED) { - if (status & GEN8_CTX_STATUS_LITE_RESTORE) { - if (execlists_check_remove_request(ring, status_id)) - WARN(1, "Lite Restored request removed from queue\n"); - } else - WARN(1, "Preemption without Lite Restore\n"); - } - - if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) || - (status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) { - if (execlists_check_remove_request(ring, status_id)) - submit_contexts++; + u32 status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(ring, + head % GEN8_CSB_ENTRIES)); + if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED && 0)) { + DRM_ERROR("Pre-empted request %x %s Lite Restore\n", + I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, head % GEN8_CSB_ENTRIES)), + status & GEN8_CTX_STATUS_LITE_RESTORE ? 
"with" : "without"); } + if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE | + GEN8_CTX_STATUS_ELEMENT_SWITCH)) + seqno = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(ring, + head % GEN8_CSB_ENTRIES)); } - if (disable_lite_restore_wa(ring)) { - /* Prevent a ctx to preempt itself */ - if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) && - (submit_contexts != 0)) + I915_WRITE_FW(ptrs, + _MASKED_FIELD(GEN8_CSB_PTR_MASK<<8, + (tail % GEN8_CSB_ENTRIES) << 8)); + + if (seqno) { + spin_lock(&ring->execlist_lock); + if (execlists_complete_requests(ring, seqno)) execlists_context_unqueue(ring); - } else if (submit_contexts != 0) { - execlists_context_unqueue(ring); + spin_unlock(&ring->execlist_lock); } - - spin_unlock(&ring->execlist_lock); - - WARN(submit_contexts > 2, "More than two context complete events?\n"); - ring->next_context_status_buffer = tail % GEN8_CSB_ENTRIES; - I915_WRITE(RING_CONTEXT_STATUS_PTR(ring), - _MASKED_FIELD(GEN8_CSB_PTR_MASK << 8, - ring->next_context_status_buffer<<8)); } while (1); } static int execlists_context_queue(struct drm_i915_gem_request *request) { struct intel_engine_cs *engine = request->engine; - struct drm_i915_gem_request *cursor; - int num_elements = 0; i915_gem_request_get(request); spin_lock(&engine->execlist_lock); - - list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) - if (++num_elements > 2) - break; - - if (num_elements > 2) { - struct drm_i915_gem_request *tail_req; - - tail_req = list_last_entry(&engine->execlist_queue, - struct drm_i915_gem_request, - execlist_link); - - if (request->ctx == tail_req->ctx) { - WARN(tail_req->elsp_submitted != 0, - "More than 2 already-submitted reqs queued\n"); - list_del(&tail_req->execlist_link); - list_add_tail(&tail_req->execlist_link, - &engine->execlist_retired_req_list); - } - } - list_add_tail(&request->execlist_link, &engine->execlist_queue); - if (num_elements == 0) - execlists_context_unqueue(engine); - + if (engine->execlist_port[0] == NULL) { + engine->execlist_port[0] = request; + execlists_submit_pair(engine); + } spin_unlock(&engine->execlist_lock); return 0; @@ -641,56 +507,32 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request return 0; } -/* - * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload - * @request: Request to advance the logical ringbuffer of. - * - * The tail is updated in our logical ringbuffer struct, not in the actual context. What - * really happens during submission is that the context and current tail will be placed - * on a queue waiting for the ELSP to be ready to accept a new context submission. At that - * point, the tail *inside* the context is updated and the ELSP written to. 
- */ -static void -intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request) -{ - struct drm_i915_private *dev_priv = request->i915; - - intel_ring_advance(request->ring); - request->tail = request->ring->tail; - - if (dev_priv->guc.execbuf_client) - i915_guc_submit(dev_priv->guc.execbuf_client, request); - else - execlists_context_queue(request); -} - bool intel_execlists_retire_requests(struct intel_engine_cs *ring) { struct drm_i915_gem_request *req, *tmp; - struct list_head retired_list; + struct list_head list; - WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); - if (list_empty(&ring->execlist_retired_req_list)) + lockdep_assert_held(&ring->dev->struct_mutex); + if (list_empty(&ring->execlist_completed)) goto out; - INIT_LIST_HEAD(&retired_list); spin_lock(&ring->execlist_lock); - list_replace_init(&ring->execlist_retired_req_list, &retired_list); + list_replace_init(&ring->execlist_completed, &list); spin_unlock(&ring->execlist_lock); - list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) { + list_for_each_entry_safe(req, tmp, &list, execlist_link) { struct intel_context *ctx = req->ctx; struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state; if (ctx_obj && (ctx != ring->default_context)) intel_lr_context_unpin(req); - list_del(&req->execlist_link); + i915_gem_request_put(req); } out: - return list_empty(&ring->execlist_queue); + return READ_ONCE(ring->execlist_port[0]) == NULL; } void intel_logical_ring_stop(struct intel_engine_cs *ring) @@ -720,6 +562,7 @@ static int intel_lr_context_do_pin(struct intel_engine_cs *ring, struct intel_ring *ringbuf) { struct drm_i915_private *dev_priv = ring->i915; + u32 ggtt_offset; int ret = 0; WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); @@ -734,6 +577,16 @@ static int intel_lr_context_do_pin(struct intel_engine_cs *ring, ctx_obj->dirty = true; + ggtt_offset = + i915_gem_obj_ggtt_offset(ctx_obj) + LRC_PPHWSP_PN * PAGE_SIZE; + ringbuf->context_descriptor = + ggtt_offset | ring->execlist_context_descriptor; + + ringbuf->registers = + kmap(i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN)); + ringbuf->registers[CTX_RING_BUFFER_START+1] = + i915_gem_obj_ggtt_offset(ringbuf->obj); + /* Invalidate GuC TLB. 
*/ if (i915.enable_guc_submission) I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE); @@ -768,6 +621,7 @@ static int intel_lr_context_pin(struct drm_i915_gem_request *rq) void intel_lr_context_unpin(struct drm_i915_gem_request *rq) { + struct drm_i915_gem_object *ctx_obj; int engine = rq->engine->id; WARN_ON(!mutex_is_locked(&rq->i915->dev->struct_mutex)); @@ -775,7 +629,10 @@ void intel_lr_context_unpin(struct drm_i915_gem_request *rq) return; intel_ring_unmap(rq->ring); - i915_gem_object_ggtt_unpin(rq->ctx->engine[engine].state); + + ctx_obj = rq->ctx->engine[engine].state; + kunmap(i915_gem_object_get_page(ctx_obj, LRC_STATE_PN)); + i915_gem_object_ggtt_unpin(ctx_obj); i915_gem_context_unreference(rq->ctx); } @@ -1168,12 +1025,39 @@ out: return ret; } +static bool disable_lite_restore_wa(struct intel_engine_cs *ring) +{ + return (IS_SKL_REVID(ring->i915, 0, SKL_REVID_B0) || + IS_BXT_REVID(ring->i915, 0, BXT_REVID_A1)) && + (ring->id == VCS || ring->id == VCS2); +} + +static uint64_t lr_context_descriptor(struct intel_engine_cs *ring) +{ + uint64_t desc; + + desc = GEN8_CTX_VALID; + desc |= GEN8_CTX_ADDRESSING_MODE(ring->i915) << GEN8_CTX_ADDRESSING_MODE_SHIFT; + if (IS_GEN8(ring->i915)) + desc |= GEN8_CTX_L3LLC_COHERENT; + desc |= GEN8_CTX_PRIVILEGE; + + /* TODO: WaDisableLiteRestore when we start using semaphore + * signalling between Command Streamers */ + /* desc |= GEN8_CTX_FORCE_RESTORE; */ + + /* WaEnableForceRestoreInCtxtDescForVCS:skl */ + /* WaEnableForceRestoreInCtxtDescForVCS:bxt */ + if (disable_lite_restore_wa(ring)) + desc |= GEN8_CTX_FORCE_RESTORE; + + return desc; +} + static int gen8_init_common_ring(struct intel_engine_cs *ring) { struct drm_device *dev = ring->dev; struct drm_i915_private *dev_priv = dev->dev_private; - u8 next_context_status_buffer_hw; - lrc_setup_hardware_status_page(ring, ring->default_context->engine[ring->id].state); @@ -1197,18 +1081,6 @@ static int gen8_init_common_ring(struct intel_engine_cs *ring) * SKL | ? | ? | * BXT | ? | ? | */ - next_context_status_buffer_hw = - GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(ring))); - - /* - * When the CSB registers are reset (also after power-up / gpu reset), - * CSB write pointer is set to all 1's, which is not valid, use '5' in - * this special case, so the first element read is CSB[0]. 
- */ - if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK) - next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1); - - ring->next_context_status_buffer = next_context_status_buffer_hw; DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name); memset(&ring->hangcheck, 0, sizeof(ring->hangcheck)); @@ -1482,7 +1354,8 @@ static int gen8_add_request(struct drm_i915_gem_request *request) intel_ring_emit(ring, request->fence.seqno); intel_ring_emit(ring, MI_USER_INTERRUPT); intel_ring_emit(ring, MI_NOOP); - intel_logical_ring_advance_and_submit(request); + intel_ring_advance(ring); + request->tail = ring->tail; /* * Here we add two extra NOOPs as padding to avoid @@ -1491,6 +1364,12 @@ static int gen8_add_request(struct drm_i915_gem_request *request) intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_NOOP); intel_ring_advance(ring); + request->wa_tail = ring->tail; + + if (request->i915->guc.execbuf_client) + i915_guc_submit(request->i915->guc.execbuf_client, request); + else + execlists_context_queue(request); return 0; } @@ -1569,9 +1448,11 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin INIT_LIST_HEAD(&ring->buffers); INIT_LIST_HEAD(&ring->execlist_queue); - INIT_LIST_HEAD(&ring->execlist_retired_req_list); + INIT_LIST_HEAD(&ring->execlist_completed); spin_lock_init(&ring->execlist_lock); + ring->execlist_context_descriptor = lr_context_descriptor(ring); + ret = i915_cmd_parser_init_ring(ring); if (ret) goto error; @@ -1592,8 +1473,6 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin goto error; } - ring->next_context_status_buffer = - I915_READ(RING_CONTEXT_STATUS_PTR(ring)) & GEN8_CSB_PTR_MASK; task = kthread_run(intel_execlists_submit, ring, "irq/i915:%de", ring->id); if (IS_ERR(task)) @@ -1904,9 +1783,7 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o CTX_CTRL_RS_CTX_ENABLE)); ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(ring->mmio_base), 0); ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(ring->mmio_base), 0); - /* Ring buffer start address is not known until the buffer is pinned. - * It is written to the context image in execlists_update_context() - */ + /* Ring buffer start address is not known until the buffer is pinned. 
*/ ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START, RING_START(ring->mmio_base), 0); ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL, RING_CTL(ring->mmio_base), ((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES) | RING_VALID); diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h index 33f82a84065a..37601a35d5fc 100644 --- a/drivers/gpu/drm/i915/intel_lrc.h +++ b/drivers/gpu/drm/i915/intel_lrc.h @@ -74,12 +74,9 @@ int intel_lr_context_deferred_alloc(struct intel_context *ctx, void intel_lr_context_unpin(struct drm_i915_gem_request *req); void intel_lr_context_reset(struct drm_device *dev, struct intel_context *ctx); -uint64_t intel_lr_context_descriptor(struct intel_context *ctx, - struct intel_engine_cs *ring); /* Execlists */ int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists); -u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj); bool intel_execlists_retire_requests(struct intel_engine_cs *ring); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index edaf07b2292e..3d4d5711aea9 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -122,6 +122,9 @@ struct intel_ring { * we can detect new retirements. */ u32 last_retired_head; + + u32 context_descriptor; + u32 *registers; }; struct intel_context; @@ -293,9 +296,10 @@ struct intel_engine_cs { /* Execlists */ struct task_struct *execlists_submit; spinlock_t execlist_lock; + struct drm_i915_gem_request *execlist_port[2]; struct list_head execlist_queue; - struct list_head execlist_retired_req_list; - u8 next_context_status_buffer; + struct list_head execlist_completed; + u32 execlist_context_descriptor; u32 irq_keep_mask; /* bitmask for interrupts that should not be masked */ /** -- 2.7.0.rc3
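
As a self-contained illustration of the two-port bookkeeping introduced above, here is a small user-space sketch that can be compiled with any C99 compiler. The names used here (struct request, struct engine, ctx_id, unqueue(), complete_requests()) are simplified stand-ins and not the real i915 types, and the ELSP write is reduced to a printf; in the driver both helpers run under engine->execlist_lock and are driven by the per-engine submission kthread that reads the context-status buffer.

/* Illustrative sketch only; not the driver code. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct request {
	unsigned int seqno;
	int ctx_id;			/* stand-in for request->ctx */
	struct request *next;		/* stand-in for execlist_link */
};

struct engine {
	struct request *queue;		/* pending FIFO (execlist_queue) */
	struct request *completed;	/* done, awaiting retire (execlist_completed) */
	struct request *port[2];	/* what is currently in the ELSP */
};

/*
 * The CSB reported that everything up to and including 'seqno' is done:
 * move the corresponding requests off the pending queue and pop the port(s).
 * (The driver splices lists to keep FIFO order; order is ignored here.)
 */
static void complete_requests(struct engine *e, unsigned int seqno)
{
	while (e->port[0] && (int)(seqno - e->port[0]->seqno) >= 0) {
		struct request *req;

		do {	/* queue head up to and including port[0] is complete */
			req = e->queue;
			e->queue = req->next;
			req->next = e->completed;
			e->completed = req;
		} while (req != e->port[0]);

		/* the hardware moves on to the second element by itself */
		e->port[0] = e->port[1];
		e->port[1] = NULL;
	}
}

/* Refill both ports from the pending queue, coalescing same-context requests. */
static void unqueue(struct engine *e)
{
	struct request *req;
	bool submit = false;
	int port = 0;

	/* continue after whatever is already in the first port */
	req = e->port[0] ? e->port[0]->next : e->queue;

	for (; req; req = req->next) {
		/* a different context needs the second port (or has to wait) */
		if (e->port[port] && req->ctx_id != e->port[port]->ctx_id &&
		    ++port == 2)
			break;
		/* same context: the later request supersedes the earlier one */
		e->port[port] = req;
		submit = true;
	}

	if (submit)	/* the driver writes both descriptors to the ELSP here */
		printf("ELSP[0] ctx %d seqno %u, ELSP[1] ctx %d seqno %u\n",
		       e->port[0]->ctx_id, e->port[0]->seqno,
		       e->port[1] ? e->port[1]->ctx_id : -1,
		       e->port[1] ? e->port[1]->seqno : 0);
}

int main(void)
{
	struct request reqs[3] = {
		{ .seqno = 1, .ctx_id = 100 },
		{ .seqno = 2, .ctx_id = 100 },	/* coalesced with seqno 1 */
		{ .seqno = 3, .ctx_id = 200 },
	};
	struct engine e = { .queue = &reqs[0] };

	reqs[0].next = &reqs[1];
	reqs[1].next = &reqs[2];

	unqueue(&e);			/* ELSP[0]=seqno 2 (ctx 100), ELSP[1]=seqno 3 (ctx 200) */
	complete_requests(&e, 2);	/* ctx 100 done; ctx 200 promoted to port[0] */
	unqueue(&e);			/* nothing new pending, nothing submitted */
	return 0;
}

The first unqueue() fills both ports in a single pass over the pending list, with the second ctx 100 request superseding the first; complete_requests() then retires ctx 100 once the CSB reports seqno 2 and promotes ctx 200 to port[0].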