Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > When using a global seqno, we required a precise stop-the-workd event to > handle preemption and unwind the global seqno counter. To accomplish > this, we would preempt to a special out-of-band context and wait for the > machine to report that it was idle. Given an idle machine, we could very > precisely see which requests had completed and which we needed to feed > back into the run queue. > > However, now that we have scrapped the global seqno, we no longer need > to precisely unwind the global counter and only track requests by their > per-context seqno. This allows us to loosely unwind inflight requests > while scheduling a preemption, with the enormous caveat that the > requests we put back on the run queue are still _inflight_ (until the > preemption request is complete). This makes request tracking much more > messy, as at any point then we can see a completed request that we > believe is not currently scheduled for execution. We also have to be > careful not to rewind RING_TAIL past RING_HEAD on preempting to the > running context, and for this we use a semaphore to prevent completion > of the request before continuing. > > To accomplish this feat, we change how we track requests scheduled to > the HW. Instead of appending our requests onto a single list as we > submit, we track each submission to ELSP as its own block. Then upon > receiving the CS preemption event, we promote the pending block to the > inflight block (discarding what was previously being tracked). As normal > CS completion events arrive, we then remove stale entries from the > inflight tracker. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > --- > Skip the bonus asserts if the context is already completed. > --- > drivers/gpu/drm/i915/gem/i915_gem_context.c | 2 +- > drivers/gpu/drm/i915/gt/intel_context_types.h | 5 + > drivers/gpu/drm/i915/gt/intel_engine.h | 61 +- > drivers/gpu/drm/i915/gt/intel_engine_cs.c | 63 +- > drivers/gpu/drm/i915/gt/intel_engine_types.h | 52 +- > drivers/gpu/drm/i915/gt/intel_lrc.c | 678 ++++++++---------- > drivers/gpu/drm/i915/i915_gpu_error.c | 19 +- > drivers/gpu/drm/i915/i915_request.c | 6 + > drivers/gpu/drm/i915/i915_request.h | 1 + > drivers/gpu/drm/i915/i915_scheduler.c | 3 +- > drivers/gpu/drm/i915/i915_utils.h | 12 + > drivers/gpu/drm/i915/intel_guc_submission.c | 175 ++--- > drivers/gpu/drm/i915/selftests/i915_request.c | 8 +- > 13 files changed, 475 insertions(+), 610 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c > index 0f2c22a3bcb6..35871c8a42a6 100644 > --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c > +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c > @@ -646,7 +646,7 @@ static void init_contexts(struct drm_i915_private *i915) > > static bool needs_preempt_context(struct drm_i915_private *i915) > { > - return HAS_EXECLISTS(i915); > + return USES_GUC_SUBMISSION(i915); > } > > int i915_gem_contexts_init(struct drm_i915_private *dev_priv) > diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h > index 08049ee91cee..4c0e211c715d 100644 > --- a/drivers/gpu/drm/i915/gt/intel_context_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_context_types.h > @@ -13,6 +13,7 @@ > #include <linux/types.h> > > #include "i915_active_types.h" > +#include "i915_utils.h" > #include "intel_engine_types.h" > #include "intel_sseu.h" > > @@ -38,6 +39,10 @@ struct intel_context { > struct i915_gem_context 
*gem_context; > struct intel_engine_cs *engine; > struct intel_engine_cs *inflight; > +#define intel_context_inflight(ce) ptr_mask_bits((ce)->inflight, 2) > +#define intel_context_inflight_count(ce) ptr_unmask_bits((ce)->inflight, 2) > +#define intel_context_inflight_inc(ce) ptr_count_inc(&(ce)->inflight) > +#define intel_context_inflight_dec(ce) ptr_count_dec(&(ce)->inflight) Just curious here that what you consider the advantages of carrying this info with the pointer? > > struct list_head signal_link; > struct list_head signals; > diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h > index 2f1c6871ee95..9bb6ff76680e 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine.h > @@ -125,71 +125,26 @@ hangcheck_action_to_str(const enum intel_engine_hangcheck_action a) > > void intel_engines_set_scheduler_caps(struct drm_i915_private *i915); > > -static inline void > -execlists_set_active(struct intel_engine_execlists *execlists, > - unsigned int bit) > -{ > - __set_bit(bit, (unsigned long *)&execlists->active); > -} > - > -static inline bool > -execlists_set_active_once(struct intel_engine_execlists *execlists, > - unsigned int bit) > -{ > - return !__test_and_set_bit(bit, (unsigned long *)&execlists->active); > -} > - > -static inline void > -execlists_clear_active(struct intel_engine_execlists *execlists, > - unsigned int bit) > -{ > - __clear_bit(bit, (unsigned long *)&execlists->active); > -} > - > -static inline void > -execlists_clear_all_active(struct intel_engine_execlists *execlists) > +static inline unsigned int > +execlists_num_ports(const struct intel_engine_execlists * const execlists) > { > - execlists->active = 0; > + return execlists->port_mask + 1; > } > > -static inline bool > -execlists_is_active(const struct intel_engine_execlists *execlists, > - unsigned int bit) > +static inline struct i915_request * > +execlists_active(const struct intel_engine_execlists *execlists) > { > - return test_bit(bit, (unsigned long *)&execlists->active); > + GEM_BUG_ON(execlists->active - execlists->inflight > > + execlists_num_ports(execlists)); > + return READ_ONCE(*execlists->active); > } > > -void execlists_user_begin(struct intel_engine_execlists *execlists, > - const struct execlist_port *port); > -void execlists_user_end(struct intel_engine_execlists *execlists); > - > void > execlists_cancel_port_requests(struct intel_engine_execlists * const execlists); > > struct i915_request * > execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists); > > -static inline unsigned int > -execlists_num_ports(const struct intel_engine_execlists * const execlists) > -{ > - return execlists->port_mask + 1; > -} > - > -static inline struct execlist_port * > -execlists_port_complete(struct intel_engine_execlists * const execlists, > - struct execlist_port * const port) > -{ > - const unsigned int m = execlists->port_mask; > - > - GEM_BUG_ON(port_index(port, execlists) != 0); > - GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_USER)); > - > - memmove(port, port + 1, m * sizeof(struct execlist_port)); > - memset(port + m, 0, sizeof(struct execlist_port)); > - > - return port; > -} > - > static inline u32 > intel_read_status_page(const struct intel_engine_cs *engine, int reg) > { > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > index 7fd33e81c2d9..d45328e254dc 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c > +++ 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c > @@ -508,6 +508,10 @@ void intel_engine_init_execlists(struct intel_engine_cs *engine) > GEM_BUG_ON(!is_power_of_2(execlists_num_ports(execlists))); > GEM_BUG_ON(execlists_num_ports(execlists) > EXECLIST_MAX_PORTS); > > + memset(execlists->pending, 0, sizeof(execlists->pending)); > + execlists->active = > + memset(execlists->inflight, 0, sizeof(execlists->inflight)); > + > execlists->queue_priority_hint = INT_MIN; > execlists->queue = RB_ROOT_CACHED; > } > @@ -1152,7 +1156,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine) > return true; > > /* Waiting to drain ELSP? */ > - if (READ_ONCE(engine->execlists.active)) { > + if (execlists_active(&engine->execlists)) { > struct tasklet_struct *t = &engine->execlists.tasklet; > > synchronize_hardirq(engine->i915->drm.irq); > @@ -1169,7 +1173,7 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine) > /* Otherwise flush the tasklet if it was on another cpu */ > tasklet_unlock_wait(t); > > - if (READ_ONCE(engine->execlists.active)) > + if (execlists_active(&engine->execlists)) > return false; > } > > @@ -1367,6 +1371,7 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine, > } > > if (HAS_EXECLISTS(dev_priv)) { > + struct i915_request * const *port, *rq; > const u32 *hws = > &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; > const u8 num_entries = execlists->csb_size; > @@ -1399,27 +1404,33 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine, > } > > spin_lock_irqsave(&engine->active.lock, flags); > - for (idx = 0; idx < execlists_num_ports(execlists); idx++) { > - struct i915_request *rq; > - unsigned int count; > + for (port = execlists->active; (rq = *port); port++) { > + char hdr[80]; > + int len; > + > + len = snprintf(hdr, sizeof(hdr), > + "\t\tActive[%d: ", > + (int)(port - execlists->active)); > + if (!i915_request_signaled(rq)) > + len += snprintf(hdr + len, sizeof(hdr) - len, > + "ring:{start:%08x, hwsp:%08x, seqno:%08x}, ", > + i915_ggtt_offset(rq->ring->vma), > + rq->timeline->hwsp_offset, > + hwsp_seqno(rq)); > + snprintf(hdr + len, sizeof(hdr) - len, "rq: "); > + print_request(m, rq, hdr); > + } > + for (port = execlists->pending; (rq = *port); port++) { > char hdr[80]; > > - rq = port_unpack(&execlists->port[idx], &count); > - if (!rq) { > - drm_printf(m, "\t\tELSP[%d] idle\n", idx); > - } else if (!i915_request_signaled(rq)) { > - snprintf(hdr, sizeof(hdr), > - "\t\tELSP[%d] count=%d, ring:{start:%08x, hwsp:%08x, seqno:%08x}, rq: ", > - idx, count, > - i915_ggtt_offset(rq->ring->vma), > - rq->timeline->hwsp_offset, > - hwsp_seqno(rq)); > - print_request(m, rq, hdr); > - } else { > - print_request(m, rq, "\t\tELSP[%d] rq: "); > - } > + snprintf(hdr, sizeof(hdr), > + "\t\tPending[%d] ring:{start:%08x, hwsp:%08x, seqno:%08x}, rq: ", > + (int)(port - execlists->pending), > + i915_ggtt_offset(rq->ring->vma), > + rq->timeline->hwsp_offset, > + hwsp_seqno(rq)); > + print_request(m, rq, hdr); > } > - drm_printf(m, "\t\tHW active? 
0x%x\n", execlists->active); > spin_unlock_irqrestore(&engine->active.lock, flags); > } else if (INTEL_GEN(dev_priv) > 6) { > drm_printf(m, "\tPP_DIR_BASE: 0x%08x\n", > @@ -1583,15 +1594,19 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine) > } > > if (engine->stats.enabled++ == 0) { > - const struct execlist_port *port = execlists->port; > - unsigned int num_ports = execlists_num_ports(execlists); > + struct i915_request * const *port; > + struct i915_request *rq; > > engine->stats.enabled_at = ktime_get(); > > /* XXX submission method oblivious? */ > - while (num_ports-- && port_isset(port)) { > + for (port = execlists->active; (rq = *port); port++) > engine->stats.active++; > - port++; > + > + for (port = execlists->pending; (rq = *port); port++) { > + /* Exclude any contexts already counted in active */ > + if (intel_context_inflight_count(rq->hw_context) == 1) > + engine->stats.active++; > } > > if (engine->stats.active) > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h > index 43e975a26016..411b7a807b99 100644 > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h > @@ -172,51 +172,10 @@ struct intel_engine_execlists { > */ > u32 __iomem *ctrl_reg; > > - /** > - * @port: execlist port states > - * > - * For each hardware ELSP (ExecList Submission Port) we keep > - * track of the last request and the number of times we submitted > - * that port to hw. We then count the number of times the hw reports > - * a context completion or preemption. As only one context can > - * be active on hw, we limit resubmission of context to port[0]. This > - * is called Lite Restore, of the context. > - */ > - struct execlist_port { > - /** > - * @request_count: combined request and submission count > - */ > - struct i915_request *request_count; > -#define EXECLIST_COUNT_BITS 2 > -#define port_request(p) ptr_mask_bits((p)->request_count, EXECLIST_COUNT_BITS) > -#define port_count(p) ptr_unmask_bits((p)->request_count, EXECLIST_COUNT_BITS) > -#define port_pack(rq, count) ptr_pack_bits(rq, count, EXECLIST_COUNT_BITS) > -#define port_unpack(p, count) ptr_unpack_bits((p)->request_count, count, EXECLIST_COUNT_BITS) > -#define port_set(p, packed) ((p)->request_count = (packed)) > -#define port_isset(p) ((p)->request_count) > -#define port_index(p, execlists) ((p) - (execlists)->port) > - > - /** > - * @context_id: context ID for port > - */ > - GEM_DEBUG_DECL(u32 context_id); > - > #define EXECLIST_MAX_PORTS 2 > - } port[EXECLIST_MAX_PORTS]; > - > - /** > - * @active: is the HW active? We consider the HW as active after > - * submitting any context for execution and until we have seen the > - * last context completion event. After that, we do not expect any > - * more events until we submit, and so can park the HW. > - * > - * As we have a small number of different sources from which we feed > - * the HW, we track the state of each inside a single bitfield. 
> - */ > - unsigned int active; > -#define EXECLISTS_ACTIVE_USER 0 > -#define EXECLISTS_ACTIVE_PREEMPT 1 > -#define EXECLISTS_ACTIVE_HWACK 2 > + struct i915_request * const *active; > + struct i915_request *inflight[EXECLIST_MAX_PORTS + 1 /* sentinel */]; > + struct i915_request *pending[EXECLIST_MAX_PORTS + 1]; > > /** > * @port_mask: number of execlist ports - 1 > @@ -257,11 +216,6 @@ struct intel_engine_execlists { > */ > u32 *csb_status; > > - /** > - * @preempt_complete_status: expected CSB upon completing preemption > - */ > - u32 preempt_complete_status; > - > /** > * @csb_size: context status buffer FIFO size > */ > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c > index 82b7ace62d97..b1e45c651660 100644 > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c > @@ -161,6 +161,8 @@ > #define GEN8_CTX_STATUS_COMPLETED_MASK \ > (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED) > > +#define CTX_DESC_FORCE_RESTORE BIT_ULL(2) > + > /* Typical size of the average request (2 pipecontrols and a MI_BB) */ > #define EXECLISTS_REQUEST_SIZE 64 /* bytes */ > #define WA_TAIL_DWORDS 2 > @@ -221,6 +223,14 @@ static void execlists_init_reg_state(u32 *reg_state, > struct intel_engine_cs *engine, > struct intel_ring *ring); > > +static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine) > +{ > + return (i915_ggtt_offset(engine->status_page.vma) + > + I915_GEM_HWS_PREEMPT_ADDR); > +} > + > +#define ring_pause(E) ((E)->status_page.addr[I915_GEM_HWS_PREEMPT]) Scary. Please lets make a function of ring_pause and use intel_write_status_page in it. So I guess you have and you want squeeze the latency fruit. When we have everything in place, CI is green and everyone is happy, then we tear it down? > + > static inline struct i915_priolist *to_priolist(struct rb_node *rb) > { > return rb_entry(rb, struct i915_priolist, node); > @@ -271,12 +281,6 @@ static inline bool need_preempt(const struct intel_engine_cs *engine, > { > int last_prio; > > - if (!engine->preempt_context) > - return false; > - > - if (i915_request_completed(rq)) > - return false; > - > /* > * Check if the current priority hint merits a preemption attempt. > * > @@ -338,9 +342,6 @@ __maybe_unused static inline bool > assert_priority_queue(const struct i915_request *prev, > const struct i915_request *next) > { > - const struct intel_engine_execlists *execlists = > - &prev->engine->execlists; > - > /* > * Without preemption, the prev may refer to the still active element > * which we refuse to let go. > @@ -348,7 +349,7 @@ assert_priority_queue(const struct i915_request *prev, > * Even with preemption, there are times when we think it is better not > * to preempt and leave an ostensibly lower priority request in flight. > */ > - if (port_request(execlists->port) == prev) > + if (i915_request_is_active(prev)) > return true; > > return rq_prio(prev) >= rq_prio(next); > @@ -442,13 +443,11 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine) > struct intel_engine_cs *owner; > > if (i915_request_completed(rq)) > - break; > + continue; /* XXX */ Yeah, but what is the plan with the XXX. > > __i915_request_unsubmit(rq); > unwind_wa_tail(rq); > > - GEM_BUG_ON(rq->hw_context->inflight); > - > /* > * Push the request back into the queue for later resubmission. > * If this request is not native to this physical engine (i.e. 
> @@ -500,32 +499,32 @@ execlists_context_status_change(struct i915_request *rq, unsigned long status) > status, rq); > } > > -inline void > -execlists_user_begin(struct intel_engine_execlists *execlists, > - const struct execlist_port *port) > +static inline struct i915_request * > +execlists_schedule_in(struct i915_request *rq, int idx) > { > - execlists_set_active_once(execlists, EXECLISTS_ACTIVE_USER); > -} > + struct intel_context *ce = rq->hw_context; > + int count; > > -inline void > -execlists_user_end(struct intel_engine_execlists *execlists) > -{ > - execlists_clear_active(execlists, EXECLISTS_ACTIVE_USER); > -} > + trace_i915_request_in(rq, idx); > > -static inline void > -execlists_context_schedule_in(struct i915_request *rq) > -{ > - GEM_BUG_ON(rq->hw_context->inflight); > + count = intel_context_inflight_count(ce); > + if (!count) { > + intel_context_get(ce); > + ce->inflight = rq->engine; > + > + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); > + intel_engine_context_in(ce->inflight); > + } > > - execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN); > - intel_engine_context_in(rq->engine); > - rq->hw_context->inflight = rq->engine; > + intel_context_inflight_inc(ce); > + GEM_BUG_ON(intel_context_inflight(ce) != rq->engine); > + > + return i915_request_get(rq); > } > > -static void kick_siblings(struct i915_request *rq) > +static void kick_siblings(struct i915_request *rq, struct intel_context *ce) > { > - struct virtual_engine *ve = to_virtual_engine(rq->hw_context->engine); > + struct virtual_engine *ve = container_of(ce, typeof(*ve), context); > struct i915_request *next = READ_ONCE(ve->request); > > if (next && next->execution_mask & ~rq->execution_mask) > @@ -533,29 +532,42 @@ static void kick_siblings(struct i915_request *rq) > } > > static inline void > -execlists_context_schedule_out(struct i915_request *rq, unsigned long status) > +execlists_schedule_out(struct i915_request *rq) > { > - rq->hw_context->inflight = NULL; > - intel_engine_context_out(rq->engine); > - execlists_context_status_change(rq, status); > + struct intel_context *ce = rq->hw_context; > + > + GEM_BUG_ON(!intel_context_inflight_count(ce)); > + > trace_i915_request_out(rq); > > - /* > - * If this is part of a virtual engine, its next request may have > - * been blocked waiting for access to the active context. We have > - * to kick all the siblings again in case we need to switch (e.g. > - * the next request is not runnable on this engine). Hopefully, > - * we will already have submitted the next request before the > - * tasklet runs and do not need to rebuild each virtual tree > - * and kick everyone again. > - */ > - if (rq->engine != rq->hw_context->engine) > - kick_siblings(rq); > + intel_context_inflight_dec(ce); > + if (!intel_context_inflight_count(ce)) { > + intel_engine_context_out(ce->inflight); > + execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT); > + > + ce->inflight = NULL; > + intel_context_put(ce); > + > + /* > + * If this is part of a virtual engine, its next request may > + * have been blocked waiting for access to the active context. > + * We have to kick all the siblings again in case we need to > + * switch (e.g. the next request is not runnable on this > + * engine). Hopefully, we will already have submitted the next > + * request before the tasklet runs and do not need to rebuild > + * each virtual tree and kick everyone again. 
> + */ > + if (rq->engine != ce->engine) > + kick_siblings(rq, ce); > + } > + > + i915_request_put(rq); > } > > -static u64 execlists_update_context(struct i915_request *rq) > +static u64 execlists_update_context(const struct i915_request *rq) > { > struct intel_context *ce = rq->hw_context; > + u64 desc; > > ce->lrc_reg_state[CTX_RING_TAIL + 1] = > intel_ring_set_tail(rq->ring, rq->tail); > @@ -576,7 +588,11 @@ static u64 execlists_update_context(struct i915_request *rq) > * wmb). > */ > mb(); > - return ce->lrc_desc; > + > + desc = ce->lrc_desc; > + ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE; > + > + return desc; > } > > static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port) > @@ -590,12 +606,59 @@ static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc > } > } > > +static __maybe_unused void > +trace_ports(const struct intel_engine_execlists *execlists, > + const char *msg, > + struct i915_request * const *ports) > +{ > + const struct intel_engine_cs *engine = > + container_of(execlists, typeof(*engine), execlists); > + > + GEM_TRACE("%s: %s { %llx:%lld%s, %llx:%lld }\n", > + engine->name, msg, > + ports[0]->fence.context, > + ports[0]->fence.seqno, > + i915_request_completed(ports[0]) ? "!" : > + i915_request_started(ports[0]) ? "*" : > + "", > + ports[1] ? ports[1]->fence.context : 0, > + ports[1] ? ports[1]->fence.seqno : 0); > +} > + > +static __maybe_unused bool > +assert_pending_valid(const struct intel_engine_execlists *execlists, > + const char *msg) > +{ > + struct i915_request * const *port, *rq; > + struct intel_context *ce = NULL; > + > + trace_ports(execlists, msg, execlists->pending); > + > + if (execlists->pending[execlists_num_ports(execlists)]) > + return false; > + > + for (port = execlists->pending; (rq = *port); port++) { > + if (ce == rq->hw_context) > + return false; > + > + ce = rq->hw_context; > + if (i915_request_completed(rq)) > + continue; > + > + GEM_BUG_ON(i915_active_is_idle(&ce->active)); > + GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); > + } > + > + return ce; > +} > + > static void execlists_submit_ports(struct intel_engine_cs *engine) > { > struct intel_engine_execlists *execlists = &engine->execlists; > - struct execlist_port *port = execlists->port; > unsigned int n; > > + GEM_BUG_ON(!assert_pending_valid(execlists, "submit")); > + > /* > * We can skip acquiring intel_runtime_pm_get() here as it was taken > * on our behalf by the request (see i915_gem_mark_busy()) and it will > @@ -613,38 +676,16 @@ static void execlists_submit_ports(struct intel_engine_cs *engine) > * of elsq entries, keep this in mind before changing the loop below. > */ > for (n = execlists_num_ports(execlists); n--; ) { > - struct i915_request *rq; > - unsigned int count; > - u64 desc; > + struct i915_request *rq = execlists->pending[n]; > > - rq = port_unpack(&port[n], &count); > - if (rq) { > - GEM_BUG_ON(count > !n); > - if (!count++) > - execlists_context_schedule_in(rq); > - port_set(&port[n], port_pack(rq, count)); > - desc = execlists_update_context(rq); > - GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc)); > - > - GEM_TRACE("%s in[%d]: ctx=%d.%d, fence %llx:%lld (current %d), prio=%d\n", > - engine->name, n, > - port[n].context_id, count, > - rq->fence.context, rq->fence.seqno, > - hwsp_seqno(rq), > - rq_prio(rq)); > - } else { > - GEM_BUG_ON(!n); > - desc = 0; > - } > - > - write_desc(execlists, desc, n); > + write_desc(execlists, > + rq ? 
execlists_update_context(rq) : 0, > + n); > } > > /* we need to manually load the submit queue */ > if (execlists->ctrl_reg) > writel(EL_CTRL_LOAD, execlists->ctrl_reg); > - > - execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK); > } > > static bool ctx_single_port_submission(const struct intel_context *ce) > @@ -668,6 +709,7 @@ static bool can_merge_ctx(const struct intel_context *prev, > static bool can_merge_rq(const struct i915_request *prev, > const struct i915_request *next) > { > + GEM_BUG_ON(prev == next); > GEM_BUG_ON(!assert_priority_queue(prev, next)); > > if (!can_merge_ctx(prev->hw_context, next->hw_context)) > @@ -676,58 +718,6 @@ static bool can_merge_rq(const struct i915_request *prev, > return true; > } > > -static void port_assign(struct execlist_port *port, struct i915_request *rq) > -{ > - GEM_BUG_ON(rq == port_request(port)); > - > - if (port_isset(port)) > - i915_request_put(port_request(port)); > - > - port_set(port, port_pack(i915_request_get(rq), port_count(port))); > -} > - > -static void inject_preempt_context(struct intel_engine_cs *engine) > -{ > - struct intel_engine_execlists *execlists = &engine->execlists; > - struct intel_context *ce = engine->preempt_context; > - unsigned int n; > - > - GEM_BUG_ON(execlists->preempt_complete_status != > - upper_32_bits(ce->lrc_desc)); > - > - /* > - * Switch to our empty preempt context so > - * the state of the GPU is known (idle). > - */ > - GEM_TRACE("%s\n", engine->name); > - for (n = execlists_num_ports(execlists); --n; ) > - write_desc(execlists, 0, n); > - > - write_desc(execlists, ce->lrc_desc, n); > - > - /* we need to manually load the submit queue */ > - if (execlists->ctrl_reg) > - writel(EL_CTRL_LOAD, execlists->ctrl_reg); > - > - execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK); > - execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT); > - > - (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++); > -} > - > -static void complete_preempt_context(struct intel_engine_execlists *execlists) > -{ > - GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT)); > - > - if (inject_preempt_hang(execlists)) > - return; > - > - execlists_cancel_port_requests(execlists); > - __unwind_incomplete_requests(container_of(execlists, > - struct intel_engine_cs, > - execlists)); > -} > - > static void virtual_update_register_offsets(u32 *regs, > struct intel_engine_cs *engine) > { > @@ -792,7 +782,7 @@ static bool virtual_matches(const struct virtual_engine *ve, > * we reuse the register offsets). This is a very small > * hystersis on the greedy seelction algorithm. 
> */ > - inflight = READ_ONCE(ve->context.inflight); > + inflight = intel_context_inflight(&ve->context); > if (inflight && inflight != engine) > return false; > > @@ -815,13 +805,23 @@ static void virtual_xfer_breadcrumbs(struct virtual_engine *ve, > spin_unlock(&old->breadcrumbs.irq_lock); > } > > +static struct i915_request * > +last_active(const struct intel_engine_execlists *execlists) > +{ > + struct i915_request * const *last = execlists->active; > + > + while (*last && i915_request_completed(*last)) > + last++; > + > + return *last; > +} > + > static void execlists_dequeue(struct intel_engine_cs *engine) > { > struct intel_engine_execlists * const execlists = &engine->execlists; > - struct execlist_port *port = execlists->port; > - const struct execlist_port * const last_port = > - &execlists->port[execlists->port_mask]; > - struct i915_request *last = port_request(port); > + struct i915_request **port = execlists->pending; > + struct i915_request ** const last_port = port + execlists->port_mask; > + struct i915_request *last; > struct rb_node *rb; > bool submit = false; > > @@ -867,65 +867,72 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > break; > } > > + /* > + * If the queue is higher priority than the last > + * request in the currently active context, submit afresh. > + * We will resubmit again afterwards in case we need to split > + * the active context to interject the preemption request, > + * i.e. we will retrigger preemption following the ack in case > + * of trouble. > + */ > + last = last_active(execlists); > if (last) { > - /* > - * Don't resubmit or switch until all outstanding > - * preemptions (lite-restore) are seen. Then we > - * know the next preemption status we see corresponds > - * to this ELSP update. > - */ > - GEM_BUG_ON(!execlists_is_active(execlists, > - EXECLISTS_ACTIVE_USER)); > - GEM_BUG_ON(!port_count(&port[0])); > - > - /* > - * If we write to ELSP a second time before the HW has had > - * a chance to respond to the previous write, we can confuse > - * the HW and hit "undefined behaviour". After writing to ELSP, > - * we must then wait until we see a context-switch event from > - * the HW to indicate that it has had a chance to respond. > - */ > - if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK)) > - return; > - > if (need_preempt(engine, last, rb)) { > - inject_preempt_context(engine); > - return; > - } > + GEM_TRACE("%s: preempting last=%llx:%lld, prio=%d, hint=%d\n", > + engine->name, > + last->fence.context, > + last->fence.seqno, > + last->sched.attr.priority, > + execlists->queue_priority_hint); > + /* > + * Don't let the RING_HEAD advance past the breadcrumb > + * as we unwind (and until we resubmit) so that we do > + * not accidentally tell it to go backwards. > + */ > + ring_pause(engine) = 1; > > - /* > - * In theory, we could coalesce more requests onto > - * the second port (the first port is active, with > - * no preemptions pending). However, that means we > - * then have to deal with the possible lite-restore > - * of the second port (as we submit the ELSP, there > - * may be a context-switch) but also we may complete > - * the resubmission before the context-switch. Ergo, > - * coalescing onto the second port will cause a > - * preemption event, but we cannot predict whether > - * that will affect port[0] or port[1]. > - * > - * If the second port is already active, we can wait > - * until the next context-switch before contemplating > - * new requests. 
The GPU will be busy and we should be > - * able to resubmit the new ELSP before it idles, > - * avoiding pipeline bubbles (momentary pauses where > - * the driver is unable to keep up the supply of new > - * work). However, we have to double check that the > - * priorities of the ports haven't been switch. > - */ > - if (port_count(&port[1])) > - return; > + /* > + * Note that we have not stopped the GPU at this point, > + * so we are unwinding the incomplete requests as they > + * remain inflight and so by the time we do complete > + * the preemption, some of the unwound requests may > + * complete! > + */ > + __unwind_incomplete_requests(engine); > > - /* > - * WaIdleLiteRestore:bdw,skl > - * Apply the wa NOOPs to prevent > - * ring:HEAD == rq:TAIL as we resubmit the > - * request. See gen8_emit_fini_breadcrumb() for > - * where we prepare the padding after the > - * end of the request. > - */ > - last->tail = last->wa_tail; > + /* > + * If we need to return to the preempted context, we > + * need to skip the lite-restore and force it to > + * reload the RING_TAIL. Otherwise, the HW has a > + * tendency to ignore us rewinding the TAIL to the > + * end of an earlier request. > + */ > + last->hw_context->lrc_desc |= CTX_DESC_FORCE_RESTORE; > + last = NULL; > + } else { > + /* > + * Otherwise if we already have a request pending > + * for execution after the current one, we can > + * just wait until the next CS event before > + * queuing more. In either case we will force a > + * lite-restore preemption event, but if we wait > + * we hopefully coalesce several updates into a single > + * submission. > + */ > + if (!list_is_last(&last->sched.link, > + &engine->active.requests)) > + return; > + > + /* > + * WaIdleLiteRestore:bdw,skl > + * Apply the wa NOOPs to prevent > + * ring:HEAD == rq:TAIL as we resubmit the > + * request. See gen8_emit_fini_breadcrumb() for > + * where we prepare the padding after the > + * end of the request. > + */ > + last->tail = last->wa_tail; > + } > } > > while (rb) { /* XXX virtual is always taking precedence */ > @@ -955,9 +962,24 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > continue; > } > > + if (i915_request_completed(rq)) { > + ve->request = NULL; > + ve->base.execlists.queue_priority_hint = INT_MIN; > + rb_erase_cached(rb, &execlists->virtual); > + RB_CLEAR_NODE(rb); > + > + rq->engine = engine; > + __i915_request_submit(rq); > + > + spin_unlock(&ve->base.active.lock); > + > + rb = rb_first_cached(&execlists->virtual); > + continue; > + } > + > if (last && !can_merge_rq(last, rq)) { > spin_unlock(&ve->base.active.lock); > - return; /* leave this rq for another engine */ > + return; /* leave this for another */ > } > > GEM_TRACE("%s: virtual rq=%llx:%lld%s, new engine? %s\n", > @@ -1006,9 +1028,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > } > > __i915_request_submit(rq); > - trace_i915_request_in(rq, port_index(port, execlists)); > - submit = true; > - last = rq; > + if (!i915_request_completed(rq)) { > + submit = true; > + last = rq; > + } > } > > spin_unlock(&ve->base.active.lock); > @@ -1021,6 +1044,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > int i; > > priolist_for_each_request_consume(rq, rn, p, i) { > + if (i915_request_completed(rq)) > + goto skip; > + > /* > * Can we combine this request with the current port? 
> * It has to be the same context/ringbuffer and not > @@ -1060,19 +1086,14 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > ctx_single_port_submission(rq->hw_context)) > goto done; > > - > - if (submit) > - port_assign(port, last); > + *port = execlists_schedule_in(last, port - execlists->pending); > port++; > - > - GEM_BUG_ON(port_isset(port)); > } > > - __i915_request_submit(rq); > - trace_i915_request_in(rq, port_index(port, execlists)); > - > last = rq; > submit = true; > +skip: > + __i915_request_submit(rq); > } > > rb_erase_cached(&p->node, &execlists->queue); > @@ -1097,54 +1118,30 @@ static void execlists_dequeue(struct intel_engine_cs *engine) > * interrupt for secondary ports). > */ > execlists->queue_priority_hint = queue_prio(execlists); > + GEM_TRACE("%s: queue_priority_hint:%d, submit:%s\n", > + engine->name, execlists->queue_priority_hint, > + yesno(submit)); > > if (submit) { > - port_assign(port, last); > + *port = execlists_schedule_in(last, port - execlists->pending); > + memset(port + 1, 0, (last_port - port) * sizeof(*port)); > execlists_submit_ports(engine); > } > - > - /* We must always keep the beast fed if we have work piled up */ > - GEM_BUG_ON(rb_first_cached(&execlists->queue) && > - !port_isset(execlists->port)); > - > - /* Re-evaluate the executing context setup after each preemptive kick */ > - if (last) > - execlists_user_begin(execlists, execlists->port); > - > - /* If the engine is now idle, so should be the flag; and vice versa. */ > - GEM_BUG_ON(execlists_is_active(&engine->execlists, > - EXECLISTS_ACTIVE_USER) == > - !port_isset(engine->execlists.port)); > } > > void > execlists_cancel_port_requests(struct intel_engine_execlists * const execlists) > { > - struct execlist_port *port = execlists->port; > - unsigned int num_ports = execlists_num_ports(execlists); > - > - while (num_ports-- && port_isset(port)) { > - struct i915_request *rq = port_request(port); > - > - GEM_TRACE("%s:port%u fence %llx:%lld, (current %d)\n", > - rq->engine->name, > - (unsigned int)(port - execlists->port), > - rq->fence.context, rq->fence.seqno, > - hwsp_seqno(rq)); > + struct i915_request * const *port, *rq; > > - GEM_BUG_ON(!execlists->active); > - execlists_context_schedule_out(rq, > - i915_request_completed(rq) ? 
> - INTEL_CONTEXT_SCHEDULE_OUT : > - INTEL_CONTEXT_SCHEDULE_PREEMPTED); > + for (port = execlists->pending; (rq = *port); port++) > + execlists_schedule_out(rq); > + memset(execlists->pending, 0, sizeof(execlists->pending)); > > - i915_request_put(rq); > - > - memset(port, 0, sizeof(*port)); > - port++; > - } > - > - execlists_clear_all_active(execlists); > + for (port = execlists->active; (rq = *port); port++) > + execlists_schedule_out(rq); > + execlists->active = > + memset(execlists->inflight, 0, sizeof(execlists->inflight)); > } > > static inline void > @@ -1163,7 +1160,6 @@ reset_in_progress(const struct intel_engine_execlists *execlists) > static void process_csb(struct intel_engine_cs *engine) > { > struct intel_engine_execlists * const execlists = &engine->execlists; > - struct execlist_port *port = execlists->port; > const u32 * const buf = execlists->csb_status; > const u8 num_entries = execlists->csb_size; > u8 head, tail; > @@ -1198,9 +1194,7 @@ static void process_csb(struct intel_engine_cs *engine) > rmb(); > > do { > - struct i915_request *rq; > unsigned int status; > - unsigned int count; > > if (++head == num_entries) > head = 0; > @@ -1223,68 +1217,37 @@ static void process_csb(struct intel_engine_cs *engine) > * status notifier. > */ > > - GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x, active=0x%x\n", > + GEM_TRACE("%s csb[%d]: status=0x%08x:0x%08x\n", > engine->name, head, > - buf[2 * head + 0], buf[2 * head + 1], > - execlists->active); > + buf[2 * head + 0], buf[2 * head + 1]); > > status = buf[2 * head]; > - if (status & (GEN8_CTX_STATUS_IDLE_ACTIVE | > - GEN8_CTX_STATUS_PREEMPTED)) > - execlists_set_active(execlists, > - EXECLISTS_ACTIVE_HWACK); > - if (status & GEN8_CTX_STATUS_ACTIVE_IDLE) > - execlists_clear_active(execlists, > - EXECLISTS_ACTIVE_HWACK); > - > - if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK)) > - continue; > + if (status & GEN8_CTX_STATUS_IDLE_ACTIVE) { > +promote: > + GEM_BUG_ON(!assert_pending_valid(execlists, "promote")); > + execlists->active = > + memcpy(execlists->inflight, > + execlists->pending, > + execlists_num_ports(execlists) * > + sizeof(*execlists->pending)); > + execlists->pending[0] = NULL; I can't decide if comment or a helper inline function would serve better as documentation of between inflight and pending movement. I guess it is better to be left as a future work after the dust settles. Just general yearning for a similar kind of level of documentation steps as in dequeue. > > - /* We should never get a COMPLETED | IDLE_ACTIVE! */ > - GEM_BUG_ON(status & GEN8_CTX_STATUS_IDLE_ACTIVE); Is our assert coverage going to suffer? > + if (!inject_preempt_hang(execlists)) > + ring_pause(engine) = 0; > + } else if (status & GEN8_CTX_STATUS_PREEMPTED) { > + struct i915_request * const *port = execlists->active; > > - if (status & GEN8_CTX_STATUS_COMPLETE && > - buf[2*head + 1] == execlists->preempt_complete_status) { > - GEM_TRACE("%s preempt-idle\n", engine->name); > - complete_preempt_context(execlists); > - continue; > - } > - > - if (status & GEN8_CTX_STATUS_PREEMPTED && > - execlists_is_active(execlists, > - EXECLISTS_ACTIVE_PREEMPT)) > - continue; > + trace_ports(execlists, "preempted", execlists->active); > > - GEM_BUG_ON(!execlists_is_active(execlists, > - EXECLISTS_ACTIVE_USER)); > + while (*port) > + execlists_schedule_out(*port++); > > - rq = port_unpack(port, &count); > - GEM_TRACE("%s out[0]: ctx=%d.%d, fence %llx:%lld (current %d), prio=%d\n", > - engine->name, > - port->context_id, count, > - rq ? 
rq->fence.context : 0, > - rq ? rq->fence.seqno : 0, > - rq ? hwsp_seqno(rq) : 0, > - rq ? rq_prio(rq) : 0); > + goto promote; > + } else if (*execlists->active) { > + struct i915_request *rq = *execlists->active++; > > - /* Check the context/desc id for this event matches */ > - GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id); > - > - GEM_BUG_ON(count == 0); > - if (--count == 0) { > - /* > - * On the final event corresponding to the > - * submission of this context, we expect either > - * an element-switch event or a completion > - * event (and on completion, the active-idle > - * marker). No more preemptions, lite-restore > - * or otherwise. > - */ > - GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED); > - GEM_BUG_ON(port_isset(&port[1]) && > - !(status & GEN8_CTX_STATUS_ELEMENT_SWITCH)); > - GEM_BUG_ON(!port_isset(&port[1]) && > - !(status & GEN8_CTX_STATUS_ACTIVE_IDLE)); > + trace_ports(execlists, "completed", > + execlists->active - 1); > > /* > * We rely on the hardware being strongly > @@ -1293,21 +1256,10 @@ static void process_csb(struct intel_engine_cs *engine) > * user interrupt and CSB is processed. > */ > GEM_BUG_ON(!i915_request_completed(rq)); > + execlists_schedule_out(rq); > > - execlists_context_schedule_out(rq, > - INTEL_CONTEXT_SCHEDULE_OUT); > - i915_request_put(rq); > - > - GEM_TRACE("%s completed ctx=%d\n", > - engine->name, port->context_id); > - > - port = execlists_port_complete(execlists, port); > - if (port_isset(port)) > - execlists_user_begin(execlists, port); > - else > - execlists_user_end(execlists); > - } else { > - port_set(port, port_pack(rq, count)); > + GEM_BUG_ON(execlists->active - execlists->inflight > > + execlists_num_ports(execlists)); > } > } while (head != tail); > > @@ -1332,7 +1284,7 @@ static void __execlists_submission_tasklet(struct intel_engine_cs *const engine) > lockdep_assert_held(&engine->active.lock); > > process_csb(engine); > - if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT)) > + if (!engine->execlists.pending[0]) > execlists_dequeue(engine); > } > > @@ -1345,11 +1297,6 @@ static void execlists_submission_tasklet(unsigned long data) > struct intel_engine_cs * const engine = (struct intel_engine_cs *)data; > unsigned long flags; > > - GEM_TRACE("%s awake?=%d, active=%x\n", > - engine->name, > - !!intel_wakeref_active(&engine->wakeref), > - engine->execlists.active); > - > spin_lock_irqsave(&engine->active.lock, flags); > __execlists_submission_tasklet(engine); > spin_unlock_irqrestore(&engine->active.lock, flags); > @@ -1376,12 +1323,16 @@ static void __submit_queue_imm(struct intel_engine_cs *engine) > tasklet_hi_schedule(&execlists->tasklet); > } > > -static void submit_queue(struct intel_engine_cs *engine, int prio) > +static void submit_queue(struct intel_engine_cs *engine, > + const struct i915_request *rq) > { > - if (prio > engine->execlists.queue_priority_hint) { > - engine->execlists.queue_priority_hint = prio; > - __submit_queue_imm(engine); > - } > + struct intel_engine_execlists *execlists = &engine->execlists; > + > + if (rq_prio(rq) <= execlists->queue_priority_hint) > + return; > + > + execlists->queue_priority_hint = rq_prio(rq); > + __submit_queue_imm(engine); > } > > static void execlists_submit_request(struct i915_request *request) > @@ -1397,7 +1348,7 @@ static void execlists_submit_request(struct i915_request *request) > GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root)); > GEM_BUG_ON(list_empty(&request->sched.link)); > > - submit_queue(engine, rq_prio(request)); > + 
submit_queue(engine, request); > > spin_unlock_irqrestore(&engine->active.lock, flags); > } > @@ -2048,27 +1999,13 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine) > spin_unlock_irqrestore(&engine->active.lock, flags); > } > > -static bool lrc_regs_ok(const struct i915_request *rq) > -{ > - const struct intel_ring *ring = rq->ring; > - const u32 *regs = rq->hw_context->lrc_reg_state; > - > - /* Quick spot check for the common signs of context corruption */ > - > - if (regs[CTX_RING_BUFFER_CONTROL + 1] != > - (RING_CTL_SIZE(ring->size) | RING_VALID)) > - return false; > - > - if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma)) > - return false; > - > - return true; > -} > - > -static void reset_csb_pointers(struct intel_engine_execlists *execlists) > +static void reset_csb_pointers(struct intel_engine_cs *engine) > { > + struct intel_engine_execlists * const execlists = &engine->execlists; > const unsigned int reset_value = execlists->csb_size - 1; > > + ring_pause(engine) = 0; > + > /* > * After a reset, the HW starts writing into CSB entry [0]. We > * therefore have to set our HEAD pointer back one entry so that > @@ -2115,18 +2052,21 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) > process_csb(engine); /* drain preemption events */ > > /* Following the reset, we need to reload the CSB read/write pointers */ > - reset_csb_pointers(&engine->execlists); > + reset_csb_pointers(engine); > > /* > * Save the currently executing context, even if we completed > * its request, it was still running at the time of the > * reset and will have been clobbered. > */ > - if (!port_isset(execlists->port)) > - goto out_clear; > + rq = execlists_active(execlists); > + if (!rq) > + return; > > - rq = port_request(execlists->port); > ce = rq->hw_context; > + GEM_BUG_ON(i915_active_is_idle(&ce->active)); > + GEM_BUG_ON(!i915_vma_is_pinned(ce->state)); > + rq = active_request(rq); > > /* > * Catch up with any missed context-switch interrupts. > @@ -2139,9 +2079,12 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) > */ > execlists_cancel_port_requests(execlists); > > - rq = active_request(rq); > - if (!rq) > + if (!rq) { > + ce->ring->head = ce->ring->tail; > goto out_replay; > + } > + > + ce->ring->head = intel_ring_wrap(ce->ring, rq->head); > > /* > * If this request hasn't started yet, e.g. it is waiting on a > @@ -2155,7 +2098,7 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) > * Otherwise, if we have not started yet, the request should replay > * perfectly and we do not need to flag the result as being erroneous. > */ > - if (!i915_request_started(rq) && lrc_regs_ok(rq)) > + if (!i915_request_started(rq)) > goto out_replay; > > /* > @@ -2170,7 +2113,7 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) > * image back to the expected values to skip over the guilty request. > */ > i915_reset_request(rq, stalled); > - if (!stalled && lrc_regs_ok(rq)) > + if (!stalled) > goto out_replay; > > /* > @@ -2190,17 +2133,13 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled) > execlists_init_reg_state(regs, ce, engine, ce->ring); > > out_replay: > - /* Rerun the request; its payload has been neutered (if guilty). */ > - ce->ring->head = > - rq ? 
intel_ring_wrap(ce->ring, rq->head) : ce->ring->tail; > + GEM_TRACE("%s replay {head:%04x, tail:%04x\n", > + engine->name, ce->ring->head, ce->ring->tail); > intel_ring_update_space(ce->ring); > __execlists_update_reg_state(ce, engine); > > /* Push back any incomplete requests for replay after the reset. */ > __unwind_incomplete_requests(engine); > - > -out_clear: > - execlists_clear_all_active(execlists); > } > > static void execlists_reset(struct intel_engine_cs *engine, bool stalled) > @@ -2296,7 +2235,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine) > > execlists->queue_priority_hint = INT_MIN; > execlists->queue = RB_ROOT_CACHED; > - GEM_BUG_ON(port_isset(execlists->port)); > > GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet)); > execlists->tasklet.func = nop_submission_tasklet; > @@ -2514,15 +2452,29 @@ static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs) > return cs; > } > > +static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs) > +{ > + *cs++ = MI_SEMAPHORE_WAIT | > + MI_SEMAPHORE_GLOBAL_GTT | > + MI_SEMAPHORE_POLL | > + MI_SEMAPHORE_SAD_EQ_SDD; > + *cs++ = 0; > + *cs++ = intel_hws_preempt_address(request->engine); > + *cs++ = 0; > + > + return cs; > +} > + > static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs) > { > cs = gen8_emit_ggtt_write(cs, > request->fence.seqno, > request->timeline->hwsp_offset, > 0); > - > *cs++ = MI_USER_INTERRUPT; > + > *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; This was discussed in irc, could warrant a comment here of why this is needed. Precious info. > + cs = emit_preempt_busywait(request, cs); > > request->tail = intel_ring_offset(request, cs); > assert_ring_tail_valid(request->ring, request->tail); > @@ -2543,9 +2495,10 @@ static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs) > PIPE_CONTROL_FLUSH_ENABLE | > PIPE_CONTROL_CS_STALL, > 0); > - > *cs++ = MI_USER_INTERRUPT; > + > *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; Comment could be copied here. Or a common helper created for common parts. 
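Roughly what I have in mind, as an untested sketch (the helper name is only a placeholder): pull the duplicated tail of the two fini_breadcrumb emitters into one place, which also gives the arbitration comment a single home:

static u32 *
gen8_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
{
        *cs++ = MI_USER_INTERRUPT;

        /*
         * XXX the irc explanation of why arbitration has to be
         * re-enabled ahead of the preempt busywait belongs here.
         */
        *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
        cs = emit_preempt_busywait(request, cs);

        request->tail = intel_ring_offset(request, cs);
        assert_ring_tail_valid(request->ring, request->tail);

        return cs;
}

Both gen8_emit_fini_breadcrumb() and gen8_emit_fini_breadcrumb_rcs() would then just emit their seqno write / pipe control and end with this, keeping whatever follows the assert (not visible in the hunks above) in the callers or folding it in as well.
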
-Mika > + cs = emit_preempt_busywait(request, cs); > > request->tail = intel_ring_offset(request, cs); > assert_ring_tail_valid(request->ring, request->tail); > @@ -2594,8 +2547,7 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine) > engine->flags |= I915_ENGINE_SUPPORTS_STATS; > if (!intel_vgpu_active(engine->i915)) > engine->flags |= I915_ENGINE_HAS_SEMAPHORES; > - if (engine->preempt_context && > - HAS_LOGICAL_RING_PREEMPTION(engine->i915)) > + if (HAS_LOGICAL_RING_PREEMPTION(engine->i915)) > engine->flags |= I915_ENGINE_HAS_PREEMPTION; > } > > @@ -2718,11 +2670,6 @@ int intel_execlists_submission_init(struct intel_engine_cs *engine) > i915_mmio_reg_offset(RING_ELSP(base)); > } > > - execlists->preempt_complete_status = ~0u; > - if (engine->preempt_context) > - execlists->preempt_complete_status = > - upper_32_bits(engine->preempt_context->lrc_desc); > - > execlists->csb_status = > &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX]; > > @@ -2734,7 +2681,7 @@ int intel_execlists_submission_init(struct intel_engine_cs *engine) > else > execlists->csb_size = GEN11_CSB_ENTRIES; > > - reset_csb_pointers(execlists); > + reset_csb_pointers(engine); > > return 0; > } > @@ -2917,11 +2864,6 @@ populate_lr_context(struct intel_context *ce, > if (!engine->default_state) > regs[CTX_CONTEXT_CONTROL + 1] |= > _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT); > - if (ce->gem_context == engine->i915->preempt_context && > - INTEL_GEN(engine->i915) < 11) > - regs[CTX_CONTEXT_CONTROL + 1] |= > - _MASKED_BIT_ENABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT | > - CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT); > > ret = 0; > err_unpin_ctx: > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index b7e9fddef270..a497cf7acb6a 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -1248,10 +1248,10 @@ static void error_record_engine_registers(struct i915_gpu_state *error, > } > } > > -static void record_request(struct i915_request *request, > +static void record_request(const struct i915_request *request, > struct drm_i915_error_request *erq) > { > - struct i915_gem_context *ctx = request->gem_context; > + const struct i915_gem_context *ctx = request->gem_context; > > erq->flags = request->fence.flags; > erq->context = request->fence.context; > @@ -1315,20 +1315,15 @@ static void engine_record_requests(struct intel_engine_cs *engine, > ee->num_requests = count; > } > > -static void error_record_engine_execlists(struct intel_engine_cs *engine, > +static void error_record_engine_execlists(const struct intel_engine_cs *engine, > struct drm_i915_error_engine *ee) > { > const struct intel_engine_execlists * const execlists = &engine->execlists; > - unsigned int n; > + struct i915_request * const *port = execlists->active; > + unsigned int n = 0; > > - for (n = 0; n < execlists_num_ports(execlists); n++) { > - struct i915_request *rq = port_request(&execlists->port[n]); > - > - if (!rq) > - break; > - > - record_request(rq, &ee->execlist[n]); > - } > + while (*port) > + record_request(*port++, &ee->execlist[n++]); > > ee->num_ports = n; > } > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c > index 7083e6ab92c5..0c99694faab7 100644 > --- a/drivers/gpu/drm/i915/i915_request.c > +++ b/drivers/gpu/drm/i915/i915_request.c > @@ -276,6 +276,12 @@ static bool i915_request_retire(struct i915_request *rq) > > local_irq_disable(); > > + /* > + * We only loosely track inflight requests 
across preemption, > + * and so we may find ourselves attempting to retire a _completed_ > + * request that we have removed from the HW and put back on a run > + * queue. > + */ > spin_lock(&rq->engine->active.lock); > list_del(&rq->sched.link); > spin_unlock(&rq->engine->active.lock); > diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h > index edbbdfec24ab..bebc1e9b4a5e 100644 > --- a/drivers/gpu/drm/i915/i915_request.h > +++ b/drivers/gpu/drm/i915/i915_request.h > @@ -28,6 +28,7 @@ > #include <linux/dma-fence.h> > #include <linux/lockdep.h> > > +#include "gt/intel_context_types.h" > #include "gt/intel_engine_types.h" > > #include "i915_gem.h" > diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c > index 2e9b38bdc33c..b1ba3e65cd52 100644 > --- a/drivers/gpu/drm/i915/i915_scheduler.c > +++ b/drivers/gpu/drm/i915/i915_scheduler.c > @@ -179,8 +179,7 @@ static inline int rq_prio(const struct i915_request *rq) > > static void kick_submission(struct intel_engine_cs *engine, int prio) > { > - const struct i915_request *inflight = > - port_request(engine->execlists.port); > + const struct i915_request *inflight = *engine->execlists.active; > > /* > * If we are already the currently executing context, don't > diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h > index 2987219a6300..4920ff9aba62 100644 > --- a/drivers/gpu/drm/i915/i915_utils.h > +++ b/drivers/gpu/drm/i915/i915_utils.h > @@ -131,6 +131,18 @@ __check_struct_size(size_t base, size_t arr, size_t count, size_t *size) > ((typeof(ptr))((unsigned long)(ptr) | __bits)); \ > }) > > +#define ptr_count_dec(p_ptr) do { \ > + typeof(p_ptr) __p = (p_ptr); \ > + unsigned long __v = (unsigned long)(*__p); \ > + *__p = (typeof(*p_ptr))(--__v); \ > +} while (0) > + > +#define ptr_count_inc(p_ptr) do { \ > + typeof(p_ptr) __p = (p_ptr); \ > + unsigned long __v = (unsigned long)(*__p); \ > + *__p = (typeof(*p_ptr))(++__v); \ > +} while (0) > + > #define page_mask_bits(ptr) ptr_mask_bits(ptr, PAGE_SHIFT) > #define page_unmask_bits(ptr) ptr_unmask_bits(ptr, PAGE_SHIFT) > #define page_pack_bits(ptr, bits) ptr_pack_bits(ptr, bits, PAGE_SHIFT) > diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c > index db531ebc7704..12c22359fdac 100644 > --- a/drivers/gpu/drm/i915/intel_guc_submission.c > +++ b/drivers/gpu/drm/i915/intel_guc_submission.c > @@ -32,7 +32,11 @@ > #include "intel_guc_submission.h" > #include "i915_drv.h" > > -#define GUC_PREEMPT_FINISHED 0x1 > +enum { > + GUC_PREEMPT_NONE = 0, > + GUC_PREEMPT_INPROGRESS, > + GUC_PREEMPT_FINISHED, > +}; > #define GUC_PREEMPT_BREADCRUMB_DWORDS 0x8 > #define GUC_PREEMPT_BREADCRUMB_BYTES \ > (sizeof(u32) * GUC_PREEMPT_BREADCRUMB_DWORDS) > @@ -537,15 +541,11 @@ static void guc_add_request(struct intel_guc *guc, struct i915_request *rq) > u32 ctx_desc = lower_32_bits(rq->hw_context->lrc_desc); > u32 ring_tail = intel_ring_set_tail(rq->ring, rq->tail) / sizeof(u64); > > - spin_lock(&client->wq_lock); > - > guc_wq_item_append(client, engine->guc_id, ctx_desc, > ring_tail, rq->fence.seqno); > guc_ring_doorbell(client); > > client->submissions[engine->id] += 1; > - > - spin_unlock(&client->wq_lock); > } > > /* > @@ -631,8 +631,9 @@ static void inject_preempt_context(struct work_struct *work) > data[6] = intel_guc_ggtt_offset(guc, guc->shared_data); > > if (WARN_ON(intel_guc_send(guc, data, ARRAY_SIZE(data)))) { > - execlists_clear_active(&engine->execlists, > - 
EXECLISTS_ACTIVE_PREEMPT); > + intel_write_status_page(engine, > + I915_GEM_HWS_PREEMPT, > + GUC_PREEMPT_NONE); > tasklet_schedule(&engine->execlists.tasklet); > } > > @@ -672,8 +673,6 @@ static void complete_preempt_context(struct intel_engine_cs *engine) > { > struct intel_engine_execlists *execlists = &engine->execlists; > > - GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT)); > - > if (inject_preempt_hang(execlists)) > return; > > @@ -681,89 +680,90 @@ static void complete_preempt_context(struct intel_engine_cs *engine) > execlists_unwind_incomplete_requests(execlists); > > wait_for_guc_preempt_report(engine); > - intel_write_status_page(engine, I915_GEM_HWS_PREEMPT, 0); > + intel_write_status_page(engine, I915_GEM_HWS_PREEMPT, GUC_PREEMPT_NONE); > } > > -/** > - * guc_submit() - Submit commands through GuC > - * @engine: engine associated with the commands > - * > - * The only error here arises if the doorbell hardware isn't functioning > - * as expected, which really shouln't happen. > - */ > -static void guc_submit(struct intel_engine_cs *engine) > +static void guc_submit(struct intel_engine_cs *engine, > + struct i915_request **out, > + struct i915_request **end) > { > struct intel_guc *guc = &engine->i915->guc; > - struct intel_engine_execlists * const execlists = &engine->execlists; > - struct execlist_port *port = execlists->port; > - unsigned int n; > + struct intel_guc_client *client = guc->execbuf_client; > > - for (n = 0; n < execlists_num_ports(execlists); n++) { > - struct i915_request *rq; > - unsigned int count; > + spin_lock(&client->wq_lock); > > - rq = port_unpack(&port[n], &count); > - if (rq && count == 0) { > - port_set(&port[n], port_pack(rq, ++count)); > + do { > + struct i915_request *rq = *out++; > > - flush_ggtt_writes(rq->ring->vma); > + flush_ggtt_writes(rq->ring->vma); > + guc_add_request(guc, rq); > + } while (out != end); > > - guc_add_request(guc, rq); > - } > - } > + spin_unlock(&client->wq_lock); > } > > -static void port_assign(struct execlist_port *port, struct i915_request *rq) > +static inline int rq_prio(const struct i915_request *rq) > { > - GEM_BUG_ON(port_isset(port)); > - > - port_set(port, i915_request_get(rq)); > + return rq->sched.attr.priority | __NO_PREEMPTION; > } > > -static inline int rq_prio(const struct i915_request *rq) > +static struct i915_request *schedule_in(struct i915_request *rq, int idx) > { > - return rq->sched.attr.priority; > + trace_i915_request_in(rq, idx); > + > + if (!rq->hw_context->inflight) > + rq->hw_context->inflight = rq->engine; > + intel_context_inflight_inc(rq->hw_context); > + > + return i915_request_get(rq); > } > > -static inline int port_prio(const struct execlist_port *port) > +static void schedule_out(struct i915_request *rq) > { > - return rq_prio(port_request(port)) | __NO_PREEMPTION; > + trace_i915_request_out(rq); > + > + intel_context_inflight_dec(rq->hw_context); > + if (!intel_context_inflight_count(rq->hw_context)) > + rq->hw_context->inflight = NULL; > + > + i915_request_put(rq); > } > > -static bool __guc_dequeue(struct intel_engine_cs *engine) > +static void __guc_dequeue(struct intel_engine_cs *engine) > { > struct intel_engine_execlists * const execlists = &engine->execlists; > - struct execlist_port *port = execlists->port; > - struct i915_request *last = NULL; > - const struct execlist_port * const last_port = > - &execlists->port[execlists->port_mask]; > + struct i915_request **first = execlists->inflight; > + struct i915_request ** const last_port = first + 
> -static bool __guc_dequeue(struct intel_engine_cs *engine)
> +static void __guc_dequeue(struct intel_engine_cs *engine)
>  {
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
> -	struct execlist_port *port = execlists->port;
> -	struct i915_request *last = NULL;
> -	const struct execlist_port * const last_port =
> -		&execlists->port[execlists->port_mask];
> +	struct i915_request **first = execlists->inflight;
> +	struct i915_request ** const last_port = first + execlists->port_mask;
> +	struct i915_request *last = first[0];
> +	struct i915_request **port;
>  	bool submit = false;
>  	struct rb_node *rb;
>
>  	lockdep_assert_held(&engine->active.lock);
>
> -	if (port_isset(port)) {
> +	if (last) {
>  		if (intel_engine_has_preemption(engine)) {
>  			struct guc_preempt_work *preempt_work =
>  				&engine->i915->guc.preempt_work[engine->id];
>  			int prio = execlists->queue_priority_hint;
>
> -			if (i915_scheduler_need_preempt(prio,
> -							port_prio(port))) {
> -				execlists_set_active(execlists,
> -						     EXECLISTS_ACTIVE_PREEMPT);
> +			if (i915_scheduler_need_preempt(prio, rq_prio(last))) {
> +				intel_write_status_page(engine,
> +							I915_GEM_HWS_PREEMPT,
> +							GUC_PREEMPT_INPROGRESS);
>  				queue_work(engine->i915->guc.preempt_wq,
>  					   &preempt_work->work);
> -				return false;
> +				return;
>  			}
>  		}
>
> -		port++;
> -		if (port_isset(port))
> -			return false;
> +		if (*++first)
> +			return;
> +
> +		last = NULL;
>  	}
> -	GEM_BUG_ON(port_isset(port));
>
> +	port = first;
>  	while ((rb = rb_first_cached(&execlists->queue))) {
>  		struct i915_priolist *p = to_priolist(rb);
>  		struct i915_request *rq, *rn;
> @@ -774,18 +774,15 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
>  			if (port == last_port)
>  				goto done;
>
> -			if (submit)
> -				port_assign(port, last);
> +			*port = schedule_in(last,
> +					    port - execlists->inflight);
>  			port++;
>  		}
>
>  		list_del_init(&rq->sched.link);
> -
>  		__i915_request_submit(rq);
> -		trace_i915_request_in(rq, port_index(port, execlists));
> -
> -		last = rq;
>  		submit = true;
> +		last = rq;
>  	}
>
>  	rb_erase_cached(&p->node, &execlists->queue);
> @@ -794,58 +791,41 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
>  done:
>  	execlists->queue_priority_hint =
>  		rb ? to_priolist(rb)->priority : INT_MIN;
> -	if (submit)
> -		port_assign(port, last);
> -	if (last)
> -		execlists_user_begin(execlists, execlists->port);
> -
> -	/* We must always keep the beast fed if we have work piled up */
> -	GEM_BUG_ON(port_isset(execlists->port) &&
> -		   !execlists_is_active(execlists, EXECLISTS_ACTIVE_USER));
> -	GEM_BUG_ON(rb_first_cached(&execlists->queue) &&
> -		   !port_isset(execlists->port));
> -
> -	return submit;
> -}
> -
> -static void guc_dequeue(struct intel_engine_cs *engine)
> -{
> -	if (__guc_dequeue(engine))
> -		guc_submit(engine);
> +	if (submit) {
> +		*port = schedule_in(last, port - execlists->inflight);
> +		*++port = NULL;
> +		guc_submit(engine, first, port);
> +	}
> +	execlists->active = execlists->inflight;
>  }
>
>  static void guc_submission_tasklet(unsigned long data)
>  {
>  	struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
> -	struct execlist_port *port = execlists->port;
> -	struct i915_request *rq;
> +	struct i915_request **port, *rq;
>  	unsigned long flags;
>
>  	spin_lock_irqsave(&engine->active.lock, flags);
>
> -	rq = port_request(port);
> -	while (rq && i915_request_completed(rq)) {
> -		trace_i915_request_out(rq);
> -		i915_request_put(rq);
> +	for (port = execlists->inflight; (rq = *port); port++) {
> +		if (!i915_request_completed(rq))
> +			break;
>
> -		port = execlists_port_complete(execlists, port);
> -		if (port_isset(port)) {
> -			execlists_user_begin(execlists, port);
> -			rq = port_request(port);
> -		} else {
> -			execlists_user_end(execlists);
> -			rq = NULL;
> -		}
> +		schedule_out(rq);
> +	}
> +	if (port != execlists->inflight) {
> +		int idx = port - execlists->inflight;
> +		int rem = ARRAY_SIZE(execlists->inflight) - idx;
> +		memmove(execlists->inflight, port, rem * sizeof(*port));
>  	}
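
So the tasklet now just pops completed requests off the head of the NULL-terminated inflight[] array and slides the survivors (terminator included) down with one memmove, rather than the old per-port execlists_port_complete() shuffle. A tiny standalone illustration of that pattern, with plain ints standing in for request pointers and completed() as a fake predicate (nothing i915-specific in the names):

#include <stdio.h>
#include <string.h>

#define NUM_PORTS 4

/* Stand-in for i915_request_completed(): odd values are "still running". */
static int completed(int rq) { return rq % 2 == 0; }

int main(void)
{
	/* NULL-terminated array of requests, oldest first (0 == empty slot) */
	int inflight[NUM_PORTS + 1] = { 2, 4, 5, 0 };
	int *port, rq;

	/* pop completed requests from the head ... */
	for (port = inflight; (rq = *port); port++) {
		if (!completed(rq))
			break;
		printf("retire %d\n", rq);
	}

	/* ... then slide the survivors and the terminator down */
	if (port != inflight) {
		int idx = port - inflight;
		int rem = NUM_PORTS + 1 - idx;

		memmove(inflight, port, rem * sizeof(*port));
	}

	printf("head is now %d\n", inflight[0]);
	return 0;
}

Net effect is the same as before, the head of the array is always the oldest still-running request, just without the per-slot bookkeeping.
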
>
> -	if (execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT) &&
> -	    intel_read_status_page(engine, I915_GEM_HWS_PREEMPT) ==
> +	if (intel_read_status_page(engine, I915_GEM_HWS_PREEMPT) ==
>  	    GUC_PREEMPT_FINISHED)
>  		complete_preempt_context(engine);
>
> -	if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
> -		guc_dequeue(engine);
> +	if (!intel_read_status_page(engine, I915_GEM_HWS_PREEMPT))
> +		__guc_dequeue(engine);
>
>  	spin_unlock_irqrestore(&engine->active.lock, flags);
>  }
> @@ -959,7 +939,6 @@ static void guc_cancel_requests(struct intel_engine_cs *engine)
>
>  	execlists->queue_priority_hint = INT_MIN;
>  	execlists->queue = RB_ROOT_CACHED;
> -	GEM_BUG_ON(port_isset(execlists->port));
>
>  	spin_unlock_irqrestore(&engine->active.lock, flags);
>  }
> @@ -1422,7 +1401,7 @@ int intel_guc_submission_enable(struct intel_guc *guc)
>  	 * and it is guaranteed that it will remove the work item from the
>  	 * queue before our request is completed.
>  	 */
> -	BUILD_BUG_ON(ARRAY_SIZE(engine->execlists.port) *
> +	BUILD_BUG_ON(ARRAY_SIZE(engine->execlists.inflight) *
>  		     sizeof(struct guc_wq_item) *
>  		     I915_NUM_ENGINES > GUC_WQ_SIZE);
>
> diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
> index 298bb7116c51..1a5b9e284ca9 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_request.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_request.c
> @@ -366,13 +366,15 @@ static int __igt_breadcrumbs_smoketest(void *arg)
>
>  		if (!wait_event_timeout(wait->wait,
>  					i915_sw_fence_done(wait),
> -					HZ / 2)) {
> +					5 * HZ)) {
>  			struct i915_request *rq = requests[count - 1];
>
> -			pr_err("waiting for %d fences (last %llx:%lld) on %s timed out!\n",
> -			       count,
> +			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
> +			       atomic_read(&wait->pending), count,
>  			       rq->fence.context, rq->fence.seqno,
>  			       t->engine->name);
> +			GEM_TRACE_DUMP();
> +
>  			i915_gem_set_wedged(t->engine->i915);
>  			GEM_BUG_ON(!i915_request_completed(rq));
>  			i915_sw_fence_wait(wait);
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx