Quoting Tvrtko Ursulin (2019-02-04 10:06:35)
>
> On 04/02/2019 08:41, Chris Wilson wrote:
> > WAIT is occasionally suppressed by virtue of preempted requests being
> > promoted to NEWCLIENT iff they have not already received that boost.
>
> s/iff/if/

iff == if, and only if. But it was probably a typo.

> > Make this consistent for all WAIT boosts: they are not allowed to
> > preempt executing contexts and are merely granted the right to be at
> > the front of the queue for the next execution slot. This is in keeping
> > with the desire that the WAIT boost be a minor tweak that does not give
> > excessive promotion to its user and open ourselves to trivial abuse.
> >
> > The problem with the inconsistent WAIT preemption becomes more apparent
> > as the preemption is propagated across the engines, where one engine
> > may preempt and the other not, and we end up relying on the exact
> > execution order being consistent across engines (e.g. using HW
> > semaphores to coordinate parallel execution).
> >
> > v2: Also protect GuC submission from false preemption loops.
> >
> > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
> > ---
> >  drivers/gpu/drm/i915/i915_request.c          |  11 ++
> >  drivers/gpu/drm/i915/i915_scheduler.h        |   2 +
> >  drivers/gpu/drm/i915/intel_guc_submission.c  |   2 +-
> >  drivers/gpu/drm/i915/intel_lrc.c             |   9 +-
> >  drivers/gpu/drm/i915/selftests/igt_spinner.c |   9 +-
> >  drivers/gpu/drm/i915/selftests/intel_lrc.c   | 156 +++++++++++++++++++
> >  6 files changed, 186 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> > index 9ed5baf157a3..7968875d0bed 100644
> > --- a/drivers/gpu/drm/i915/i915_request.c
> > +++ b/drivers/gpu/drm/i915/i915_request.c
> > @@ -377,12 +377,23 @@ void __i915_request_submit(struct i915_request *request)
> >
> >  	/* We may be recursing from the signal callback of another i915 fence */
> >  	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
> > +
> >  	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
> >  	set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
> > +
> >  	request->global_seqno = seqno;
> >  	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
> >  	    !i915_request_enable_breadcrumb(request))
> >  		intel_engine_queue_breadcrumbs(engine);
> > +
> > +	/*
> > +	 * As we do not allow WAIT to preempt inflight requests,
> > +	 * once we have executed a request, along with triggering
> > +	 * any execution callbacks, we must preserve its ordering
> > +	 * within the non-preemptible FIFO.
> > +	 */
> > +	request->sched.attr.priority |= __NO_PREEMPTION;
> > +
> >  	spin_unlock(&request->lock);
> >
> >  	engine->emit_fini_breadcrumb(request,
> > diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> > index dbe9cb7ecd82..54bd6c89817e 100644
> > --- a/drivers/gpu/drm/i915/i915_scheduler.h
> > +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> > @@ -33,6 +33,8 @@ enum {
> >  #define I915_PRIORITY_WAIT		((u8)BIT(0))
> >  #define I915_PRIORITY_NEWCLIENT	((u8)BIT(1))
> >
> > +#define __NO_PREEMPTION (I915_PRIORITY_WAIT)
> > +
> >  struct i915_sched_attr {
> >  	/**
> >  	 * @priority: execution and service priority
> > diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> > index 8bc8aa54aa35..94695eb819c2 100644
> > --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> > @@ -720,7 +720,7 @@ static inline int rq_prio(const struct i915_request *rq)
> >
> >  static inline int port_prio(const struct execlist_port *port)
> >  {
> > -	return rq_prio(port_request(port));
> > +	return rq_prio(port_request(port)) | __NO_PREEMPTION;
> >  }
> >
> >  static bool __guc_dequeue(struct intel_engine_cs *engine)
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index a9eb0211ce77..773df0bd685b 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -188,6 +188,12 @@ static inline int rq_prio(const struct i915_request *rq)
> >  	return rq->sched.attr.priority;
> >  }
> >
> > +static int effective_prio(const struct i915_request *rq)
> > +{
> > +	/* Restrict mere WAIT boosts from triggering preemption */
> > +	return rq_prio(rq) | __NO_PREEMPTION;
> > +}
>
> I suggest adding i915_request_effective_prio to i915_request.h - it is
> verbose but avoids two implementations.

Too verbose... And it may differ depending on backend details... We
don't even need to OR in no-preemption until later...

> BUILD_BUG_ON(hweight32(__NO_PREEMPTION) != 1); as well? To ensure it is
> defined to internal priority levels.

You mean __NO_PREEMPTION & ~I915_PRIORITY_MASK?

> >  static int queue_prio(const struct intel_engine_execlists *execlists)
> >  {
> >  	struct i915_priolist *p;
> > @@ -208,7 +214,7 @@ static int queue_prio(const struct intel_engine_execlists *execlists)
> >  static inline bool need_preempt(const struct intel_engine_cs *engine,
> >  				const struct i915_request *rq)
> >  {
> > -	const int last_prio = rq_prio(rq);
> > +	int last_prio;
> >
> >  	if (!intel_engine_has_preemption(engine))
> >  		return false;
> > @@ -228,6 +234,7 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
> >  	 * preempt. If that hint is stale or we may be trying to preempt
> >  	 * ourselves, ignore the request.
> >  	 */
> > +	last_prio = effective_prio(rq);
>
> Isn't this redundant? Every submitted request already had
> __NO_PREEMPTION applied in __i915_request_submit.

But not in the next patch which expands upon this.
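For reference, the whole suppression hinges on the strict greater-than
in __execlists_need_preempt() (paraphrasing the helper from
intel_ringbuffer.h here, so treat this as a sketch):

	static inline bool
	__execlists_need_preempt(int prio, int last)
	{
		return prio > last;
	}

	/*
	 * A waiter bumps queue_priority_hint to prio | I915_PRIORITY_WAIT,
	 * but the inflight request now reports prio | __NO_PREEMPTION,
	 * i.e. the same value at equal base priority, so hint > last is
	 * false and no preempt-to-idle cycle is generated for a mere
	 * WAIT boost.
	 */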
> >  	if (!__execlists_need_preempt(engine->execlists.queue_priority_hint,
> >  				      last_prio))
> >  		return false;
> > diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
> > index 9ebd9225684e..86354e51bdd3 100644
> > --- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
> > +++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
> > @@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin,
> >  	*batch++ = upper_32_bits(vma->node.start);
> >  	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
> >
> > -	i915_gem_chipset_flush(spin->i915);
> > +	if (engine->emit_init_breadcrumb &&
> > +	    rq->timeline->has_initial_breadcrumb) {
> > +		err = engine->emit_init_breadcrumb(rq);
> > +		if (err)
> > +			goto cancel_rq;
> > +	}
> >
> >  	err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
> >
> > +	i915_gem_chipset_flush(spin->i915);
> > +
> >  cancel_rq:
> >  	if (err) {
> >  		i915_request_skip(rq, err);
> > diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> > index fb35f53c9ce3..86fd4589f5f0 100644
> > --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> > @@ -405,6 +405,161 @@ static int live_suppress_self_preempt(void *arg)
> >  	goto err_client_b;
> >  }
> >
> > +static int __i915_sw_fence_call
> > +dummy_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> > +{
> > +	return NOTIFY_DONE;
> > +}
> > +
> > +static struct i915_request *dummy_request(struct intel_engine_cs *engine)
> > +{
> > +	struct i915_request *rq;
> > +
> > +	rq = kmalloc(sizeof(*rq), GFP_KERNEL | __GFP_ZERO);
> > +	if (!rq)
> > +		return NULL;
> > +
> > +	INIT_LIST_HEAD(&rq->active_list);
> > +	rq->engine = engine;
> > +
> > +	i915_sched_node_init(&rq->sched);
> > +
> > +	/* mark this request as permanently incomplete */
> > +	rq->fence.seqno = 1;
> > +	rq->hwsp_seqno = (u32 *)&rq->fence.seqno + 1;
>
> Hackery level 10 unlocked! :)
>
> Add to comment "..by pointing the hwsp_seqno to high (unused) 32-bits
> of seqno".

I put the why... How that is actually accomplished is mere
implementation, and you can read the code. :-p

> Also sounds like a good idea to add
> BUILD_BUG_ON(typeof(req->fence.seqno) == typeof(u64))?

> > +
> > +	i915_sw_fence_init(&rq->submit, dummy_notify);
> > +	i915_sw_fence_commit(&rq->submit);
> > +
> > +	return rq;
> > +}
> > +
> > +static void dummy_request_free(struct i915_request *dummy)
> > +{
> > +	i915_request_mark_complete(dummy);
> > +	i915_sched_node_fini(dummy->engine->i915, &dummy->sched);
> > +	kfree(dummy);
> > +}
> > +
> > +static int live_suppress_wait_preempt(void *arg)
> > +{
> > +	struct drm_i915_private *i915 = arg;
> > +	struct preempt_client client[4];
> > +	struct intel_engine_cs *engine;
> > +	enum intel_engine_id id;
> > +	intel_wakeref_t wakeref;
> > +	int err = -ENOMEM;
> > +	int i;
> > +
> > +	/*
> > +	 * Waiters are given a little priority nudge, but not enough
> > +	 * to actually cause any preemption. Double check that we do
> > +	 * not needlessly generate preempt-to-idle cycles.
> > +	 */
> > +
> > +	if (!HAS_LOGICAL_RING_PREEMPTION(i915))
> > +		return 0;
> > +
> > +	mutex_lock(&i915->drm.struct_mutex);
> > +	wakeref = intel_runtime_pm_get(i915);
> > +
> > +	if (preempt_client_init(i915, &client[0])) /* ELSP[0] */
> > +		goto err_unlock;
> > +	if (preempt_client_init(i915, &client[1])) /* ELSP[1] */
> > +		goto err_client_0;
> > +	if (preempt_client_init(i915, &client[2])) /* head of queue */
> > +		goto err_client_1;
> > +	if (preempt_client_init(i915, &client[3])) /* bystander */
> > +		goto err_client_2;
> > +
> > +	for_each_engine(engine, i915, id) {
> > +		int depth;
> > +
> > +		if (!engine->emit_init_breadcrumb)
> > +			continue;
> > +
> > +		for (depth = 0; depth < ARRAY_SIZE(client); depth++) {
> > +			struct i915_request *rq[ARRAY_SIZE(client)];
> > +			struct i915_request *dummy;
> > +
> > +			engine->execlists.preempt_hang.count = 0;
> > +
> > +			dummy = dummy_request(engine);
> > +			if (!dummy)
> > +				goto err_client_3;
> > +
> > +			for (i = 0; i < ARRAY_SIZE(client); i++) {
> > +				rq[i] = igt_spinner_create_request(&client[i].spin,
> > +								   client[i].ctx, engine,
> > +								   MI_NOOP);
> > +				if (IS_ERR(rq[i])) {
> > +					err = PTR_ERR(rq[i]);
> > +					goto err_wedged;
> > +				}
> > +
> > +				/* Disable NEWCLIENT promotion */
> > +				i915_gem_active_set(&rq[i]->timeline->last_request,
> > +						    dummy);
> > +				i915_request_add(rq[i]);
> > +			}
> > +
> > +			dummy_request_free(dummy);
> > +
> > +			GEM_BUG_ON(i915_request_completed(rq[0]));
> > +			if (!igt_wait_for_spinner(&client[0].spin, rq[0])) {
> > +				pr_err("First client failed to start\n");
> > +				goto err_wedged;
> > +			}
> > +			GEM_BUG_ON(!i915_request_started(rq[0]));
> > +
> > +			if (i915_request_wait(rq[depth],
> > +					      I915_WAIT_LOCKED |
> > +					      I915_WAIT_PRIORITY,
> > +					      1) != -ETIME) {
> > +				pr_err("Waiter depth:%d completed!\n", depth);
> > +				goto err_wedged;
> > +			}
> > +
> > +			for (i = 0; i < ARRAY_SIZE(client); i++)
> > +				igt_spinner_end(&client[i].spin);
> > +
> > +			if (igt_flush_test(i915, I915_WAIT_LOCKED))
> > +				goto err_wedged;
> > +
> > +			if (engine->execlists.preempt_hang.count) {
> > +				pr_err("Preemption recorded x%d, depth %d; should have been suppressed!\n",
> > +				       engine->execlists.preempt_hang.count,
> > +				       depth);
>
> Worth logging engine names with all the above error messages?

I honestly doubt we need per-engine since this is just about observing
SW. But as we are exercising across engines, it may as well include
that information.
-Chris
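P.S. For the engine names, I mean something along the lines of (a
sketch only, the final wording for v3 may differ):

	pr_err("%s: Preemption recorded x%d, depth %d; should have been suppressed!\n",
	       engine->name,
	       engine->execlists.preempt_hang.count, depth);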