Quoting Tvrtko Ursulin (2020-01-27 15:52:51)
>
> On 26/01/2020 10:23, Chris Wilson wrote:
> > If we find ourselves waiting on a MI_SEMAPHORE_WAIT, either within the
> > user batch or in our own preamble, the engine raises a
> > GT_WAIT_ON_SEMAPHORE interrupt. We can unmask that interrupt and so
> > respond to a semaphore wait by yielding the timeslice, if we have
> > another context to yield to!
> >
> > The only real complication is that the interrupt is only generated for
> > the start of the semaphore wait, and is asynchronous to our
> > process_csb() -- that is, we may not have registered the timeslice before
> > we see the interrupt. To ensure we don't miss a potential semaphore
> > blocking forward progress (e.g. selftests/live_timeslice_preempt) we mark
> > the interrupt and apply it to the next timeslice regardless of whether it
> > was active at the time.
> >
> > v2: We use semaphores in preempt-to-busy, within the timeslicing
> > implementation itself! Ergo, when we do insert a preemption due to an
> > expired timeslice, the new context may start with the missed semaphore
> > flagged by the retired context and be yielded, ad infinitum. To avoid
> > this, read the context id at the time of the semaphore interrupt and
> > only yield if that context is still active.
> >
> > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
> > ---
> >  drivers/gpu/drm/i915/gt/intel_engine_types.h |  9 ++++++
> >  drivers/gpu/drm/i915/gt/intel_gt_irq.c       | 32 +++++++++++---------
> >  drivers/gpu/drm/i915/gt/intel_lrc.c          | 31 ++++++++++++++++---
> >  drivers/gpu/drm/i915/i915_reg.h              |  5 +++
> >  4 files changed, 59 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > index 92be41a6903c..58725024ffa4 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> > @@ -156,6 +156,15 @@ struct intel_engine_execlists {
> >  	 */
> >  	struct i915_priolist default_priolist;
> >
> > +	/**
> > +	 * @yield: CCID at the time of the last semaphore-wait interrupt.
> > +	 *
> > +	 * Instead of leaving a semaphore busy-spinning on an engine, we would
> > +	 * like to switch to another ready context, i.e. yielding the semaphore
> > +	 * timeslice.
> > +	 */
> > +	u32 yield;
> > +
> >  	/**
> >  	 * @no_priolist: priority lists disabled
> >  	 */
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> > index f796bdf1ed30..6ae64a224b02 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
> > @@ -24,6 +24,13 @@ cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> >  {
> >  	bool tasklet = false;
> >
> > +	if (iir & GT_WAIT_SEMAPHORE_INTERRUPT) {
> > +		WRITE_ONCE(engine->execlists.yield,
> > +			   ENGINE_READ_FW(engine, EXECLIST_CCID));
> > +		if (del_timer(&engine->execlists.timer))
> > +			tasklet = true;
>
> What if it fires before the timeslice timer has been set up, so we miss
> the yield?

We only set the timer after the HW ack, and we can legitimately hit a
semaphore in the user payload before we see the ack. That is aptly
demonstrated by live_timeslice_preempt.
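For anyone following along, here is a minimal standalone sketch of that
latching scheme. It is plain C rather than driver code, and every name in
it (engine_sim, semaphore_wait_irq, NO_YIELD) is invented for illustration:
the interrupt handler always records which context was caught waiting, and
only requests an immediate reschedule if a timeslice is already being
timed, so an interrupt that beats the timer setup is picked up at the next
timeslice decision instead of being lost.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NO_YIELD UINT32_MAX

struct engine_sim {
        uint32_t yield;   /* context id latched by the last semaphore-wait interrupt */
        bool timer_armed; /* the timeslice timer is only set up after the HW ack */
};

/*
 * Interrupt side: latch the waiter unconditionally, but only ask for an
 * immediate reschedule if a timeslice is already being timed.
 */
static bool semaphore_wait_irq(struct engine_sim *e, uint32_t hw_ccid)
{
        e->yield = hw_ccid;
        return e->timer_armed;
}

int main(void)
{
        struct engine_sim e = { .yield = NO_YIELD, .timer_armed = false };

        /* The interrupt arrives before the timer has been armed... */
        bool kick = semaphore_wait_irq(&e, 0x10);
        printf("kick=%d latched=%#x\n", kick, (unsigned)e.yield); /* no kick, event kept */

        /* ...whereas with an armed timer we would reschedule right away. */
        e.timer_armed = true;
        kick = semaphore_wait_irq(&e, 0x10);
        printf("kick=%d latched=%#x\n", kick, (unsigned)e.yield); /* kick=1 */

        return 0;
}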
> > +	}
> > +
> >  	if (iir & GT_CONTEXT_SWITCH_INTERRUPT)
> >  		tasklet = true;
> >
> > @@ -210,7 +217,10 @@ void gen11_gt_irq_reset(struct intel_gt *gt)
> >
> >  void gen11_gt_irq_postinstall(struct intel_gt *gt)
> >  {
> > -	const u32 irqs = GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT;
> > +	const u32 irqs =
> > +		GT_RENDER_USER_INTERRUPT |
> > +		GT_CONTEXT_SWITCH_INTERRUPT |
> > +		GT_WAIT_SEMAPHORE_INTERRUPT;
> >  	struct intel_uncore *uncore = gt->uncore;
> >  	const u32 dmask = irqs << 16 | irqs;
> >  	const u32 smask = irqs << 16;
> > @@ -357,21 +367,15 @@ void gen8_gt_irq_postinstall(struct intel_gt *gt)
> >  	struct intel_uncore *uncore = gt->uncore;
> >
> >  	/* These are interrupts we'll toggle with the ring mask register */
> > +	const u32 irqs =
> > +		GT_RENDER_USER_INTERRUPT |
> > +		GT_CONTEXT_SWITCH_INTERRUPT |
> > +		GT_WAIT_SEMAPHORE_INTERRUPT;
> >  	u32 gt_interrupts[] = {
> > -		(GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
> > -		 GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT |
> > -		 GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT |
> > -		 GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT),
> > -
> > -		(GT_RENDER_USER_INTERRUPT << GEN8_VCS0_IRQ_SHIFT |
> > -		 GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS0_IRQ_SHIFT |
> > -		 GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT |
> > -		 GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT),
> > -
> > +		irqs << GEN8_RCS_IRQ_SHIFT | irqs << GEN8_BCS_IRQ_SHIFT,
> > +		irqs << GEN8_VCS0_IRQ_SHIFT | irqs << GEN8_VCS1_IRQ_SHIFT,
> >  		0,
> > -
> > -		(GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT |
> > -		 GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT)
> > +		irqs << GEN8_VECS_IRQ_SHIFT,
> >  	};
> >
> >  	gt->pm_ier = 0x0;
> > diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > index a13a8c4b65ab..6ba5a634c6e3 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> > @@ -1661,7 +1661,8 @@ static void defer_active(struct intel_engine_cs *engine)
> >  }
> >
> >  static bool
> > -need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
> > +need_timeslice(const struct intel_engine_cs *engine,
> > +	       const struct i915_request *rq)
>
> Naughty. Adding a const!

I'd go with mere cheek rather than naughty.

> >  {
> >  	int hint;
> >
> > @@ -1677,6 +1678,27 @@ need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
> >  	return hint >= effective_prio(rq);
> >  }
> >
> > +static bool
> > +timeslice_expired(const struct intel_engine_cs *engine,
> > +		  const struct i915_request *rq)
> > +{
> > +	const struct intel_engine_execlists *el = &engine->execlists;
> > +
> > +	return (timer_expired(&el->timer) ||
> > +		/*
> > +		 * Once bitten, forever smitten!
> > +		 *
> > +		 * If the active context ever busy-waited on a semaphore,
> > +		 * it will be treated as a hog until the end of its timeslice.
> > +		 * The HW only sends an interrupt on the first miss, and we
>
> One interrupt per elsp submission? Then on re-submission it can repeat?
> I hope so, because we recycle lrc_desc aggressively, so I am wondering
> about that.

That appears so. While waiting on a semaphore, the interrupt only fires
once, not for each poll. Hmm, I shall have to check, just in case each
semaphore causes a new interrupt (and we may have just got lucky due to
the effect of using edge interrupts).

> > +		 * do know if that semaphore has been signaled, or even if it
> > +		 * is now stuck on another semaphore. Play safe, yield if it
> > +		 * might be stuck -- it will be given a fresh timeslice in
> > +		 * the near future.
> > +		 */
> > +		upper_32_bits(rq->context->lrc_desc) == READ_ONCE(el->yield));
> > +}
> > +
> >  static int
> >  switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
> >  {
> > @@ -1692,8 +1714,7 @@ timeslice(const struct intel_engine_cs *engine)
> >  	return READ_ONCE(engine->props.timeslice_duration_ms);
> >  }
> >
> > -static unsigned long
> > -active_timeslice(const struct intel_engine_cs *engine)
> > +static unsigned long active_timeslice(const struct intel_engine_cs *engine)
> >  {
> >  	const struct i915_request *rq = *engine->execlists.active;
> >
> > @@ -1844,7 +1865,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >  			last->context->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> >  			last = NULL;
> >  		} else if (need_timeslice(engine, last) &&
> > -			   timer_expired(&engine->execlists.timer)) {
> > +			   timeslice_expired(engine, last)) {
> >  			ENGINE_TRACE(engine,
> >  				     "expired last=%llx:%lld, prio=%d, hint=%d\n",
>
> It could be useful to move the tracing message into timeslice_expired and
> distinguish between the two cases.

Hmm. Or just add the flag here as well; I will see which looks better.

> >  				     last->fence.context,
> > @@ -2110,6 +2131,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >  		}
> >  		clear_ports(port + 1, last_port - port);
> >
> > +		WRITE_ONCE(execlists->yield, -1);
> >  		execlists_submit_ports(engine);
> >  		set_preempt_timeout(engine);
> >  	} else {
> > @@ -4290,6 +4312,7 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
> >
> >  	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
> >  	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
> > +	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
> >  }
> >
> >  static void rcs_submission_override(struct intel_engine_cs *engine)
> > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> > index b93c4c18f05c..535ce7e0dc94 100644
> > --- a/drivers/gpu/drm/i915/i915_reg.h
> > +++ b/drivers/gpu/drm/i915/i915_reg.h
> > @@ -3085,6 +3085,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> >  #define GT_BSD_CS_ERROR_INTERRUPT	(1 << 15)
> >  #define GT_BSD_USER_INTERRUPT		(1 << 12)
> >  #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1	(1 << 11) /* hsw+; rsvd on snb, ivb, vlv */
> > +#define GT_WAIT_SEMAPHORE_INTERRUPT	REG_BIT(11) /* bdw+ */
> >  #define GT_CONTEXT_SWITCH_INTERRUPT	(1 << 8)
> >  #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT	(1 << 5) /* !snb */
> >  #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT	(1 << 4)
> > @@ -4036,6 +4037,10 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
> >  #define   CCID_EN			BIT(0)
> >  #define   CCID_EXTENDED_STATE_RESTORE	BIT(2)
> >  #define   CCID_EXTENDED_STATE_SAVE	BIT(3)
> > +
> > +#define EXECLIST_STATUS(base)	_MMIO((base) + 0x234)
>
> This one is unused.

Your eyes must be deceived, for I foresaw that complaint and added it to
intel_engine_dump :-p
-Chris
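As a companion to the earlier sketch, here is a rough illustration of the
consumer side and of the v2 fix. Again it is plain C with invented names
(sched_sim, submit), not the driver code: the expiry check only honours the
latched id if it still names the active context, and the latch is cleared
on each new submission so a semaphore hit by a retired context cannot make
its successor yield ad infinitum.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NO_YIELD UINT32_MAX

struct sched_sim {
        uint32_t active_ccid; /* context currently running on the engine */
        uint32_t yield;       /* latched by the semaphore-wait interrupt */
        bool timer_expired;   /* ordinary timeslice expiry */
};

/*
 * Yield if the slice ran out, or if the *active* context is the one that
 * was caught busy-waiting.  A latch left behind by a retired context names
 * a different id and is therefore ignored.
 */
static bool timeslice_expired(const struct sched_sim *s)
{
        return s->timer_expired || s->active_ccid == s->yield;
}

/* Clear the latch whenever new ports are submitted. */
static void submit(struct sched_sim *s, uint32_t ccid)
{
        s->yield = NO_YIELD;
        s->active_ccid = ccid;
        s->timer_expired = false;
}

int main(void)
{
        struct sched_sim s = { 0 };

        submit(&s, 0x10);
        s.yield = 0x10; /* context 0x10 hit a semaphore: yield it */
        printf("old context yields: %d\n", timeslice_expired(&s));

        submit(&s, 0x20); /* new submission clears the stale latch */
        printf("new context yields: %d\n", timeslice_expired(&s));

        return 0;
}

The reset in submit() plays the same role as clearing the yield value just
before the ports are submitted in the patch above: without it, the stale
latch from the previous context could match again after a resubmission.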