Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> writes: > The engine provides a mirror of the CSB in the HWSP. If we use the > cacheable reads from the HWSP, we can shave off a few mmio reads per > context-switch interrupt (which are quite frequent!). Just removing a > couple of mmio reads is not enough to actually reduce any latency, but it does give a small > reduction in overall cpu usage. > > Much appreciation for Ben dropping the bombshell that the CSB was in the > HWSP and for Michel in digging out the details. > > v2: Don't be lazy, add the defines for the indices. > v3: Include the HWSP in debugfs/i915_engine_info > v4: Check for GVT-g, it currently depends on intercepting CSB mmio > v5: Fixup GVT-g mmio path > v6: Disable HWSP if VT-d is active as the iommu adds unpredictable > memory latency. (Mika) > > Suggested-by: Ben Widawsky <benjamin.widawsky@xxxxxxxxx> > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> > Cc: Michel Thierry <michel.thierry@xxxxxxxxx> > Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx> > Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx> > Cc: Zhenyu Wang <zhenyuw@xxxxxxxxxxxxxxx> > Cc: Zhi Wang <zhi.a.wang@xxxxxxxxx> > Acked-by: Michel Thierry <michel.thierry@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 7 +++++-- > drivers/gpu/drm/i915/intel_lrc.c | 34 ++++++++++++++++++++++++++++----- > drivers/gpu/drm/i915/intel_ringbuffer.h | 3 +++ > 3 files changed, 37 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 6338018f655d..7062cde94a49 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -3315,6 +3315,7 @@ static int i915_engine_info(struct seq_file *m, void *unused) > upper_32_bits(addr), lower_32_bits(addr)); > > if (i915.enable_execlists) { > + const u32 *hws = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX]; > u32 ptr, read, write; > unsigned int idx; > > @@ 
-3337,10 +3338,12 @@ static int i915_engine_info(struct seq_file *m, void *unused) > write += GEN8_CSB_ENTRIES; > while (read < write) { > idx = ++read % GEN8_CSB_ENTRIES; > - seq_printf(m, "\tExeclist CSB[%d]: 0x%08x, context: %d\n", > + seq_printf(m, "\tExeclist CSB[%d]: 0x%08x [0x%08x in hwsp], context: %d [%d in hwsp]\n", > idx, > I915_READ(RING_CONTEXT_STATUS_BUF_LO(engine, idx)), > - I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx))); > + hws[idx * 2], > + I915_READ(RING_CONTEXT_STATUS_BUF_HI(engine, idx)), > + hws[idx * 2 + 1]); > } > > rcu_read_lock(); > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > index aa5534213190..8e4b21a18554 100644 > --- a/drivers/gpu/drm/i915/intel_lrc.c > +++ b/drivers/gpu/drm/i915/intel_lrc.c > @@ -547,10 +547,17 @@ static void intel_lrc_irq_handler(unsigned long data) > while (test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted)) { > u32 __iomem *csb_mmio = > dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine)); > - u32 __iomem *buf = > - dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0)); > + /* The HWSP contains a (cacheable) mirror of the CSB */ > + const u32 *buf = > + &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX]; > unsigned int head, tail; > > + /* However GVT emulation depends upon intercepting CSB mmio */ > + if (unlikely(engine->csb_use_mmio)) { > + buf = (u32 * __force) > + (dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0))); > + } > + > /* The write will be ordered by the uncached read (itself > * a memory barrier), so we do not need another in the form > * of a locked instruction. The race between the interrupt > @@ -590,13 +597,12 @@ static void intel_lrc_irq_handler(unsigned long data) > * status notifier. 
> */ > > - status = readl(buf + 2 * head); > + status = buf[2 * head]; > if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK)) > continue; > > /* Check the context/desc id for this event matches */ > - GEM_DEBUG_BUG_ON(readl(buf + 2 * head + 1) != > - port->context_id); > + GEM_DEBUG_BUG_ON(buf[2 * head + 1] != port->context_id); > > rq = port_unpack(port, &count); > GEM_BUG_ON(count == 0); > @@ -1726,6 +1732,22 @@ logical_ring_default_irqs(struct intel_engine_cs *engine) > engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; > } > > +static bool irq_handler_force_mmio(struct drm_i915_private *i915) > +{ > + /* GVT emulation depends upon intercepting CSB mmio */ > + if (intel_vgpu_active(i915)) > + return false; > + > + /* > + * IOMMU adds unpredictable latency causing the CSB write to only > + * be visible after the interrupt (missed breadcrumb syndrome). > + */ > + if (intel_vtd_active()) > + return false; I don't know if it is worthwhile to check if this restriction could be lifted on some gens. Perhaps for future work. But with this in place now, Reviewed-by: Mika Kuoppala <mika.kuoppala@xxxxxxxxx> > + > + return true; > +} > + > static void > logical_ring_setup(struct intel_engine_cs *engine) > { > @@ -1737,6 +1759,8 @@ logical_ring_setup(struct intel_engine_cs *engine) > /* Intentionally left blank. */ > engine->buffer = NULL; > > + engine->csb_use_mmio = irq_handler_force_mmio(dev_priv); > + > fw_domains = intel_uncore_forcewake_for_reg(dev_priv, > RING_ELSP(engine), > FW_REG_WRITE); > diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h > index 79c0021f3700..5c055b62966d 100644 > --- a/drivers/gpu/drm/i915/intel_ringbuffer.h > +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h > @@ -391,6 +391,7 @@ struct intel_engine_cs { > struct rb_root execlist_queue; > struct rb_node *execlist_first; > unsigned int fw_domains; > + bool csb_use_mmio; > > /* Contexts are pinned whilst they are active on the GPU. 
The last > * context executed remains active whilst the GPU is idle - the > @@ -496,6 +497,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value) > #define I915_GEM_HWS_SCRATCH_INDEX 0x40 > #define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT) > > +#define I915_HWS_CSB_BUF0_INDEX 0x10 > + > struct intel_ring * > intel_engine_create_ring(struct intel_engine_cs *engine, int size); > int intel_ring_pin(struct intel_ring *ring, > -- > 2.14.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx