This is a performance optimization that reduces the number of MMIO traps
during ELSP port writes (context submission) from 4 to 1. On context
submission, the elsp_data[4] values are cached in the shared page and
only the last port write is trapped to GVT to do the real context
submission. Use PVMMIO_ELSP_SUBMIT to control this level of pvmmio
optimization.

v0: RFC
v1: rebase
v2: added pv ops for pv context submission. To maximize code reuse,
    introduced 2 more ops (submit_ports & preempt_context) instead of
    1 op (set_default_submission) in the engine structure. Implemented
    the pv versions of submit_ports and preempt_context.

Cc: Zhenyu Wang <zhenyuw@xxxxxxxxxxxxxxx>
Cc: Zhi Wang <zhi.a.wang@xxxxxxxxx>
Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Joonas Lahtinen <joonas.lahtinen@xxxxxxxxxxxxxxx>
Cc: He, Min <min.he@xxxxxxxxx>
Cc: Jiang, Fei <fei.jiang@xxxxxxxxx>
Cc: Gong, Zhipeng <zhipeng.gong@xxxxxxxxx>
Cc: Yuan, Hang <hang.yuan@xxxxxxxxx>
Cc: Zhiyuan Lv <zhiyuan.lv@xxxxxxxxx>
Signed-off-by: Xiaolin Zhang <xiaolin.zhang@xxxxxxxxx>
---
 drivers/gpu/drm/i915/i915_vgpu.c        |  2 +
 drivers/gpu/drm/i915/intel_lrc.c        | 88 +++++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_ringbuffer.h |  3 ++
 3 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_vgpu.c b/drivers/gpu/drm/i915/i915_vgpu.c
index cb409d5..9870ea6 100644
--- a/drivers/gpu/drm/i915/i915_vgpu.c
+++ b/drivers/gpu/drm/i915/i915_vgpu.c
@@ -66,6 +66,8 @@ void i915_check_vgpu(struct drm_i915_private *dev_priv)
 
 	BUILD_BUG_ON(sizeof(struct vgt_if) != VGT_PVINFO_SIZE);
 
+	dev_priv->vgpu.pv_caps = PVMMIO_ELSP_SUBMIT;
+
 	magic = __raw_i915_read64(dev_priv, vgtif_reg(magic));
 	if (magic != VGT_MAGIC)
 		return;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 22b57b8..9e6ccf9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -460,6 +460,60 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
 	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
 }
 
+static void execlists_submit_ports_pv(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists *execlists = &engine->execlists;
+	struct execlist_port *port = execlists->port;
+	u32 __iomem *elsp =
+		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u32 *elsp_data;
+	unsigned int n;
+	u32 descs[4];
+	int i = 0;
+
+	/*
+	 * ELSQ note: the submit queue is not cleared after being submitted
+	 * to the HW so we need to make sure we always clean it up. This is
+	 * currently ensured by the fact that we always write the same number
+	 * of elsq entries, keep this in mind before changing the loop below.
+	 */
+	for (n = execlists_num_ports(execlists); n--; ) {
+		struct i915_request *rq;
+		unsigned int count;
+		u64 desc;
+
+		rq = port_unpack(&port[n], &count);
+		if (rq) {
+			GEM_BUG_ON(count > !n);
+			if (!count++)
+				execlists_context_schedule_in(rq);
+			port_set(&port[n], port_pack(rq, count));
+			desc = execlists_update_context(rq);
+		} else {
+			GEM_BUG_ON(!n);
+			desc = 0;
+		}
+		GEM_BUG_ON(i >= 4);
+		descs[i] = upper_32_bits(desc);
+		descs[i + 1] = lower_32_bits(desc);
+		i += 2;
+	}
+
+	spin_lock(&engine->i915->vgpu.shared_page_lock);
+	elsp_data = engine->i915->vgpu.shared_page->elsp_data;
+	*elsp_data = descs[0];
+	*(elsp_data + 1) = descs[1];
+	*(elsp_data + 2) = descs[2];
+	writel(descs[3], elsp);
+	spin_unlock(&engine->i915->vgpu.shared_page_lock);
+
+	/* we need to manually load the submit queue */
+	if (execlists->ctrl_reg)
+		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
+
+	execlists_clear_active(execlists, EXECLISTS_ACTIVE_HWACK);
+}
+
 static bool ctx_single_port_submission(const struct intel_context *ce)
 {
 	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
@@ -497,7 +551,6 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
 
 	GEM_BUG_ON(execlists->preempt_complete_status !=
 		   upper_32_bits(ce->lrc_desc));
-
 	/*
 	 * Switch to our empty preempt context so
 	 * the state of the GPU is known (idle).
@@ -516,6 +569,27 @@ static void inject_preempt_context(struct intel_engine_cs *engine)
 	execlists_set_active(execlists, EXECLISTS_ACTIVE_PREEMPT);
 }
 
+static void inject_preempt_context_pv(struct intel_engine_cs *engine)
+{
+	struct intel_engine_execlists *execlists = &engine->execlists;
+	struct intel_context *ce =
+		to_intel_context(engine->i915->preempt_context, engine);
+	u32 __iomem *elsp =
+		engine->i915->regs + i915_mmio_reg_offset(RING_ELSP(engine));
+	u32 *elsp_data;
+
+	GEM_BUG_ON(execlists->preempt_complete_status !=
+		   upper_32_bits(ce->lrc_desc));
+
+	spin_lock(&engine->i915->vgpu.shared_page_lock);
+	elsp_data = engine->i915->vgpu.shared_page->elsp_data;
+	*elsp_data = 0;
+	*(elsp_data + 1) = 0;
+	*(elsp_data + 2) = upper_32_bits(ce->lrc_desc);
+	writel(lower_32_bits(ce->lrc_desc), elsp);
+	spin_unlock(&engine->i915->vgpu.shared_page_lock);
+}
+
 static void complete_preempt_context(struct intel_engine_execlists *execlists)
 {
 	GEM_BUG_ON(!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT));
@@ -583,7 +657,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		return;
 
 	if (need_preempt(engine, last, execlists->queue_priority)) {
-		inject_preempt_context(engine);
+		engine->preempt_context(engine);
 		return;
 	}
 
@@ -705,7 +779,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 
 	if (submit) {
 		port_assign(port, last);
-		execlists_submit_ports(engine);
+		engine->submit_ports(engine);
 	}
 
 	/* We must always keep the beast fed if we have work piled up */
@@ -2134,6 +2208,14 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 
 	engine->reset.prepare = execlists_reset_prepare;
 
+	engine->preempt_context = inject_preempt_context;
+	engine->submit_ports = execlists_submit_ports;
+
+	if (PVMMIO_LEVEL_ENABLE(engine->i915, PVMMIO_ELSP_SUBMIT)) {
+		engine->preempt_context = inject_preempt_context_pv;
+		engine->submit_ports = execlists_submit_ports_pv;
+	}
+
 	engine->park = NULL;
 	engine->unpark = NULL;
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index f6ec48a..e9895bf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -523,6 +523,9 @@ struct intel_engine_cs {
 	void		(*irq_seqno_barrier)(struct intel_engine_cs *engine);
 	void		(*cleanup)(struct intel_engine_cs *engine);
 
+	void		(*preempt_context)(struct intel_engine_cs *engine);
+	void		(*submit_ports)(struct intel_engine_cs *engine);
+
 	/* GEN8 signal/wait table - never trust comments!
 	 *	  signal to	signal to	signal to	signal to	signal to
 	 *	    RCS		   VCS		   BCS		  VECS		 VCS2
-- 
2.7.4
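
PVMMIO_LEVEL_ENABLE(), vgpu.pv_caps and the shared_page/shared_page_lock
fields used above are introduced by earlier patches in this pvmmio series
and are not defined in this patch. As a minimal, illustrative sketch only
(the macro body below is assumed, not taken from the series), the
capability check gating the pv hooks could take a shape like:

	/*
	 * Illustrative sketch: enable a pvmmio optimization only when
	 * running as a vGPU guest and the corresponding capability bit is
	 * present in dev_priv->vgpu.pv_caps (populated in i915_check_vgpu()
	 * above).
	 */
	#define PVMMIO_LEVEL_ENABLE(dev_priv, level) \
		(intel_vgpu_active(dev_priv) && \
		 ((dev_priv)->vgpu.pv_caps & (level)))

With such a check, intel_execlists_set_default_submission() only installs
the pv submit/preempt callbacks when running as a vGPU guest with
PVMMIO_ELSP_SUBMIT negotiated; bare-metal keeps the existing paths.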