Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxx>
---
drivers/gpu/drm/i915/Makefile | 3 +
drivers/gpu/drm/i915/gem/i915_gem_context.c | 1 +
drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 340 ++++
drivers/gpu/drm/i915/gt/gen8_engine_cs.h | 27 +
drivers/gpu/drm/i915/gt/intel_engine_pool.c | 1 +
.../drm/i915/gt/intel_engine_workaround_bb.c | 335 ++++
.../drm/i915/gt/intel_execlists_submission.c | 1552 +----------------
drivers/gpu/drm/i915/gt/intel_lrc.c | 830 +++++++++
drivers/gpu/drm/i915/gt/intel_lrc.h | 32 +-
drivers/gpu/drm/i915/gt/selftest_execlists.c | 498 +-----
drivers/gpu/drm/i915/gt/selftest_lrc.c | 544 ++++++
drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c | 1 +
.../gpu/drm/i915/gt/uc/intel_guc_submission.c | 36 +-
drivers/gpu/drm/i915/gvt/mmio_context.c | 1 +
drivers/gpu/drm/i915/gvt/scheduler.c | 1 +
drivers/gpu/drm/i915/i915_drv.h | 1 -
drivers/gpu/drm/i915/i915_perf.c | 1 +
17 files changed, 2178 insertions(+), 2026 deletions(-)
create mode 100644 drivers/gpu/drm/i915/gt/gen8_engine_cs.c
create mode 100644 drivers/gpu/drm/i915/gt/gen8_engine_cs.h
create mode 100644 drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c
create mode 100644 drivers/gpu/drm/i915/gt/intel_lrc.c
create mode 100644 drivers/gpu/drm/i915/gt/selftest_lrc.c
diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 58b0e86c71f7..5f45379f5ba1 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -78,6 +78,7 @@ gt-y += \
gt/debugfs_gt.o \
gt/debugfs_gt_pm.o \
gt/gen6_ppgtt.o \
+ gt/gen8_engine_cs.o \
gt/gen8_ppgtt.o \
gt/intel_breadcrumbs.o \
gt/intel_context.o \
@@ -86,6 +87,7 @@ gt-y += \
gt/intel_engine_pm.o \
gt/intel_engine_pool.o \
gt/intel_engine_user.o \
+ gt/intel_engine_workaround_bb.o \
gt/intel_execlists_submission.o \
gt/intel_ggtt.o \
gt/intel_gt.o \
@@ -95,6 +97,7 @@ gt-y += \
gt/intel_gt_requests.o \
gt/intel_gtt.o \
gt/intel_llc.o \
+ gt/intel_lrc.o \
gt/intel_mocs.o \
gt/intel_ppgtt.o \
gt/intel_rc6.o \
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index da1ba8feeedc..f22a2db8758e 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -74,6 +74,7 @@
#include "gt/intel_engine_heartbeat.h"
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
+#include "gt/intel_lrc.h"
#include "gt/intel_lrc_reg.h"
#include "gt/intel_ring.h"
diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
new file mode 100644
index 000000000000..86a8b328621d
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2014 Intel Corporation
+ */
+
+#include "gen8_engine_cs.h"
+#include "i915_drv.h"
+#include "intel_gpu_commands.h"
+#include "intel_ring.h"
+#include "intel_lrc.h"
+
+int gen8_emit_flush_xcs(struct i915_request *request, u32 mode)
+{
+ u32 cmd, *cs;
+
+ cs = intel_ring_begin(request, 4);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ cmd = MI_FLUSH_DW + 1;
+
+ /* We always require a command barrier so that subsequent
+ * commands, such as breadcrumb interrupts, are strictly ordered
+ * wrt the contents of the write cache being flushed to memory
+ * (and thus being coherent from the CPU).
+ */
+ cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
+
+ if (mode & EMIT_INVALIDATE) {
+ cmd |= MI_INVALIDATE_TLB;
+ if (request->engine->class == VIDEO_DECODE_CLASS)
+ cmd |= MI_INVALIDATE_BSD;
+ }
+
+ *cs++ = cmd;
+ *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
+ *cs++ = 0; /* upper addr */
+ *cs++ = 0; /* value */
+ intel_ring_advance(request, cs);
+
+ return 0;
+}
+
+int gen8_emit_flush_rcs(struct i915_request *request, u32 mode)
+{
+ u32 *cs, flags = 0;
+
+ flags |= PIPE_CONTROL_CS_STALL;
+
+ if (mode & EMIT_FLUSH) {
+ flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
+ flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+ flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
+ flags |= PIPE_CONTROL_FLUSH_ENABLE;
+ }
+
+ if (mode & EMIT_INVALIDATE) {
+ flags |= PIPE_CONTROL_TLB_INVALIDATE;
+ flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_QW_WRITE;
+ flags |= PIPE_CONTROL_STORE_DATA_INDEX;
+ }
+
+ cs = intel_ring_begin(request, 6);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
+
+ intel_ring_advance(request, cs);
+
+ return 0;
+}
+
+int gen9_emit_flush_rcs(struct i915_request *request, u32 mode)
+{
+ bool vf_flush_wa = false, dc_flush_wa = false;
+ u32 *cs, flags = 0;
+ int len = 6;
+
+ flags |= PIPE_CONTROL_CS_STALL;
+
+ if (mode & EMIT_FLUSH) {
+ flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
+ flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+ flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
+ flags |= PIPE_CONTROL_FLUSH_ENABLE;
+ }
+
+ if (mode & EMIT_INVALIDATE) {
+ flags |= PIPE_CONTROL_TLB_INVALIDATE;
+ flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_QW_WRITE;
+ flags |= PIPE_CONTROL_STORE_DATA_INDEX;
+
+ /*
+ * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
+ * pipe control.
+ */
+ vf_flush_wa = true;
+ len += 6;
+
+ /* WaForGAMHang:kbl */
+ if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0)) {
+ dc_flush_wa = true;
+ len += 12;
+ }
+ }
+
+ cs = intel_ring_begin(request, len);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ if (vf_flush_wa)
+ cs = gen8_emit_pipe_control(cs, 0, 0);
+
+ if (dc_flush_wa)
+ cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
+ 0);
+
+ cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
+
+ if (dc_flush_wa)
+ cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
+
+ intel_ring_advance(request, cs);
+
+ return 0;
+}
+
+int gen11_emit_flush_rcs(struct i915_request *request, u32 mode)
+{
+ if (mode & EMIT_FLUSH) {
+ u32 *cs;
+ u32 flags = 0;
+
+ flags |= PIPE_CONTROL_CS_STALL;
+
+ flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
+ flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
+ flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+ flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
+ flags |= PIPE_CONTROL_FLUSH_ENABLE;
+ flags |= PIPE_CONTROL_QW_WRITE;
+ flags |= PIPE_CONTROL_STORE_DATA_INDEX;
+
+ cs = intel_ring_begin(request, 6);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
+ intel_ring_advance(request, cs);
+ }
+
+ if (mode & EMIT_INVALIDATE) {
+ u32 *cs;
+ u32 flags = 0;
+
+ flags |= PIPE_CONTROL_CS_STALL;
+
+ flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_TLB_INVALIDATE;
+ flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_QW_WRITE;
+ flags |= PIPE_CONTROL_STORE_DATA_INDEX;
+
+ cs = intel_ring_begin(request, 6);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
+ intel_ring_advance(request, cs);
+ }
+
+ return 0;
+}
+
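+/*
+ * Assemble the MI_ARB_CHECK dword used to disable (true) or re-enable (false)
+ * the Gen12 pre-parser around TLB invalidations.
+ */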
+static u32 preparser_disable(bool state)
+{
+ return MI_ARB_CHECK | 1 << 8 | state;
+}
+
+int gen12_emit_flush_rcs(struct i915_request *request, u32 mode)
+{
+ if (mode & EMIT_FLUSH) {
+ u32 flags = 0;
+ u32 *cs;
+
+ flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
+ flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
+ flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
+ /* Wa_1409600907:tgl */
+ flags |= PIPE_CONTROL_DEPTH_STALL;
+ flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
+ flags |= PIPE_CONTROL_FLUSH_ENABLE;
+ flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
+
+ flags |= PIPE_CONTROL_STORE_DATA_INDEX;
+ flags |= PIPE_CONTROL_QW_WRITE;
+
+ flags |= PIPE_CONTROL_CS_STALL;
+
+ cs = intel_ring_begin(request, 6);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
+ intel_ring_advance(request, cs);
+ }
+
+ if (mode & EMIT_INVALIDATE) {
+ u32 flags = 0;
+ u32 *cs;
+
+ flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_TLB_INVALIDATE;
+ flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
+ flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
+
+ flags |= PIPE_CONTROL_STORE_DATA_INDEX;
+ flags |= PIPE_CONTROL_QW_WRITE;
+
+ flags |= PIPE_CONTROL_CS_STALL;
+
+ cs = intel_ring_begin(request, 8);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ /*
+ * Prevent the pre-parser from skipping past the TLB
+ * invalidate and loading a stale page for the batch
+ * buffer / request payload.
+ */
+ *cs++ = preparser_disable(true);
+
+ cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
+
+ *cs++ = preparser_disable(false);
+ intel_ring_advance(request, cs);
+
+ /*
+ * Wa_1604544889:tgl
+ */
+ if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
+ flags = 0;
+ flags |= PIPE_CONTROL_CS_STALL;
+ flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
+
+ flags |= PIPE_CONTROL_STORE_DATA_INDEX;
+ flags |= PIPE_CONTROL_QW_WRITE;
+
+ cs = intel_ring_begin(request, 6);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ cs = gen8_emit_pipe_control(cs, flags,
+ LRC_PPHWSP_SCRATCH_ADDR);
+ intel_ring_advance(request, cs);
+ }
+ }
+
+ return 0;
+}
+
+int gen8_emit_bb_start(struct i915_request *rq,
+ u64 offset, u32 len,
+ const unsigned int flags)
+{
+ u32 *cs;
+
+ cs = intel_ring_begin(rq, 6);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
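+ /*
+ * Enable arbitration while the user batch runs so that it can be
+ * preempted; it is disabled again below and only re-enabled when we
+ * close the request (engine->emit_fini_breadcrumb).
+ */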
+ *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+ *cs++ = MI_BATCH_BUFFER_START_GEN8 |
+ (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
+ *cs++ = lower_32_bits(offset);
+ *cs++ = upper_32_bits(offset);
+
+ *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+ *cs++ = MI_NOOP;
+
+ intel_ring_advance(rq, cs);
+
+ return 0;
+}
+
+int gen8_emit_bb_start_noarb(struct i915_request *rq,
+ u64 offset, u32 len,
+ const unsigned int flags)
+{
+ u32 *cs;
+
+ cs = intel_ring_begin(rq, 4);
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ /*
+ * WaDisableCtxRestoreArbitration:bdw,chv
+ *
+ * We don't need to perform MI_ARB_ENABLE as often as we do (in
+ * particular all the gen that do not need the w/a at all!), if we
+ * took care to make sure that on every switch into this context
+ * (both ordinary and for preemption) that arbitration was enabled
+ * we would be fine. However, for gen8 there is another w/a that
+ * requires us to not preempt inside GPGPU execution, so we keep
+ * arbitration disabled for gen8 batches. Arbitration will be
+ * re-enabled before we close the request
+ * (engine->emit_fini_breadcrumb).
+ */
+ *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* FIXME(BDW+): Address space and security selectors. */
+ *cs++ = MI_BATCH_BUFFER_START_GEN8 |
+ (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
+ *cs++ = lower_32_bits(offset);
+ *cs++ = upper_32_bits(offset);
+
+ intel_ring_advance(rq, cs);
+
+ return 0;
+}
diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.h b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
new file mode 100644
index 000000000000..ebca43a5e33c
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2014 Intel Corporation
+ */
+
+#ifndef __GEN8_ENGINE_CS_H__
+#define __GEN8_ENGINE_CS_H__
+
+#include <linux/types.h>
+
+struct i915_request;
+
+int gen8_emit_flush_xcs(struct i915_request *request, u32 mode);
+
+int gen8_emit_flush_rcs(struct i915_request *request, u32 mode);
+int gen9_emit_flush_rcs(struct i915_request *request, u32 mode);
+int gen11_emit_flush_rcs(struct i915_request *request, u32 mode);
+int gen12_emit_flush_rcs(struct i915_request *request, u32 mode);
+
+int gen8_emit_bb_start(struct i915_request *rq,
+ u64 offset, u32 len,
+ const unsigned int flags);
+int gen8_emit_bb_start_noarb(struct i915_request *rq,
+ u64 offset, u32 len,
+ const unsigned int flags);
+
+#endif /* __GEN8_ENGINE_CS_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pool.c b/drivers/gpu/drm/i915/gt/intel_engine_pool.c
index 397186818305..6257f2b17d33 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_pool.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_pool.c
@@ -9,6 +9,7 @@
#include "i915_drv.h"
#include "intel_engine_pm.h"
#include "intel_engine_pool.h"
+#include "intel_lrc.h"
static struct intel_engine_cs *to_engine(struct intel_engine_pool *pool)
{
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c b/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c
new file mode 100644
index 000000000000..27b3efab1128
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_engine_workaround_bb.c
@@ -0,0 +1,335 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2014 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "i915_reg.h"
+#include "intel_engine.h"
+#include "intel_gpu_commands.h"
+#include "intel_gt.h"
+#include "intel_lrc.h"
+
+/*
+ * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
+ * PIPE_CONTROL instruction. This is required for the flush to happen correctly
+ * but there is a slight complication as this is applied in WA batch where the
+ * values are only initialized once so we cannot take register value at the
+ * beginning and reuse it further; hence we save its value to memory, upload a
+ * constant value with bit21 set and then we restore it back with the saved value.
+ * To simplify the WA, a constant value is formed by using the default value
+ * of this register. This shouldn't be a problem because we are only modifying
+ * it for a short period and this batch is non-preemptible. We can of course
+ * use additional instructions that read the actual value of the register
+ * at that time and set our bit of interest but it makes the WA complicated.
+ *
+ * This WA is also required for Gen9 so extracting as a function avoids
+ * code duplication.
+ */
+static u32 *
+gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
+{
+ /* NB no one else is allowed to scribble over scratch + 256! */
+ *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = intel_gt_scratch_offset(engine->gt,
+ INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
+ *batch++ = 0;
+
+ *batch++ = MI_LOAD_REGISTER_IMM(1);
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
+
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_DC_FLUSH_ENABLE,
+ 0);
+
+ *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
+ *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
+ *batch++ = intel_gt_scratch_offset(engine->gt,
+ INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
+ *batch++ = 0;
+
+ return batch;
+}
+
+/*
+ * Typically we only have one indirect_ctx and per_ctx batch buffer which are
+ * initialized at the beginning and shared across all contexts but this field
+ * helps us to have multiple batches at different offsets and select them based
+ * on a criterion. At the moment this batch always starts at the beginning of the page
+ * and at this point we don't have multiple wa_ctx batch buffers.
+ *
+ * The number of WAs applied is not known at the beginning; we use this field
+ * to return the number of DWORDS written.
+ *
+ * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
+ * so it adds NOOPs as padding to make it cacheline aligned.
+ * MI_BATCH_BUFFER_END will be added to the per-ctx batch and both of them together
+ * make a complete batch buffer.
+ */
+static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ /* WaDisableCtxRestoreArbitration:bdw,chv */
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
+ if (IS_BROADWELL(engine->i915))
+ batch = gen8_emit_flush_coherentl3_wa(engine, batch);
+
+ /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
+ /* Actual scratch location is at 128 bytes offset */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_FLUSH_L3 |
+ PIPE_CONTROL_STORE_DATA_INDEX |
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_QW_WRITE,
+ LRC_PPHWSP_SCRATCH_ADDR);
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ /*
+ * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
+ * execution depends on the length specified in terms of cache lines
+ * in the register CTX_RCS_INDIRECT_CTX
+ */
+
+ return batch;
+}
+
+struct lri {
+ i915_reg_t reg;
+ u32 value;
+};
+
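+/* Emit a single MI_LOAD_REGISTER_IMM programming @count register/value pairs. */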
+static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
+{
+ GEM_BUG_ON(!count || count > 63);
+
+ *batch++ = MI_LOAD_REGISTER_IMM(count);
+ do {
+ *batch++ = i915_mmio_reg_offset(lri->reg);
+ *batch++ = lri->value;
+ } while (lri++, --count);
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ static const struct lri lri[] = {
+ /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
+ {
+ COMMON_SLICE_CHICKEN2,
+ __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
+ 0),
+ },
+
+ /* BSpec: 11391 */
+ {
+ FF_SLICE_CHICKEN,
+ __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
+ FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
+ },
+
+ /* BSpec: 11299 */
+ {
+ _3D_CHICKEN3,
+ __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
+ _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
+ }
+ };
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
+
+ /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
+ batch = gen8_emit_flush_coherentl3_wa(engine, batch);
+
+ /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_FLUSH_L3 |
+ PIPE_CONTROL_STORE_DATA_INDEX |
+ PIPE_CONTROL_CS_STALL |
+ PIPE_CONTROL_QW_WRITE,
+ LRC_PPHWSP_SCRATCH_ADDR);
+
+ batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
+
+ /* WaMediaPoolStateCmdInWABB:bxt,glk */
+ if (HAS_POOLED_EU(engine->i915)) {
+ /*
+ * EU pool configuration is set up along with the golden context
+ * during context initialization. This value depends on
+ * device type (2x6 or 3x6) and needs to be updated based
+ * on which subslice is disabled especially for 2x6
+ * devices, however it is safe to load default
+ * configuration of 3x6 device instead of masking off
+ * corresponding bits because HW ignores bits of a disabled
+ * subslice and drops down to appropriate config. Please
+ * see render_state_setup() in i915_gem_render_state.c for
+ * possible configurations, to avoid duplication they are
+ * not shown here again.
+ */
+ *batch++ = GEN9_MEDIA_POOL_STATE;
+ *batch++ = GEN9_MEDIA_POOL_ENABLE;
+ *batch++ = 0x00777000;
+ *batch++ = 0;
+ *batch++ = 0;
+ *batch++ = 0;
+ }
+
+ *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+static u32 *
+gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
+{
+ int i;
+
+ /*
+ * WaPipeControlBefore3DStateSamplePattern: cnl
+ *
+ * Ensure the engine is idle prior to programming a
+ * 3DSTATE_SAMPLE_PATTERN during a context restore.
+ */
+ batch = gen8_emit_pipe_control(batch,
+ PIPE_CONTROL_CS_STALL,
+ 0);
+ /*
+ * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
+ * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
+ * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
+ * confusing. Since gen8_emit_pipe_control() already advances the
+ * batch by 6 dwords, we advance the other 10 here, completing a
+ * cacheline. It's not clear if the workaround requires this padding
+ * before other commands, or if it's just the regular padding we would
+ * already have for the workaround bb, so leave it here for now.
+ */
+ for (i = 0; i < 10; i++)
+ *batch++ = MI_NOOP;
+
+ /* Pad to end of cacheline */
+ while ((unsigned long)batch % CACHELINE_BYTES)
+ *batch++ = MI_NOOP;
+
+ return batch;
+}
+
+#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
+
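+/*
+ * Allocate and pin a single page in the global GTT to back the per-engine
+ * workaround batch buffers.
+ */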
+static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
+{
+ struct drm_i915_gem_object *obj;
+ struct i915_vma *vma;
+ int err;
+
+ obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
+ if (IS_ERR(obj))
+ return PTR_ERR(obj);
+
+ vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
+ if (IS_ERR(vma)) {
+ err = PTR_ERR(vma);
+ goto err;
+ }
+
+ err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
+ if (err)
+ goto err;
+
+ engine->wa_ctx.vma = vma;
+ return 0;
+
+err:
+ i915_gem_object_put(obj);
+ return err;
+}
+
+typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
+
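+/*
+ * Build the indirect-ctx and per-ctx workaround batches for this engine,
+ * selecting the per-gen emitters and recording each batch's offset and size
+ * in engine->wa_ctx.
+ */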
+int intel_engine_init_workaround_bb(struct intel_engine_cs *engine)
+{
+ struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
+ struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
+ &wa_ctx->per_ctx };
+ wa_bb_func_t wa_bb_fn[2];
+ struct page *page;
+ void *batch, *batch_ptr;
+ unsigned int i;
+ int ret;
+
+ if (engine->class != RENDER_CLASS)
+ return 0;
+
+ switch (INTEL_GEN(engine->i915)) {
+ case 12:
+ case 11:
+ return 0;
+ case 10:
+ wa_bb_fn[0] = gen10_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ case 9:
+ wa_bb_fn[0] = gen9_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ case 8:
+ wa_bb_fn[0] = gen8_init_indirectctx_bb;
+ wa_bb_fn[1] = NULL;
+ break;
+ default:
+ MISSING_CASE(INTEL_GEN(engine->i915));
+ return 0;
+ }
+
+ ret = lrc_setup_wa_ctx(engine);
+ if (ret) {
+ DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
+ return ret;
+ }
+
+ page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
+ batch = batch_ptr = kmap_atomic(page);
+
+ /*
+ * Emit the two workaround batch buffers, recording the offset from the
+ * start of the workaround batch buffer object for each and their
+ * respective sizes.
+ */
+ for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
+ wa_bb[i]->offset = batch_ptr - batch;
+ if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
+ CACHELINE_BYTES))) {
+ ret = -EINVAL;
+ break;
+ }
+ if (wa_bb_fn[i])
+ batch_ptr = wa_bb_fn[i](engine, batch_ptr);
+ wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
+ }
+
+ GEM_BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
+
+ kunmap_atomic(batch);
+ if (ret)
+ intel_engine_fini_workaround_bb(engine);
+
+ return ret;
+}
+
+void intel_engine_fini_workaround_bb(struct intel_engine_cs *engine)
+{
+ i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
+}
diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index aa67561fee53..a2aa6c5cfbb8 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -1,147 +1,19 @@
+// SPDX-License-Identifier: MIT
/*
* Copyright © 2014 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- * Authors:
- * Ben Widawsky <ben@xxxxxxxxxxxx>
- * Michel Thierry <michel.thierry@xxxxxxxxx>
- * Thomas Daniel <thomas.daniel@xxxxxxxxx>
- * Oscar Mateo <oscar.mateo@xxxxxxxxx>
- *
*/
-/**
- * DOC: Logical Rings, Logical Ring Contexts and Execlists
- *
- * Motivation:
- * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
- * These expanded contexts enable a number of new abilities, especially
- * "Execlists" (also implemented in this file).
- *
- * One of the main differences with the legacy HW contexts is that logical
- * ring contexts incorporate many more things to the context's state, like
- * PDPs or ringbuffer control registers:
- *
- * The reason why PDPs are included in the context is straightforward: as
- * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
- * contained there mean you don't need to do a ppgtt->switch_mm yourself,
- * instead, the GPU will do it for you on the context switch.
- *
- * But, what about the ringbuffer control registers (head, tail, etc..)?
- * shouldn't we just need a set of those per engine command streamer? This is
- * where the name "Logical Rings" starts to make sense: by virtualizing the
- * rings, the engine cs shifts to a new "ring buffer" with every context
- * switch. When you want to submit a workload to the GPU you: A) choose your
- * context, B) find its appropriate virtualized ring, C) write commands to it
- * and then, finally, D) tell the GPU to switch to that context.
- *
- * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
- * to a contexts is via a context execution list, ergo "Execlists".
- *
- * LRC implementation:
- * Regarding the creation of contexts, we have:
- *
- * - One global default context.
- * - One local default context for each opened fd.
- * - One local extra context for each context create ioctl call.
- *
- * Now that ringbuffers belong per-context (and not per-engine, like before)
- * and that contexts are uniquely tied to a given engine (and not reusable,
- * like before) we need:
- *
- * - One ringbuffer per-engine inside each context.
- * - One backing object per-engine inside each context.
- *
- * The global default context starts its life with these new objects fully
- * allocated and populated. The local default context for each opened fd is
- * more complex, because we don't know at creation time which engine is going
- * to use them. To handle this, we have implemented a deferred creation of LR
- * contexts:
- *
- * The local context starts its life as a hollow or blank holder, that only
- * gets populated for a given engine once we receive an execbuffer. If later
- * on we receive another execbuffer ioctl for the same context but a different
- * engine, we allocate/populate a new ringbuffer and context backing object and
- * so on.
- *
- * Finally, regarding local contexts created using the ioctl call: as they are
- * only allowed with the render ring, we can allocate & populate them right
- * away (no need to defer anything, at least for now).
- *
- * Execlists implementation:
- * Execlists are the new method by which, on gen8+ hardware, workloads are
- * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
- * This method works as follows:
- *
- * When a request is committed, its commands (the BB start and any leading or
- * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
- * for the appropriate context. The tail pointer in the hardware context is not
- * updated at this time, but instead, kept by the driver in the ringbuffer
- * structure. A structure representing this request is added to a request queue
- * for the appropriate engine: this structure contains a copy of the context's
- * tail after the request was written to the ring buffer and a pointer to the
- * context itself.
- *
- * If the engine's request queue was empty before the request was added, the
- * queue is processed immediately. Otherwise the queue will be processed during
- * a context switch interrupt. In any case, elements on the queue will get sent
- * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
- * globally unique 20-bits submission ID.
- *
- * When execution of a request completes, the GPU updates the context status
- * buffer with a context complete event and generates a context switch interrupt.
- * During the interrupt handling, the driver examines the events in the buffer:
- * for each context complete event, if the announced ID matches that on the head
- * of the request queue, then that request is retired and removed from the queue.
- *
- * After processing, if any requests were retired and the queue is not empty
- * then a new execution list can be submitted. The two requests at the front of
- * the queue are next to be submitted but since a context may not occur twice in
- * an execution list, if subsequent requests have the same ID as the first then
- * the two requests must be combined. This is done simply by discarding requests
- * at the head of the queue until either only one requests is left (in which case
- * we use a NULL second context) or the first two requests have unique IDs.
- *
- * By always executing the first two requests in the queue the driver ensures
- * that the GPU is kept as busy as possible. In the case where a single context
- * completes but a second context is still executing, the request for this second
- * context will be at the head of the queue when we remove the first one. This
- * request will then be resubmitted along with a new request for a different context,
- * which will cause the hardware to continue executing the second request and queue
- * the new request (the GPU detects the condition of a context getting preempted
- * with the same context and optimizes the context switch flow by not doing
- * preemption, but just sampling the new tail pointer).
- *
- */
#include <linux/interrupt.h>
+#include "gen8_engine_cs.h"
#include "i915_drv.h"
-#include "i915_perf.h"
#include "i915_trace.h"
-#include "i915_vgpu.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_gt.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
+#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
@@ -230,15 +102,6 @@ static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
static int __execlists_context_alloc(struct intel_context *ce,
struct intel_engine_cs *engine);
-static void execlists_init_reg_state(u32 *reg_state,
- const struct intel_context *ce,
- const struct intel_engine_cs *engine,
- const struct intel_ring *ring,
- bool close);
-static void
-__execlists_update_reg_state(const struct intel_context *ce,
- const struct intel_engine_cs *engine);
-
static void mark_eio(struct i915_request *rq)
{
if (i915_request_completed(rq))
@@ -431,527 +294,6 @@ assert_priority_queue(const struct i915_request *prev,
return rq_prio(prev) >= rq_prio(next);
}
-/*
- * The context descriptor encodes various attributes of a context,
- * including its GTT address and some flags. Because it's fairly
- * expensive to calculate, we'll just do it once and cache the result,
- * which remains valid until the context is unpinned.
- *
- * This is what a descriptor looks like, from LSB to MSB::
- *
- * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
- * bits 12-31: LRCA, GTT address of (the HWSP of) this context
- * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
- * bits 53-54: mbz, reserved for use by hardware
- * bits 55-63: group ID, currently unused and set to 0
- *
- * Starting from Gen11, the upper dword of the descriptor has a new format:
- *
- * bits 32-36: reserved
- * bits 37-47: SW context ID
- * bits 48:53: engine instance
- * bit 54: mbz, reserved for use by hardware
- * bits 55-60: SW counter
- * bits 61-63: engine class
- *
- * engine info, SW context ID and SW counter need to form a unique number
- * (Context ID) per lrc.
- */
-static u64
-lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
-{
- u64 desc;
-
- desc = INTEL_LEGACY_32B_CONTEXT;
- if (i915_vm_is_4lvl(ce->vm))
- desc = INTEL_LEGACY_64B_CONTEXT;
- desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
-
- desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
- if (IS_GEN(engine->i915, 8))
- desc |= GEN8_CTX_L3LLC_COHERENT;
-
- desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
- /*
- * The following 32bits are copied into the OA reports (dword 2).
- * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
- * anything below.
- */
- if (INTEL_GEN(engine->i915) >= 11) {
- desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
- /* bits 48-53 */
-
- desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
- /* bits 61-63 */
- }
-
- return desc;
-}
-
-static inline unsigned int dword_in_page(void *addr)
-{
- return offset_in_page(addr) / sizeof(u32);
-}
-
-static void set_offsets(u32 *regs,
- const u8 *data,
- const struct intel_engine_cs *engine,
- bool clear)
-#define NOP(x) (BIT(7) | (x))
-#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
-#define POSTED BIT(0)
-#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
-#define REG16(x) \
- (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
- (((x) >> 2) & 0x7f)
-#define END(x) 0, (x)
-{
- const u32 base = engine->mmio_base;
-
- while (*data) {
- u8 count, flags;
-
- if (*data & BIT(7)) { /* skip */
- count = *data++ & ~BIT(7);
- if (clear)
- memset32(regs, MI_NOOP, count);
- regs += count;
- continue;
- }
-
- count = *data & 0x3f;
- flags = *data >> 6;
- data++;
-
- *regs = MI_LOAD_REGISTER_IMM(count);
- if (flags & POSTED)
- *regs |= MI_LRI_FORCE_POSTED;
- if (INTEL_GEN(engine->i915) >= 11)
- *regs |= MI_LRI_CS_MMIO;
- regs++;
-
- GEM_BUG_ON(!count);
- do {
- u32 offset = 0;
- u8 v;
-
- do {
- v = *data++;
- offset <<= 7;
- offset |= v & ~BIT(7);
- } while (v & BIT(7));
-
- regs[0] = base + (offset << 2);
- if (clear)
- regs[1] = 0;
- regs += 2;
- } while (--count);
- }
-
- if (clear) {
- u8 count = *++data;
-
- /* Clear past the tail for HW access */
- GEM_BUG_ON(dword_in_page(regs) > count);
- memset32(regs, MI_NOOP, count - dword_in_page(regs));
-
- /* Close the batch; used mainly by live_lrc_layout() */
- *regs = MI_BATCH_BUFFER_END;
- if (INTEL_GEN(engine->i915) >= 10)
- *regs |= BIT(0);
- }
-}
-
-static const u8 gen8_xcs_offsets[] = {
- NOP(1),
- LRI(11, 0),
- REG16(0x244),
- REG(0x034),
- REG(0x030),
- REG(0x038),
- REG(0x03c),
- REG(0x168),
- REG(0x140),
- REG(0x110),
- REG(0x11c),
- REG(0x114),
- REG(0x118),
-
- NOP(9),
- LRI(9, 0),
- REG16(0x3a8),
- REG16(0x28c),
- REG16(0x288),
- REG16(0x284),
- REG16(0x280),
- REG16(0x27c),
- REG16(0x278),
- REG16(0x274),
- REG16(0x270),
-
- NOP(13),
- LRI(2, 0),
- REG16(0x200),
- REG(0x028),
-
- END(80)
-};
-
-static const u8 gen9_xcs_offsets[] = {
- NOP(1),
- LRI(14, POSTED),
- REG16(0x244),
- REG(0x034),
- REG(0x030),
- REG(0x038),
- REG(0x03c),
- REG(0x168),
- REG(0x140),
- REG(0x110),
- REG(0x11c),
- REG(0x114),
- REG(0x118),
- REG(0x1c0),
- REG(0x1c4),
- REG(0x1c8),
-
- NOP(3),
- LRI(9, POSTED),
- REG16(0x3a8),
- REG16(0x28c),
- REG16(0x288),
- REG16(0x284),
- REG16(0x280),
- REG16(0x27c),
- REG16(0x278),
- REG16(0x274),
- REG16(0x270),
-
- NOP(13),
- LRI(1, POSTED),
- REG16(0x200),
-
- NOP(13),
- LRI(44, POSTED),
- REG(0x028),
- REG(0x09c),
- REG(0x0c0),
- REG(0x178),
- REG(0x17c),
- REG16(0x358),
- REG(0x170),
- REG(0x150),
- REG(0x154),
- REG(0x158),
- REG16(0x41c),
- REG16(0x600),
- REG16(0x604),
- REG16(0x608),
- REG16(0x60c),
- REG16(0x610),
- REG16(0x614),
- REG16(0x618),
- REG16(0x61c),
- REG16(0x620),
- REG16(0x624),
- REG16(0x628),
- REG16(0x62c),
- REG16(0x630),
- REG16(0x634),
- REG16(0x638),
- REG16(0x63c),
- REG16(0x640),
- REG16(0x644),
- REG16(0x648),
- REG16(0x64c),
- REG16(0x650),
- REG16(0x654),
- REG16(0x658),
- REG16(0x65c),
- REG16(0x660),
- REG16(0x664),
- REG16(0x668),
- REG16(0x66c),
- REG16(0x670),
- REG16(0x674),
- REG16(0x678),
- REG16(0x67c),
- REG(0x068),
-
- END(176)
-};
-
-static const u8 gen12_xcs_offsets[] = {
- NOP(1),
- LRI(13, POSTED),
- REG16(0x244),
- REG(0x034),
- REG(0x030),
- REG(0x038),
- REG(0x03c),
- REG(0x168),
- REG(0x140),
- REG(0x110),
- REG(0x1c0),
- REG(0x1c4),
- REG(0x1c8),
- REG(0x180),
- REG16(0x2b4),
-
- NOP(5),
- LRI(9, POSTED),
- REG16(0x3a8),
- REG16(0x28c),
- REG16(0x288),
- REG16(0x284),
- REG16(0x280),
- REG16(0x27c),
- REG16(0x278),
- REG16(0x274),
- REG16(0x270),
-
- END(80)
-};
-
-static const u8 gen8_rcs_offsets[] = {
- NOP(1),
- LRI(14, POSTED),
- REG16(0x244),
- REG(0x034),
- REG(0x030),
- REG(0x038),
- REG(0x03c),
- REG(0x168),
- REG(0x140),
- REG(0x110),
- REG(0x11c),
- REG(0x114),
- REG(0x118),
- REG(0x1c0),
- REG(0x1c4),
- REG(0x1c8),
-
- NOP(3),
- LRI(9, POSTED),
- REG16(0x3a8),
- REG16(0x28c),
- REG16(0x288),
- REG16(0x284),
- REG16(0x280),
- REG16(0x27c),
- REG16(0x278),
- REG16(0x274),
- REG16(0x270),
-
- NOP(13),
- LRI(1, 0),
- REG(0x0c8),
-
- END(80)
-};
-
-static const u8 gen9_rcs_offsets[] = {
- NOP(1),
- LRI(14, POSTED),
- REG16(0x244),
- REG(0x34),
- REG(0x30),
- REG(0x38),
- REG(0x3c),
- REG(0x168),
- REG(0x140),
- REG(0x110),
- REG(0x11c),
- REG(0x114),
- REG(0x118),
- REG(0x1c0),
- REG(0x1c4),
- REG(0x1c8),
-
- NOP(3),
- LRI(9, POSTED),
- REG16(0x3a8),
- REG16(0x28c),
- REG16(0x288),
- REG16(0x284),
- REG16(0x280),
- REG16(0x27c),
- REG16(0x278),
- REG16(0x274),
- REG16(0x270),
-
- NOP(13),
- LRI(1, 0),
- REG(0xc8),
-
- NOP(13),
- LRI(44, POSTED),
- REG(0x28),
- REG(0x9c),
- REG(0xc0),
- REG(0x178),
- REG(0x17c),
- REG16(0x358),
- REG(0x170),
- REG(0x150),
- REG(0x154),
- REG(0x158),
- REG16(0x41c),
- REG16(0x600),
- REG16(0x604),
- REG16(0x608),
- REG16(0x60c),
- REG16(0x610),
- REG16(0x614),
- REG16(0x618),
- REG16(0x61c),
- REG16(0x620),
- REG16(0x624),
- REG16(0x628),
- REG16(0x62c),
- REG16(0x630),
- REG16(0x634),
- REG16(0x638),
- REG16(0x63c),
- REG16(0x640),
- REG16(0x644),
- REG16(0x648),
- REG16(0x64c),
- REG16(0x650),
- REG16(0x654),
- REG16(0x658),
- REG16(0x65c),
- REG16(0x660),
- REG16(0x664),
- REG16(0x668),
- REG16(0x66c),
- REG16(0x670),
- REG16(0x674),
- REG16(0x678),
- REG16(0x67c),
- REG(0x68),
-
- END(176)
-};
-
-static const u8 gen11_rcs_offsets[] = {
- NOP(1),
- LRI(15, POSTED),
- REG16(0x244),
- REG(0x034),
- REG(0x030),
- REG(0x038),
- REG(0x03c),
- REG(0x168),
- REG(0x140),
- REG(0x110),
- REG(0x11c),
- REG(0x114),
- REG(0x118),
- REG(0x1c0),
- REG(0x1c4),
- REG(0x1c8),
- REG(0x180),
-
- NOP(1),
- LRI(9, POSTED),
- REG16(0x3a8),
- REG16(0x28c),
- REG16(0x288),
- REG16(0x284),
- REG16(0x280),
- REG16(0x27c),
- REG16(0x278),
- REG16(0x274),
- REG16(0x270),
-
- LRI(1, POSTED),
- REG(0x1b0),
-
- NOP(10),
- LRI(1, 0),
- REG(0x0c8),
-
- END(80)
-};
-
-static const u8 gen12_rcs_offsets[] = {
- NOP(1),
- LRI(13, POSTED),
- REG16(0x244),
- REG(0x034),
- REG(0x030),
- REG(0x038),
- REG(0x03c),
- REG(0x168),
- REG(0x140),
- REG(0x110),
- REG(0x1c0),
- REG(0x1c4),
- REG(0x1c8),
- REG(0x180),
- REG16(0x2b4),
-
- NOP(5),
- LRI(9, POSTED),
- REG16(0x3a8),
- REG16(0x28c),
- REG16(0x288),
- REG16(0x284),
- REG16(0x280),
- REG16(0x27c),
- REG16(0x278),
- REG16(0x274),
- REG16(0x270),
-
- LRI(3, POSTED),
- REG(0x1b0),
- REG16(0x5a8),
- REG16(0x5ac),
-
- NOP(6),
- LRI(1, 0),
- REG(0x0c8),
-
- END(80)
-};
-
-#undef END
-#undef REG16
-#undef REG
-#undef LRI
-#undef NOP
-
-static const u8 *reg_offsets(const struct intel_engine_cs *engine)
-{
- /*
- * The gen12+ lists only have the registers we program in the basic
- * default state. We rely on the context image using relative
- * addressing to automatic fixup the register state between the
- * physical engines for virtual engine.
- */
- GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
- !intel_engine_has_relative_mmio(engine));
-
- if (engine->class == RENDER_CLASS) {
- if (INTEL_GEN(engine->i915) >= 12)
- return gen12_rcs_offsets;
- else if (INTEL_GEN(engine->i915) >= 11)
- return gen11_rcs_offsets;
- else if (INTEL_GEN(engine->i915) >= 9)
- return gen9_rcs_offsets;
- else
- return gen8_rcs_offsets;
- } else {
- if (INTEL_GEN(engine->i915) >= 12)
- return gen12_xcs_offsets;
- else if (INTEL_GEN(engine->i915) >= 9)
- return gen9_xcs_offsets;
- else
- return gen8_xcs_offsets;
- }
-}
-
static struct i915_request *
__unwind_incomplete_requests(struct intel_engine_cs *engine)
{
@@ -1092,18 +434,6 @@ static void intel_engine_context_out(struct intel_engine_cs *engine)
write_sequnlock_irqrestore(&engine->stats.lock, flags);
}
-static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
-{
- if (INTEL_GEN(engine->i915) >= 12)
- return 0x60;
- else if (INTEL_GEN(engine->i915) >= 9)
- return 0x54;
- else if (engine->class == RENDER_CLASS)
- return 0x58;
- else
- return -1;
-}
-
static void
execlists_check_context(const struct intel_context *ce,
const struct intel_engine_cs *engine)
@@ -1132,7 +462,7 @@ execlists_check_context(const struct intel_context *ce,
valid = false;
}
- x = lrc_ring_mi_mode(engine);
+ x = intel_lrc_ring_mi_mode(engine);
if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
engine->name, regs[x + 1]);
@@ -1154,7 +484,7 @@ static void restore_default_state(struct intel_context *ce,
engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
engine->context_size - PAGE_SIZE);
- execlists_init_reg_state(regs, ce, engine, ce->ring, false);
+ intel_lrc_init_reg_state(regs, ce, engine, ce->ring, false);
}
static void reset_active(struct i915_request *rq,
@@ -1191,7 +521,7 @@ static void reset_active(struct i915_request *rq,
/* Scrub the context image to prevent replaying the previous batch */
restore_default_state(ce, engine);
- __execlists_update_reg_state(ce, engine);
+ intel_lrc_update_reg_state(ce, engine);
/* We've switched away, so this should be a no-op, but intent matters */
ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
@@ -1550,7 +880,7 @@ static bool can_merge_rq(const struct i915_request *prev,
static void virtual_update_register_offsets(u32 *regs,
struct intel_engine_cs *engine)
{
- set_offsets(regs, reg_offsets(engine), engine, false);
+ intel_lrc_set_reg_offsets(regs, engine);
}
static bool virtual_matches(const struct virtual_engine *ve,
@@ -2891,29 +2221,6 @@ static void execlists_context_unpin(struct intel_context *ce)
i915_gem_object_unpin_map(ce->state->obj);
}
-static void
-__execlists_update_reg_state(const struct intel_context *ce,
- const struct intel_engine_cs *engine)
-{
- struct intel_ring *ring = ce->ring;
- u32 *regs = ce->lrc_reg_state;
-
- GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
- GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
-
- regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
- regs[CTX_RING_HEAD] = ring->head;
- regs[CTX_RING_TAIL] = ring->tail;
-
- /* RPCS */
- if (engine->class == RENDER_CLASS) {
- regs[CTX_R_PWR_CLK_STATE] =
- intel_sseu_make_rpcs(engine->i915, &ce->sseu);
-
- i915_oa_init_reg_state(ce, engine);
- }
-}
-
static int
__execlists_context_pin(struct intel_context *ce,
struct intel_engine_cs *engine)
@@ -2929,9 +2236,11 @@ __execlists_context_pin(struct intel_context *ce,
if (IS_ERR(vaddr))
return PTR_ERR(vaddr);
- ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
+ ce->lrc_desc =
+ intel_lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
+
ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
- __execlists_update_reg_state(ce, engine);
+ intel_lrc_update_reg_state(ce, engine);
return 0;
}
@@ -2970,9 +2279,9 @@ static void execlists_context_reset(struct intel_context *ce)
intel_ring_reset(ce->ring, ce->ring->emit);
/* Scrub away the garbage */
- execlists_init_reg_state(ce->lrc_reg_state,
+ intel_lrc_init_reg_state(ce->lrc_reg_state,
ce, ce->engine, ce->ring, true);
- __execlists_update_reg_state(ce, ce->engine);
+ intel_lrc_update_reg_state(ce, ce->engine);
ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
}
@@ -3052,330 +2361,6 @@ static int execlists_request_alloc(struct i915_request *request)
return 0;
}
-/*
- * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
- * PIPE_CONTROL instruction. This is required for the flush to happen correctly
- * but there is a slight complication as this is applied in WA batch where the
- * values are only initialized once so we cannot take register value at the
- * beginning and reuse it further; hence we save its value to memory, upload a
- * constant value with bit21 set and then we restore it back with the saved value.
- * To simplify the WA, a constant value is formed by using the default value
- * of this register. This shouldn't be a problem because we are only modifying
- * it for a short period and this batch in non-premptible. We can ofcourse
- * use additional instructions that read the actual value of the register
- * at that time and set our bit of interest but it makes the WA complicated.
- *
- * This WA is also required for Gen9 so extracting as a function avoids
- * code duplication.
- */
-static u32 *
-gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
-{
- /* NB no one else is allowed to scribble over scratch + 256! */
- *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
- *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
- *batch++ = intel_gt_scratch_offset(engine->gt,
- INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
- *batch++ = 0;
-
- *batch++ = MI_LOAD_REGISTER_IMM(1);
- *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
- *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
-
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_CS_STALL |
- PIPE_CONTROL_DC_FLUSH_ENABLE,
- 0);
-
- *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
- *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
- *batch++ = intel_gt_scratch_offset(engine->gt,
- INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
- *batch++ = 0;
-
- return batch;
-}
-
-/*
- * Typically we only have one indirect_ctx and per_ctx batch buffer which are
- * initialized at the beginning and shared across all contexts but this field
- * helps us to have multiple batches at different offsets and select them based
- * on a criteria. At the moment this batch always start at the beginning of the page
- * and at this point we don't have multiple wa_ctx batch buffers.
- *
- * The number of WA applied are not known at the beginning; we use this field
- * to return the no of DWORDS written.
- *
- * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
- * so it adds NOOPs as padding to make it cacheline aligned.
- * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
- * makes a complete batch buffer.
- */
-static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- /* WaDisableCtxRestoreArbitration:bdw,chv */
- *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
-
- /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
- if (IS_BROADWELL(engine->i915))
- batch = gen8_emit_flush_coherentl3_wa(engine, batch);
-
- /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
- /* Actual scratch location is at 128 bytes offset */
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_FLUSH_L3 |
- PIPE_CONTROL_STORE_DATA_INDEX |
- PIPE_CONTROL_CS_STALL |
- PIPE_CONTROL_QW_WRITE,
- LRC_PPHWSP_SCRATCH_ADDR);
-
- *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
-
- /* Pad to end of cacheline */
- while ((unsigned long)batch % CACHELINE_BYTES)
- *batch++ = MI_NOOP;
-
- /*
- * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
- * execution depends on the length specified in terms of cache lines
- * in the register CTX_RCS_INDIRECT_CTX
- */
-
- return batch;
-}
-
-struct lri {
- i915_reg_t reg;
- u32 value;
-};
-
-static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
-{
- GEM_BUG_ON(!count || count > 63);
-
- *batch++ = MI_LOAD_REGISTER_IMM(count);
- do {
- *batch++ = i915_mmio_reg_offset(lri->reg);
- *batch++ = lri->value;
- } while (lri++, --count);
- *batch++ = MI_NOOP;
-
- return batch;
-}
-
-static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- static const struct lri lri[] = {
- /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
- {
- COMMON_SLICE_CHICKEN2,
- __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
- 0),
- },
-
- /* BSpec: 11391 */
- {
- FF_SLICE_CHICKEN,
- __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
- FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
- },
-
- /* BSpec: 11299 */
- {
- _3D_CHICKEN3,
- __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
- _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
- }
- };
-
- *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
-
- /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
- batch = gen8_emit_flush_coherentl3_wa(engine, batch);
-
- /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_FLUSH_L3 |
- PIPE_CONTROL_STORE_DATA_INDEX |
- PIPE_CONTROL_CS_STALL |
- PIPE_CONTROL_QW_WRITE,
- LRC_PPHWSP_SCRATCH_ADDR);
-
- batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
-
- /* WaMediaPoolStateCmdInWABB:bxt,glk */
- if (HAS_POOLED_EU(engine->i915)) {
- /*
- * EU pool configuration is setup along with golden context
- * during context initialization. This value depends on
- * device type (2x6 or 3x6) and needs to be updated based
- * on which subslice is disabled especially for 2x6
- * devices, however it is safe to load default
- * configuration of 3x6 device instead of masking off
- * corresponding bits because HW ignores bits of a disabled
- * subslice and drops down to appropriate config. Please
- * see render_state_setup() in i915_gem_render_state.c for
- * possible configurations, to avoid duplication they are
- * not shown here again.
- */
- *batch++ = GEN9_MEDIA_POOL_STATE;
- *batch++ = GEN9_MEDIA_POOL_ENABLE;
- *batch++ = 0x00777000;
- *batch++ = 0;
- *batch++ = 0;
- *batch++ = 0;
- }
-
- *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
-
- /* Pad to end of cacheline */
- while ((unsigned long)batch % CACHELINE_BYTES)
- *batch++ = MI_NOOP;
-
- return batch;
-}
-
-static u32 *
-gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
-{
- int i;
-
- /*
- * WaPipeControlBefore3DStateSamplePattern: cnl
- *
- * Ensure the engine is idle prior to programming a
- * 3DSTATE_SAMPLE_PATTERN during a context restore.
- */
- batch = gen8_emit_pipe_control(batch,
- PIPE_CONTROL_CS_STALL,
- 0);
- /*
- * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
- * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
- * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
- * confusing. Since gen8_emit_pipe_control() already advances the
- * batch by 6 dwords, we advance the other 10 here, completing a
- * cacheline. It's not clear if the workaround requires this padding
- * before other commands, or if it's just the regular padding we would
- * already have for the workaround bb, so leave it here for now.
- */
- for (i = 0; i < 10; i++)
- *batch++ = MI_NOOP;
-
- /* Pad to end of cacheline */
- while ((unsigned long)batch % CACHELINE_BYTES)
- *batch++ = MI_NOOP;
-
- return batch;
-}
-
-#define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
-
-static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
-{
- struct drm_i915_gem_object *obj;
- struct i915_vma *vma;
- int err;
-
- obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
- if (IS_ERR(obj))
- return PTR_ERR(obj);
-
- vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
- if (IS_ERR(vma)) {
- err = PTR_ERR(vma);
- goto err;
- }
-
- err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
- if (err)
- goto err;
-
- engine->wa_ctx.vma = vma;
- return 0;
-
-err:
- i915_gem_object_put(obj);
- return err;
-}
-
-static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
-{
- i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
-}
-
-typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
-
-static int intel_init_workaround_bb(struct intel_engine_cs *engine)
-{
- struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
- struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
- &wa_ctx->per_ctx };
- wa_bb_func_t wa_bb_fn[2];
- struct page *page;
- void *batch, *batch_ptr;
- unsigned int i;
- int ret;
-
- if (engine->class != RENDER_CLASS)
- return 0;
-
- switch (INTEL_GEN(engine->i915)) {
- case 12:
- case 11:
- return 0;
- case 10:
- wa_bb_fn[0] = gen10_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
- break;
- case 9:
- wa_bb_fn[0] = gen9_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
- break;
- case 8:
- wa_bb_fn[0] = gen8_init_indirectctx_bb;
- wa_bb_fn[1] = NULL;
- break;
- default:
- MISSING_CASE(INTEL_GEN(engine->i915));
- return 0;
- }
-
- ret = lrc_setup_wa_ctx(engine);
- if (ret) {
- DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
- return ret;
- }
-
- page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
- batch = batch_ptr = kmap_atomic(page);
-
- /*
- * Emit the two workaround batch buffers, recording the offset from the
- * start of the workaround batch buffer object for each and their
- * respective sizes.
- */
- for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
- wa_bb[i]->offset = batch_ptr - batch;
- if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
- CACHELINE_BYTES))) {
- ret = -EINVAL;
- break;
- }
- if (wa_bb_fn[i])
- batch_ptr = wa_bb_fn[i](engine, batch_ptr);
- wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
- }
-
- BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
-
- kunmap_atomic(batch);
- if (ret)
- lrc_destroy_wa_ctx(engine);
-
- return ret;
-}
-
static void enable_error_interrupt(struct intel_engine_cs *engine)
{
u32 status;
@@ -3524,25 +2509,6 @@ static void reset_csb_pointers(struct intel_engine_cs *engine)
&execlists->csb_status[reset_value]);
}
-static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
-{
- int x;
-
- x = lrc_ring_mi_mode(engine);
- if (x != -1) {
- regs[x + 1] &= ~STOP_RING;
- regs[x + 1] |= STOP_RING << 16;
- }
-}
-
-static void __execlists_reset_reg_state(const struct intel_context *ce,
- const struct intel_engine_cs *engine)
-{
- u32 *regs = ce->lrc_reg_state;
-
- __reset_stop_ring(regs, engine);
-}
-
static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -3630,8 +2596,8 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
ce->ring->head, ce->ring->tail);
intel_ring_update_space(ce->ring);
- __execlists_reset_reg_state(ce, engine);
- __execlists_update_reg_state(ce, engine);
+ intel_lrc_reset_reg_state(ce, engine);
+ intel_lrc_update_reg_state(ce, engine);
ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
unwind:
@@ -3760,67 +2726,6 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
atomic_read(&execlists->tasklet.count));
}
-static int gen8_emit_bb_start_noarb(struct i915_request *rq,
- u64 offset, u32 len,
- const unsigned int flags)
-{
- u32 *cs;
-
- cs = intel_ring_begin(rq, 4);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- /*
- * WaDisableCtxRestoreArbitration:bdw,chv
- *
- * We don't need to perform MI_ARB_ENABLE as often as we do (in
- * particular all the gen that do not need the w/a at all!), if we
- * took care to make sure that on every switch into this context
- * (both ordinary and for preemption) that arbitrartion was enabled
- * we would be fine. However, for gen8 there is another w/a that
- * requires us to not preempt inside GPGPU execution, so we keep
- * arbitration disabled for gen8 batches. Arbitration will be
- * re-enabled before we close the request
- * (engine->emit_fini_breadcrumb).
- */
- *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
-
- /* FIXME(BDW+): Address space and security selectors. */
- *cs++ = MI_BATCH_BUFFER_START_GEN8 |
- (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
- *cs++ = lower_32_bits(offset);
- *cs++ = upper_32_bits(offset);
-
- intel_ring_advance(rq, cs);
-
- return 0;
-}
-
-static int gen8_emit_bb_start(struct i915_request *rq,
- u64 offset, u32 len,
- const unsigned int flags)
-{
- u32 *cs;
-
- cs = intel_ring_begin(rq, 6);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
-
- *cs++ = MI_BATCH_BUFFER_START_GEN8 |
- (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
- *cs++ = lower_32_bits(offset);
- *cs++ = upper_32_bits(offset);
-
- *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
- *cs++ = MI_NOOP;
-
- intel_ring_advance(rq, cs);
-
- return 0;
-}
-
static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
{
ENGINE_WRITE(engine, RING_IMR,
@@ -3833,249 +2738,6 @@ static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
}
-static int gen8_emit_flush(struct i915_request *request, u32 mode)
-{
- u32 cmd, *cs;
-
- cs = intel_ring_begin(request, 4);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- cmd = MI_FLUSH_DW + 1;
-
- /* We always require a command barrier so that subsequent
- * commands, such as breadcrumb interrupts, are strictly ordered
- * wrt the contents of the write cache being flushed to memory
- * (and thus being coherent from the CPU).
- */
- cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
-
- if (mode & EMIT_INVALIDATE) {
- cmd |= MI_INVALIDATE_TLB;
- if (request->engine->class == VIDEO_DECODE_CLASS)
- cmd |= MI_INVALIDATE_BSD;
- }
-
- *cs++ = cmd;
- *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
- *cs++ = 0; /* upper addr */
- *cs++ = 0; /* value */
- intel_ring_advance(request, cs);
-
- return 0;
-}
-
-static int gen8_emit_flush_render(struct i915_request *request,
- u32 mode)
-{
- bool vf_flush_wa = false, dc_flush_wa = false;
- u32 *cs, flags = 0;
- int len;
-
- flags |= PIPE_CONTROL_CS_STALL;
-
- if (mode & EMIT_FLUSH) {
- flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
- flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
- flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
- flags |= PIPE_CONTROL_FLUSH_ENABLE;
- }
-
- if (mode & EMIT_INVALIDATE) {
- flags |= PIPE_CONTROL_TLB_INVALIDATE;
- flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_QW_WRITE;
- flags |= PIPE_CONTROL_STORE_DATA_INDEX;
-
- /*
- * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
- * pipe control.
- */
- if (IS_GEN(request->i915, 9))
- vf_flush_wa = true;
-
- /* WaForGAMHang:kbl */
- if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
- dc_flush_wa = true;
- }
-
- len = 6;
-
- if (vf_flush_wa)
- len += 6;
-
- if (dc_flush_wa)
- len += 12;
-
- cs = intel_ring_begin(request, len);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- if (vf_flush_wa)
- cs = gen8_emit_pipe_control(cs, 0, 0);
-
- if (dc_flush_wa)
- cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
- 0);
-
- cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
-
- if (dc_flush_wa)
- cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
-
- intel_ring_advance(request, cs);
-
- return 0;
-}
-
-static int gen11_emit_flush_render(struct i915_request *request,
- u32 mode)
-{
- if (mode & EMIT_FLUSH) {
- u32 *cs;
- u32 flags = 0;
-
- flags |= PIPE_CONTROL_CS_STALL;
-
- flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
- flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
- flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
- flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
- flags |= PIPE_CONTROL_FLUSH_ENABLE;
- flags |= PIPE_CONTROL_QW_WRITE;
- flags |= PIPE_CONTROL_STORE_DATA_INDEX;
-
- cs = intel_ring_begin(request, 6);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
- intel_ring_advance(request, cs);
- }
-
- if (mode & EMIT_INVALIDATE) {
- u32 *cs;
- u32 flags = 0;
-
- flags |= PIPE_CONTROL_CS_STALL;
-
- flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_TLB_INVALIDATE;
- flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_QW_WRITE;
- flags |= PIPE_CONTROL_STORE_DATA_INDEX;
-
- cs = intel_ring_begin(request, 6);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
- intel_ring_advance(request, cs);
- }
-
- return 0;
-}
-
-static u32 preparser_disable(bool state)
-{
- return MI_ARB_CHECK | 1 << 8 | state;
-}
-
-static int gen12_emit_flush_render(struct i915_request *request,
- u32 mode)
-{
- if (mode & EMIT_FLUSH) {
- u32 flags = 0;
- u32 *cs;
-
- flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
- flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
- flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
- /* Wa_1409600907:tgl */
- flags |= PIPE_CONTROL_DEPTH_STALL;
- flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
- flags |= PIPE_CONTROL_FLUSH_ENABLE;
- flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
-
- flags |= PIPE_CONTROL_STORE_DATA_INDEX;
- flags |= PIPE_CONTROL_QW_WRITE;
-
- flags |= PIPE_CONTROL_CS_STALL;
-
- cs = intel_ring_begin(request, 6);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
- intel_ring_advance(request, cs);
- }
-
- if (mode & EMIT_INVALIDATE) {
- u32 flags = 0;
- u32 *cs;
-
- flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_TLB_INVALIDATE;
- flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
- flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
-
- flags |= PIPE_CONTROL_STORE_DATA_INDEX;
- flags |= PIPE_CONTROL_QW_WRITE;
-
- flags |= PIPE_CONTROL_CS_STALL;
-
- cs = intel_ring_begin(request, 8);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- /*
- * Prevent the pre-parser from skipping past the TLB
- * invalidate and loading a stale page for the batch
- * buffer / request payload.
- */
- *cs++ = preparser_disable(true);
-
- cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
-
- *cs++ = preparser_disable(false);
- intel_ring_advance(request, cs);
-
- /*
- * Wa_1604544889:tgl
- */
- if (IS_TGL_REVID(request->i915, TGL_REVID_A0, TGL_REVID_A0)) {
- flags = 0;
- flags |= PIPE_CONTROL_CS_STALL;
- flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
-
- flags |= PIPE_CONTROL_STORE_DATA_INDEX;
- flags |= PIPE_CONTROL_QW_WRITE;
-
- cs = intel_ring_begin(request, 6);
- if (IS_ERR(cs))
- return PTR_ERR(cs);
-
- cs = gen8_emit_pipe_control(cs, flags,
- LRC_PPHWSP_SCRATCH_ADDR);
- intel_ring_advance(request, cs);
- }
- }
-
- return 0;
-}
-
/*
* Reserve space for 2 NOOPs at the end of each request to be
* used as a workaround for not being allowed to do lite
@@ -4291,11 +2953,10 @@ static void execlists_release(struct intel_engine_cs *engine)
execlists_shutdown(engine);
intel_engine_cleanup_common(engine);
- lrc_destroy_wa_ctx(engine);
+ intel_engine_fini_workaround_bb(engine);
}
-static void
-logical_ring_default_vfuncs(struct intel_engine_cs *engine)
+static void execlists_default_vfuncs(struct intel_engine_cs *engine)
{
/* Default vfuncs which can be overriden by each engine. */
@@ -4304,7 +2965,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
engine->cops = &execlists_context_ops;
engine->request_alloc = execlists_request_alloc;
- engine->emit_flush = gen8_emit_flush;
+ engine->emit_flush = gen8_emit_flush_xcs;
engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
if (INTEL_GEN(engine->i915) >= 12)
@@ -4325,8 +2986,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
}
}
-static inline void
-logical_ring_default_irqs(struct intel_engine_cs *engine)
+static inline void execlists_default_irqs(struct intel_engine_cs *engine)
{
unsigned int shift = 0;
@@ -4352,15 +3012,19 @@ static void rcs_submission_override(struct intel_engine_cs *engine)
{
switch (INTEL_GEN(engine->i915)) {
case 12:
- engine->emit_flush = gen12_emit_flush_render;
+ engine->emit_flush = gen12_emit_flush_rcs;
engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
break;
case 11:
- engine->emit_flush = gen11_emit_flush_render;
+ engine->emit_flush = gen11_emit_flush_rcs;
engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
break;
+ case 9:
+ engine->emit_flush = gen9_emit_flush_rcs;
+ engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
+ break;
default:
- engine->emit_flush = gen8_emit_flush_render;
+ engine->emit_flush = gen8_emit_flush_rcs;
engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
break;
}
@@ -4378,13 +3042,13 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine)
timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
- logical_ring_default_vfuncs(engine);
- logical_ring_default_irqs(engine);
+ execlists_default_vfuncs(engine);
+ execlists_default_irqs(engine);
if (engine->class == RENDER_CLASS)
rcs_submission_override(engine);
- if (intel_init_workaround_bb(engine))
+ if (intel_engine_init_workaround_bb(engine))
/*
* We continue even if we fail to initialize WA batch
* because we only expect rare glitches but nothing
@@ -4421,137 +3085,6 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine)
return 0;
}
-static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
-{
- u32 indirect_ctx_offset;
-
- switch (INTEL_GEN(engine->i915)) {
- default:
- MISSING_CASE(INTEL_GEN(engine->i915));
- /* fall through */
- case 12:
- indirect_ctx_offset =
- GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- break;
- case 11:
- indirect_ctx_offset =
- GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- break;
- case 10:
- indirect_ctx_offset =
- GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- break;
- case 9:
- indirect_ctx_offset =
- GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- break;
- case 8:
- indirect_ctx_offset =
- GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
- break;
- }
-
- return indirect_ctx_offset;
-}
-
-
-static void init_common_reg_state(u32 * const regs,
- const struct intel_engine_cs *engine,
- const struct intel_ring *ring,
- bool inhibit)
-{
- u32 ctl;
-
- ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
- ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
- if (inhibit)
- ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
- if (INTEL_GEN(engine->i915) < 11)
- ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
- CTX_CTRL_RS_CTX_ENABLE);
- regs[CTX_CONTEXT_CONTROL] = ctl;
-
- regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
-}
-
-static void init_wa_bb_reg_state(u32 * const regs,
- const struct intel_engine_cs *engine,
- u32 pos_bb_per_ctx)
-{
- const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
-
- if (wa_ctx->per_ctx.size) {
- const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
-
- regs[pos_bb_per_ctx] =
- (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
- }
-
- if (wa_ctx->indirect_ctx.size) {
- const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
-
- regs[pos_bb_per_ctx + 2] =
- (ggtt_offset + wa_ctx->indirect_ctx.offset) |
- (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
-
- regs[pos_bb_per_ctx + 4] =
- intel_lr_indirect_ctx_offset(engine) << 6;
- }
-}
-
-static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
-{
- if (i915_vm_is_4lvl(&ppgtt->vm)) {
- /* 64b PPGTT (48bit canonical)
- * PDP0_DESCRIPTOR contains the base address to PML4 and
- * other PDP Descriptors are ignored.
- */
- ASSIGN_CTX_PML4(ppgtt, regs);
- } else {
- ASSIGN_CTX_PDP(ppgtt, regs, 3);
- ASSIGN_CTX_PDP(ppgtt, regs, 2);
- ASSIGN_CTX_PDP(ppgtt, regs, 1);
- ASSIGN_CTX_PDP(ppgtt, regs, 0);
- }
-}
-
-static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
-{
- if (i915_is_ggtt(vm))
- return i915_vm_to_ggtt(vm)->alias;
- else
- return i915_vm_to_ppgtt(vm);
-}
-
-static void execlists_init_reg_state(u32 *regs,
- const struct intel_context *ce,
- const struct intel_engine_cs *engine,
- const struct intel_ring *ring,
- bool inhibit)
-{
- /*
- * A context is actually a big batch buffer with several
- * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
- * values we are setting here are only for the first context restore:
- * on a subsequent save, the GPU will recreate this batchbuffer with new
- * values (including all the missing MI_LOAD_REGISTER_IMM commands that
- * we are not initializing here).
- *
- * Must keep consistent with virtual_update_register_offsets().
- */
- set_offsets(regs, reg_offsets(engine), engine, inhibit);
-
- init_common_reg_state(regs, engine, ring, inhibit);
- init_ppgtt_reg_state(regs, vm_alias(ce->vm));
-
- init_wa_bb_reg_state(regs, engine,
- INTEL_GEN(engine->i915) >= 12 ?
- GEN12_CTX_BB_PER_CTX_PTR :
- CTX_BB_PER_CTX_PTR);
-
- __reset_stop_ring(regs, engine);
-}
-
static int
populate_lr_context(struct intel_context *ce,
struct drm_i915_gem_object *ctx_obj,
@@ -4589,7 +3122,7 @@ populate_lr_context(struct intel_context *ce,
/* The second page of the context object contains some fields which must
* be set up prior to the first execution. */
- execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
+ intel_lrc_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
ce, engine, ring, inhibit);
ret = 0;
@@ -5257,31 +3790,6 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
spin_unlock_irqrestore(&engine->active.lock, flags);
}
-void intel_lr_context_reset(struct intel_engine_cs *engine,
- struct intel_context *ce,
- u32 head,
- bool scrub)
-{
- GEM_BUG_ON(!intel_context_is_pinned(ce));
-
- /*
- * We want a simple context + ring to execute the breadcrumb update.
- * We cannot rely on the context being intact across the GPU hang,
- * so clear it and rebuild just what we need for the breadcrumb.
- * All pending requests for this context will be zapped, and any
- * future request will be after userspace has had the opportunity
- * to recreate its own state.
- */
- if (scrub)
- restore_default_state(ce, engine);
-
- /* Rerun the request; its payload has been neutered (if guilty). */
- ce->ring->head = head;
- intel_ring_update_space(ce->ring);
-
- __execlists_update_reg_state(ce, engine);
-}
-
bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
{
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
new file mode 100644
index 000000000000..fa12d71a199e
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -0,0 +1,830 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2014 Intel Corporation
+ */
+
+/**
+ * DOC: Logical Rings, Logical Ring Contexts and Execlists
+ *
+ * Motivation:
+ * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
+ * These expanded contexts enable a number of new abilities, especially
+ * "Execlists" (also implemented in this file).
+ *
+ * One of the main differences with the legacy HW contexts is that logical
+ * ring contexts incorporate many more things into the context's state, like
+ * PDPs or ringbuffer control registers:
+ *
+ * The reason why PDPs are included in the context is straightforward: as
+ * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
+ * contained there means you don't need to do a ppgtt->switch_mm yourself;
+ * instead, the GPU will do it for you on the context switch.
+ *
+ * But, what about the ringbuffer control registers (head, tail, etc.)?
+ * Shouldn't we just need a set of those per engine command streamer? This is
+ * where the name "Logical Rings" starts to make sense: by virtualizing the
+ * rings, the engine cs shifts to a new "ring buffer" with every context
+ * switch. When you want to submit a workload to the GPU you: A) choose your
+ * context, B) find its appropriate virtualized ring, C) write commands to it
+ * and then, finally, D) tell the GPU to switch to that context.
+ *
+ * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
+ * to a context is via a context execution list, ergo "Execlists".
+ *
+ * LRC implementation:
+ * Regarding the creation of contexts, we have:
+ *
+ * - One global default context.
+ * - One local default context for each opened fd.
+ * - One local extra context for each context create ioctl call.
+ *
+ * Now that ringbuffers belong per-context (and not per-engine, like before)
+ * and that contexts are uniquely tied to a given engine (and not reusable,
+ * like before) we need:
+ *
+ * - One ringbuffer per-engine inside each context.
+ * - One backing object per-engine inside each context.
+ *
+ * The global default context starts its life with these new objects fully
+ * allocated and populated. The local default context for each opened fd is
+ * more complex, because we don't know at creation time which engine is going
+ * to use them. To handle this, we have implemented a deferred creation of LR
+ * contexts:
+ *
+ * The local context starts its life as a hollow or blank holder, that only
+ * gets populated for a given engine once we receive an execbuffer. If later
+ * on we receive another execbuffer ioctl for the same context but a different
+ * engine, we allocate/populate a new ringbuffer and context backing object and
+ * so on.
+ *
+ * Finally, regarding local contexts created using the ioctl call: as they are
+ * only allowed with the render ring, we can allocate & populate them right
+ * away (no need to defer anything, at least for now).
+ *
+ * Execlists implementation:
+ * Execlists are the new method by which, on gen8+ hardware, workloads are
+ * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
+ * This method works as follows:
+ *
+ * When a request is committed, its commands (the BB start and any leading or
+ * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
+ * for the appropriate context. The tail pointer in the hardware context is not
+ * updated at this time, but instead, kept by the driver in the ringbuffer
+ * structure. A structure representing this request is added to a request queue
+ * for the appropriate engine: this structure contains a copy of the context's
+ * tail after the request was written to the ring buffer and a pointer to the
+ * context itself.
+ *
+ * If the engine's request queue was empty before the request was added, the
+ * queue is processed immediately. Otherwise the queue will be processed during
+ * a context switch interrupt. In any case, elements on the queue will get sent
+ * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
+ * globally unique 20-bit submission ID.
+ *
+ * When execution of a request completes, the GPU updates the context status
+ * buffer with a context complete event and generates a context switch
+ * interrupt. During the interrupt handling, the driver examines the events in
+ * the buffer: for each context complete event, if the announced ID matches
+ * that on the head of the request queue, then that request is retired and
+ * removed from the queue.
+ *
+ * After processing, if any requests were retired and the queue is not empty
+ * then a new execution list can be submitted. The two requests at the front of
+ * the queue are next to be submitted but since a context may not occur twice
+ * in an execution list, if subsequent requests have the same ID as the first
+ * then the two requests must be combined. This is done simply by discarding
+ * requests at the head of the queue until either only one request is left
+ * (in which case we use a NULL second context) or the first two requests have
+ * unique IDs.
+ *
+ * By always executing the first two requests in the queue the driver ensures
+ * that the GPU is kept as busy as possible. In the case where a single context
+ * completes but a second context is still executing, the request for this
+ * second context will be at the head of the queue when we remove the first
+ * one. This request will then be resubmitted along with a new request for a
+ * different context, which will cause the hardware to continue executing the
+ * second request and queue the new request (the GPU detects the condition of a
+ * context getting preempted with the same context and optimizes the context
+ * switch flow by not doing preemption, but just sampling the new tail pointer).
+ */
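
The two-slot selection rule described above can be sketched in isolation. The snippet below is illustrative only and uses an invented request structure rather than the driver's types: leading requests that share a context are coalesced (the last one's tail already covers the earlier ones), so a context never appears twice in the pair handed to the ELSP.

#include <stddef.h>
#include <stdio.h>

struct sketch_request {
	unsigned int ctx_id;	/* invented stand-in for the context tag */
};

static size_t pick_elsp_pair(const struct sketch_request *q, size_t count,
			     const struct sketch_request *out[2])
{
	size_t i = 0;

	if (!count)
		return 0;

	/* Coalesce leading requests of the same context; the last one's
	 * tail already covers the earlier ones. */
	while (i + 1 < count && q[i + 1].ctx_id == q[i].ctx_id)
		i++;

	out[0] = &q[i];
	if (i + 1 == count)
		return 1;	/* second ELSP slot is left NULL */

	out[1] = &q[i + 1];
	return 2;
}

int main(void)
{
	const struct sketch_request queue[] = { {1}, {1}, {1}, {2} };
	const struct sketch_request *pair[2];
	size_t n = pick_elsp_pair(queue, 4, pair);

	/* prints: submit ctx 1 then ctx 2 (2 slots) */
	printf("submit ctx %u then ctx %u (%zu slots)\n",
	       pair[0]->ctx_id, n > 1 ? pair[1]->ctx_id : 0, n);
	return 0;
}
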
+
+#include <linux/mm.h>
+
+#include "i915_drv.h"
+#include "i915_perf.h"
+#include "intel_context.h"
+#include "intel_engine.h"
+#include "intel_lrc.h"
+#include "intel_lrc_reg.h"
+#include "intel_ring.h"
+#include "intel_sseu.h"
+
+static inline unsigned int dword_in_page(void *addr)
+{
+ return offset_in_page(addr) / sizeof(u32);
+}
+
+static void set_offsets(u32 *regs,
+ const u8 *data,
+ const struct intel_engine_cs *engine,
+ bool clear)
+#define NOP(x) (BIT(7) | (x))
+#define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
+#define POSTED BIT(0)
+#define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
+#define REG16(x) \
+ (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
+ (((x) >> 2) & 0x7f)
+#define END(x) 0, (x)
+{
+ const u32 base = engine->mmio_base;
+
+ while (*data) {
+ u8 count, flags;
+
+ if (*data & BIT(7)) { /* skip */
+ count = *data++ & ~BIT(7);
+ if (clear)
+ memset32(regs, MI_NOOP, count);
+ regs += count;
+ continue;
+ }
+
+ count = *data & 0x3f;
+ flags = *data >> 6;
+ data++;
+
+ *regs = MI_LOAD_REGISTER_IMM(count);
+ if (flags & POSTED)
+ *regs |= MI_LRI_FORCE_POSTED;
+ if (INTEL_GEN(engine->i915) >= 11)
+ *regs |= MI_LRI_CS_MMIO;
+ regs++;
+
+ GEM_BUG_ON(!count);
+ do {
+ u32 offset = 0;
+ u8 v;
+
+ do {
+ v = *data++;
+ offset <<= 7;
+ offset |= v & ~BIT(7);
+ } while (v & BIT(7));
+
+ regs[0] = base + (offset << 2);
+ if (clear)
+ regs[1] = 0;
+ regs += 2;
+ } while (--count);
+ }
+
+ if (clear) {
+ u8 count = *++data;
+
+ /* Clear past the tail for HW access */
+ GEM_BUG_ON(dword_in_page(regs) > count);
+ memset32(regs, MI_NOOP, count - dword_in_page(regs));
+
+ /* Close the batch; used mainly by live_lrc_layout() */
+ *regs = MI_BATCH_BUFFER_END;
+ if (INTEL_GEN(engine->i915) >= 10)
+ *regs |= BIT(0);
+ }
+}
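
The byte stream consumed by set_offsets() is produced by the NOP()/LRI()/REG()/REG16()/END() macros and the offset tables that follow: each register is stored as a dword offset, 7 bits per byte with BIT(7) acting as a "more bytes follow" flag, and the accumulated value is shifted left by 2 before being added to the engine's mmio_base. A standalone decode of one REG16() entry, shown purely to illustrate the encoding, looks like this:

#include <stdint.h>
#include <stdio.h>

/* Decode one varint-style register offset and return the absolute address. */
static uint32_t decode_reg_offset(const uint8_t **data, uint32_t mmio_base)
{
	uint32_t offset = 0;
	uint8_t v;

	do {
		v = *(*data)++;
		offset <<= 7;		/* 7 payload bits per byte, MSB first */
		offset |= v & 0x7f;
	} while (v & 0x80);		/* BIT(7) set => another byte follows */

	return mmio_base + (offset << 2);
}

int main(void)
{
	/* REG16(0x244) from the tables below encodes as { 0x81, 0x11 }. */
	static const uint8_t stream[] = { 0x81, 0x11 };
	const uint8_t *p = stream;

	/* prints 0x2244, i.e. mmio_base 0x2000 + register offset 0x244 */
	printf("0x%x\n", (unsigned int)decode_reg_offset(&p, 0x2000));
	return 0;
}
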
+
+static const u8 gen8_xcs_offsets[] = {
+ NOP(1),
+ LRI(11, 0),
+ REG16(0x244),
+ REG(0x034),
+ REG(0x030),
+ REG(0x038),
+ REG(0x03c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x11c),
+ REG(0x114),
+ REG(0x118),
+
+ NOP(9),
+ LRI(9, 0),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ NOP(13),
+ LRI(2, 0),
+ REG16(0x200),
+ REG(0x028),
+
+ END(80)
+};
+
+static const u8 gen9_xcs_offsets[] = {
+ NOP(1),
+ LRI(14, POSTED),
+ REG16(0x244),
+ REG(0x034),
+ REG(0x030),
+ REG(0x038),
+ REG(0x03c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x11c),
+ REG(0x114),
+ REG(0x118),
+ REG(0x1c0),
+ REG(0x1c4),
+ REG(0x1c8),
+
+ NOP(3),
+ LRI(9, POSTED),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ NOP(13),
+ LRI(1, POSTED),
+ REG16(0x200),
+
+ NOP(13),
+ LRI(44, POSTED),
+ REG(0x028),
+ REG(0x09c),
+ REG(0x0c0),
+ REG(0x178),
+ REG(0x17c),
+ REG16(0x358),
+ REG(0x170),
+ REG(0x150),
+ REG(0x154),
+ REG(0x158),
+ REG16(0x41c),
+ REG16(0x600),
+ REG16(0x604),
+ REG16(0x608),
+ REG16(0x60c),
+ REG16(0x610),
+ REG16(0x614),
+ REG16(0x618),
+ REG16(0x61c),
+ REG16(0x620),
+ REG16(0x624),
+ REG16(0x628),
+ REG16(0x62c),
+ REG16(0x630),
+ REG16(0x634),
+ REG16(0x638),
+ REG16(0x63c),
+ REG16(0x640),
+ REG16(0x644),
+ REG16(0x648),
+ REG16(0x64c),
+ REG16(0x650),
+ REG16(0x654),
+ REG16(0x658),
+ REG16(0x65c),
+ REG16(0x660),
+ REG16(0x664),
+ REG16(0x668),
+ REG16(0x66c),
+ REG16(0x670),
+ REG16(0x674),
+ REG16(0x678),
+ REG16(0x67c),
+ REG(0x068),
+
+ END(176)
+};
+
+static const u8 gen12_xcs_offsets[] = {
+ NOP(1),
+ LRI(13, POSTED),
+ REG16(0x244),
+ REG(0x034),
+ REG(0x030),
+ REG(0x038),
+ REG(0x03c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x1c0),
+ REG(0x1c4),
+ REG(0x1c8),
+ REG(0x180),
+ REG16(0x2b4),
+
+ NOP(5),
+ LRI(9, POSTED),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ END(80)
+};
+
+static const u8 gen8_rcs_offsets[] = {
+ NOP(1),
+ LRI(14, POSTED),
+ REG16(0x244),
+ REG(0x034),
+ REG(0x030),
+ REG(0x038),
+ REG(0x03c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x11c),
+ REG(0x114),
+ REG(0x118),
+ REG(0x1c0),
+ REG(0x1c4),
+ REG(0x1c8),
+
+ NOP(3),
+ LRI(9, POSTED),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ NOP(13),
+ LRI(1, 0),
+ REG(0x0c8),
+
+ END(80)
+};
+
+static const u8 gen9_rcs_offsets[] = {
+ NOP(1),
+ LRI(14, POSTED),
+ REG16(0x244),
+ REG(0x34),
+ REG(0x30),
+ REG(0x38),
+ REG(0x3c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x11c),
+ REG(0x114),
+ REG(0x118),
+ REG(0x1c0),
+ REG(0x1c4),
+ REG(0x1c8),
+
+ NOP(3),
+ LRI(9, POSTED),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ NOP(13),
+ LRI(1, 0),
+ REG(0xc8),
+
+ NOP(13),
+ LRI(44, POSTED),
+ REG(0x28),
+ REG(0x9c),
+ REG(0xc0),
+ REG(0x178),
+ REG(0x17c),
+ REG16(0x358),
+ REG(0x170),
+ REG(0x150),
+ REG(0x154),
+ REG(0x158),
+ REG16(0x41c),
+ REG16(0x600),
+ REG16(0x604),
+ REG16(0x608),
+ REG16(0x60c),
+ REG16(0x610),
+ REG16(0x614),
+ REG16(0x618),
+ REG16(0x61c),
+ REG16(0x620),
+ REG16(0x624),
+ REG16(0x628),
+ REG16(0x62c),
+ REG16(0x630),
+ REG16(0x634),
+ REG16(0x638),
+ REG16(0x63c),
+ REG16(0x640),
+ REG16(0x644),
+ REG16(0x648),
+ REG16(0x64c),
+ REG16(0x650),
+ REG16(0x654),
+ REG16(0x658),
+ REG16(0x65c),
+ REG16(0x660),
+ REG16(0x664),
+ REG16(0x668),
+ REG16(0x66c),
+ REG16(0x670),
+ REG16(0x674),
+ REG16(0x678),
+ REG16(0x67c),
+ REG(0x68),
+
+ END(176)
+};
+
+static const u8 gen11_rcs_offsets[] = {
+ NOP(1),
+ LRI(15, POSTED),
+ REG16(0x244),
+ REG(0x034),
+ REG(0x030),
+ REG(0x038),
+ REG(0x03c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x11c),
+ REG(0x114),
+ REG(0x118),
+ REG(0x1c0),
+ REG(0x1c4),
+ REG(0x1c8),
+ REG(0x180),
+
+ NOP(1),
+ LRI(9, POSTED),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ LRI(1, POSTED),
+ REG(0x1b0),
+
+ NOP(10),
+ LRI(1, 0),
+ REG(0x0c8),
+
+ END(80)
+};
+
+static const u8 gen12_rcs_offsets[] = {
+ NOP(1),
+ LRI(13, POSTED),
+ REG16(0x244),
+ REG(0x034),
+ REG(0x030),
+ REG(0x038),
+ REG(0x03c),
+ REG(0x168),
+ REG(0x140),
+ REG(0x110),
+ REG(0x1c0),
+ REG(0x1c4),
+ REG(0x1c8),
+ REG(0x180),
+ REG16(0x2b4),
+
+ NOP(5),
+ LRI(9, POSTED),
+ REG16(0x3a8),
+ REG16(0x28c),
+ REG16(0x288),
+ REG16(0x284),
+ REG16(0x280),
+ REG16(0x27c),
+ REG16(0x278),
+ REG16(0x274),
+ REG16(0x270),
+
+ LRI(3, POSTED),
+ REG(0x1b0),
+ REG16(0x5a8),
+ REG16(0x5ac),
+
+ NOP(6),
+ LRI(1, 0),
+ REG(0x0c8),
+
+ END(80)
+};
+
+#undef END
+#undef REG16
+#undef REG
+#undef LRI
+#undef NOP
+
+static const u8 *reg_offsets(const struct intel_engine_cs *engine)
+{
+ /*
+ * The gen12+ lists only have the registers we program in the basic
+ * default state. We rely on the context image using relative
+ * addressing to automatically fix up the register state between the
+ * physical engines for a virtual engine.
+ */
+ GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
+ !intel_engine_has_relative_mmio(engine));
+
+ if (engine->class == RENDER_CLASS) {
+ if (INTEL_GEN(engine->i915) >= 12)
+ return gen12_rcs_offsets;
+ else if (INTEL_GEN(engine->i915) >= 11)
+ return gen11_rcs_offsets;
+ else if (INTEL_GEN(engine->i915) >= 9)
+ return gen9_rcs_offsets;
+ else
+ return gen8_rcs_offsets;
+ } else {
+ if (INTEL_GEN(engine->i915) >= 12)
+ return gen12_xcs_offsets;
+ else if (INTEL_GEN(engine->i915) >= 9)
+ return gen9_xcs_offsets;
+ else
+ return gen8_xcs_offsets;
+ }
+}
+
+int intel_lrc_ring_mi_mode(const struct intel_engine_cs *engine)
+{
+ if (INTEL_GEN(engine->i915) >= 12)
+ return 0x60;
+ else if (INTEL_GEN(engine->i915) >= 9)
+ return 0x54;
+ else if (engine->class == RENDER_CLASS)
+ return 0x58;
+ else
+ return -1;
+}
+
+static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
+{
+ int x;
+
+ x = intel_lrc_ring_mi_mode(engine);
+ if (x != -1) {
+ regs[x + 1] &= ~STOP_RING;
+ regs[x + 1] |= STOP_RING << 16;
+ }
+}
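
RING_MI_MODE is a masked register: the upper 16 bits of the written value act as a per-bit write enable for the lower 16. The sketch below, using an assumed bit position for STOP_RING rather than the driver's header, shows the kind of value __reset_stop_ring() leaves in the context image so the restore clears STOP_RING without disturbing the other bits:

#include <stdint.h>
#include <stdio.h>

/* Assumed bit position, for illustration only. */
#define SKETCH_STOP_RING	(1u << 8)

/* Mirror of __reset_stop_ring(): clear the bit in the low word and set its
 * write-enable bit in the high word so the hardware applies the clear. */
static uint32_t sketch_reset_stop_ring(uint32_t val)
{
	val &= ~SKETCH_STOP_RING;		/* value: do not stop the ring */
	val |= SKETCH_STOP_RING << 16;		/* mask: make this write take effect */
	return val;
}

int main(void)
{
	/* prints 0x01000000: bit 8 cleared, its mask bit (bit 24) set */
	printf("0x%08x\n", sketch_reset_stop_ring(SKETCH_STOP_RING));
	return 0;
}
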
+
+static void init_common_reg_state(u32 * const regs,
+ const struct intel_engine_cs *engine,
+ const struct intel_ring *ring,
+ bool inhibit)
+{
+ u32 ctl;
+
+ ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
+ ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
+ if (inhibit)
+ ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
+ if (INTEL_GEN(engine->i915) < 11)
+ ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
+ CTX_CTRL_RS_CTX_ENABLE);
+ regs[CTX_CONTEXT_CONTROL] = ctl;
+
+ regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
+}
+
+static u32 lrc_indirect_ctx_offset(const struct intel_engine_cs *engine)
+{
+ u32 indirect_ctx_offset;
+
+ switch (INTEL_GEN(engine->i915)) {
+ default:
+ MISSING_CASE(INTEL_GEN(engine->i915));
+ /* fall through */
+ case 12:
+ indirect_ctx_offset =
+ GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+ break;
+ case 11:
+ indirect_ctx_offset =
+ GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+ break;
+ case 10:
+ indirect_ctx_offset =
+ GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+ break;
+ case 9:
+ indirect_ctx_offset =
+ GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+ break;
+ case 8:
+ indirect_ctx_offset =
+ GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
+ break;
+ }
+
+ return indirect_ctx_offset;
+}
+
+static void init_wa_bb_reg_state(u32 * const regs,
+ const struct intel_engine_cs *engine,
+ u32 pos_bb_per_ctx)
+{
+ const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
+
+ if (wa_ctx->per_ctx.size) {
+ const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
+
+ regs[pos_bb_per_ctx] =
+ (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
+ }
+
+ if (wa_ctx->indirect_ctx.size) {
+ const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
+
+ regs[pos_bb_per_ctx + 2] =
+ (ggtt_offset + wa_ctx->indirect_ctx.offset) |
+ (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
+
+ regs[pos_bb_per_ctx + 4] = lrc_indirect_ctx_offset(engine) << 6;
+ }
+}
+
+static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
+{
+ if (i915_vm_is_4lvl(&ppgtt->vm)) {
+ /* 64b PPGTT (48bit canonical)
+ * PDP0_DESCRIPTOR contains the base address to PML4 and
+ * other PDP Descriptors are ignored.
+ */
+ ASSIGN_CTX_PML4(ppgtt, regs);
+ } else {
+ ASSIGN_CTX_PDP(ppgtt, regs, 3);
+ ASSIGN_CTX_PDP(ppgtt, regs, 2);
+ ASSIGN_CTX_PDP(ppgtt, regs, 1);
+ ASSIGN_CTX_PDP(ppgtt, regs, 0);
+ }
+}
+
+static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
+{
+ if (i915_is_ggtt(vm))
+ return i915_vm_to_ggtt(vm)->alias;
+ else
+ return i915_vm_to_ppgtt(vm);
+}
+
+void intel_lrc_init_reg_state(u32 *regs,
+ const struct intel_context *ce,
+ const struct intel_engine_cs *engine,
+ const struct intel_ring *ring,
+ bool inhibit)
+{
+ /*
+ * A context is actually a big batch buffer with several
+ * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
+ * values we are setting here are only for the first context restore:
+ * on a subsequent save, the GPU will recreate this batchbuffer with new
+ * values (including all the missing MI_LOAD_REGISTER_IMM commands that
+ * we are not initializing here).
+ */
+ set_offsets(regs, reg_offsets(engine), engine, inhibit);
+
+ init_common_reg_state(regs, engine, ring, inhibit);
+ init_ppgtt_reg_state(regs, vm_alias(ce->vm));
+
+ init_wa_bb_reg_state(regs, engine,
+ INTEL_GEN(engine->i915) >= 12 ?
+ GEN12_CTX_BB_PER_CTX_PTR :
+ CTX_BB_PER_CTX_PTR);
+
+ __reset_stop_ring(regs, engine);
+}
+
+void intel_lrc_update_reg_state(const struct intel_context *ce,
+ const struct intel_engine_cs *engine)
+{
+ const struct intel_ring *ring = ce->ring;
+ u32 *regs = ce->lrc_reg_state;
+
+ GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
+ GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
+
+ regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
+ regs[CTX_RING_HEAD] = ring->head;
+ regs[CTX_RING_TAIL] = ring->tail;
+
+ /* RPCS */
+ if (engine->class == RENDER_CLASS) {
+ regs[CTX_R_PWR_CLK_STATE] =
+ intel_sseu_make_rpcs(engine->i915, &ce->sseu);
+
+ i915_oa_init_reg_state(ce, engine);
+ }
+}
+
+void intel_lrc_reset_reg_state(const struct intel_context *ce,
+ const struct intel_engine_cs *engine)
+{
+ u32 *regs = ce->lrc_reg_state;
+
+ __reset_stop_ring(regs, engine);
+}
+
+void intel_lrc_set_reg_offsets(u32 *regs, struct intel_engine_cs *engine)
+{
+ set_offsets(regs, reg_offsets(engine), engine, false);
+}
+
+/*
+ * The context descriptor encodes various attributes of a context,
+ * including its GTT address and some flags. Because it's fairly
+ * expensive to calculate, we'll just do it once and cache the result,
+ * which remains valid until the context is unpinned.
+ *
+ * This is what a descriptor looks like, from LSB to MSB::
+ *
+ * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
+ * bits 12-31: LRCA, GTT address of (the HWSP of) this context
+ * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
+ * bits 53-54: mbz, reserved for use by hardware
+ * bits 55-63: group ID, currently unused and set to 0
+ *
+ * Starting from Gen11, the upper dword of the descriptor has a new format:
+ *
+ * bits 32-36: reserved
+ * bits 37-47: SW context ID
+ * bits 48-53: engine instance
+ * bit 54: mbz, reserved for use by hardware
+ * bits 55-60: SW counter
+ * bits 61-63: engine class
+ *
+ * engine info, SW context ID and SW counter need to form a unique number
+ * (Context ID) per lrc.
+ */
+u64 intel_lrc_descriptor(struct intel_context *ce,
+ struct intel_engine_cs *engine)
+{
+ u64 desc;
+
+ desc = INTEL_LEGACY_32B_CONTEXT;
+ if (i915_vm_is_4lvl(ce->vm))
+ desc = INTEL_LEGACY_64B_CONTEXT;
+ desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
+
+ desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
+ if (IS_GEN(engine->i915, 8))
+ desc |= GEN8_CTX_L3LLC_COHERENT;
+
+ desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
+ /*
+ * The following 32bits are copied into the OA reports (dword 2).
+ * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
+ * anything below.
+ */
+ if (INTEL_GEN(engine->i915) >= 11) {
+ desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
+ /* bits 48-53 */
+
+ desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
+ /* bits 61-63 */
+ }
+
+ return desc;
+}
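
For reference, the fields named in the layout comment above can be pulled back out of a gen11+ descriptor as below. The shift and width values are restated from that comment, not taken from the driver's headers, so treat this purely as an illustration:

#include <stdint.h>
#include <stdio.h>

#define SKETCH_LRCA_MASK		0xfffff000ull	/* bits 12-31 */
#define SKETCH_ENGINE_INSTANCE_SHIFT	48		/* bits 48-53 */
#define SKETCH_ENGINE_CLASS_SHIFT	61		/* bits 61-63 */

static uint32_t desc_lrca(uint64_t desc)
{
	return (uint32_t)(desc & SKETCH_LRCA_MASK);
}

static unsigned int desc_engine_instance(uint64_t desc)
{
	return (desc >> SKETCH_ENGINE_INSTANCE_SHIFT) & 0x3f;	/* 6 bits */
}

static unsigned int desc_engine_class(uint64_t desc)
{
	return (desc >> SKETCH_ENGINE_CLASS_SHIFT) & 0x7;	/* 3 bits */
}

int main(void)
{
	const uint64_t desc = ((uint64_t)1 << 61) |	/* class 1 */
			      ((uint64_t)2 << 48) |	/* instance 2 */
			      0x12345000;		/* LRCA */

	/* prints: lrca=0x12345000 class=1 instance=2 */
	printf("lrca=0x%x class=%u instance=%u\n",
	       desc_lrca(desc), desc_engine_class(desc),
	       desc_engine_instance(desc));
	return 0;
}
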
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftest_lrc.c"
+#endif
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.h b/drivers/gpu/drm/i915/gt/intel_lrc.h
index dfbc214e14f5..e617196c802a 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.h
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.h
@@ -33,6 +33,7 @@ struct i915_gem_context;
struct i915_request;
struct intel_context;
struct intel_engine_cs;
+struct intel_ring;
/* Execlists regs */
#define RING_ELSP(base) _MMIO((base) + 0x230)
@@ -50,12 +51,13 @@ struct intel_engine_cs;
#define EL_CTRL_LOAD (1 << 0)
-/* The docs specify that the write pointer wraps around after 5h, "After status
+/*
+ * The docs specify that the write pointer wraps around after 5h, "After status
* is written out to the last available status QW at offset 5h, this pointer
* wraps to 0."
*
* Therefore, one must infer than even though there are 3 bits available, 6 and
- * 7 appear to be * reserved.
+ * 7 appear to be reserved.
*/
#define GEN8_CSB_ENTRIES 6
#define GEN8_CSB_PTR_MASK 0x7
@@ -95,12 +97,25 @@ int intel_execlists_submission_setup(struct intel_engine_cs *engine);
#define LRC_PPHWSP_SCRATCH 0x34
#define LRC_PPHWSP_SCRATCH_ADDR (LRC_PPHWSP_SCRATCH * sizeof(u32))
-void intel_execlists_set_default_submission(struct intel_engine_cs *engine);
+int intel_lrc_ring_mi_mode(const struct intel_engine_cs *engine);
+
+void intel_lrc_set_reg_offsets(u32 *regs, struct intel_engine_cs *engine);
+
+void intel_lrc_init_reg_state(u32 *reg_state,
+ const struct intel_context *ce,
+ const struct intel_engine_cs *engine,
+ const struct intel_ring *ring,
+ bool inhibit);
-void intel_lr_context_reset(struct intel_engine_cs *engine,
- struct intel_context *ce,
- u32 head,
- bool scrub);
+void intel_lrc_update_reg_state(const struct intel_context *ce,
+ const struct intel_engine_cs *engine);
+void intel_lrc_reset_reg_state(const struct intel_context *ce,
+ const struct intel_engine_cs *engine);
+
+u64 intel_lrc_descriptor(struct intel_context *ce,
+ struct intel_engine_cs *engine);
+
+void intel_execlists_set_default_submission(struct intel_engine_cs *engine);
void intel_execlists_show_requests(struct intel_engine_cs *engine,
struct drm_printer *m,
@@ -127,4 +142,7 @@ intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine);
+int intel_engine_init_workaround_bb(struct intel_engine_cs *engine);
+void intel_engine_fini_workaround_bb(struct intel_engine_cs *engine);
+
#endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c
index 5e04ecb61dcc..eac2b7f09080 100644
--- a/drivers/gpu/drm/i915/gt/selftest_execlists.c
+++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c
@@ -6,9 +6,10 @@
#include <linux/prime_numbers.h>
-#include "gem/i915_gem_pm.h"
#include "gt/intel_engine_heartbeat.h"
+#include "gt/intel_engine_pm.h"
#include "gt/intel_reset.h"
+#include "gt/intel_ring.h"
#include "i915_selftest.h"
#include "selftests/i915_random.h"
@@ -201,7 +202,7 @@ static int live_unlite_restore(struct intel_gt *gt, int prio)
}
GEM_BUG_ON(!ce[1]->ring->size);
intel_ring_reset(ce[1]->ring, ce[1]->ring->size / 2);
- __execlists_update_reg_state(ce[1], engine);
+ intel_lrc_update_reg_state(ce[1], engine);
rq[0] = igt_spinner_create_request(&spin, ce[0], MI_ARB_CHECK);
if (IS_ERR(rq[0])) {
@@ -3741,496 +3742,3 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
return intel_gt_live_subtests(tests, &i915->gt);
}
-
-static void hexdump(const void *buf, size_t len)
-{
- const size_t rowsize = 8 * sizeof(u32);
- const void *prev = NULL;
- bool skip = false;
- size_t pos;
-
- for (pos = 0; pos < len; pos += rowsize) {
- char line[128];
-
- if (prev && !memcmp(prev, buf + pos, rowsize)) {
- if (!skip) {
- pr_info("*\n");
- skip = true;
- }
- continue;
- }
-
- WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos,
- rowsize, sizeof(u32),
- line, sizeof(line),
- false) >= sizeof(line));
- pr_info("[%04zx] %s\n", pos, line);
-
- prev = buf + pos;
- skip = false;
- }
-}
-
-static int live_lrc_layout(void *arg)
-{
- struct intel_gt *gt = arg;
- struct intel_engine_cs *engine;
- enum intel_engine_id id;
- u32 *lrc;
- int err;
-
- /*
- * Check the registers offsets we use to create the initial reg state
- * match the layout saved by HW.
- */
-
- lrc = kmalloc(PAGE_SIZE, GFP_KERNEL);
- if (!lrc)
- return -ENOMEM;
-
- err = 0;
- for_each_engine(engine, gt, id) {
- u32 *hw;
- int dw;
-
- if (!engine->default_state)
- continue;
-
- hw = i915_gem_object_pin_map(engine->default_state,
- I915_MAP_WB);
- if (IS_ERR(hw)) {
- err = PTR_ERR(hw);
- break;
- }
- hw += LRC_STATE_PN * PAGE_SIZE / sizeof(*hw);
-
- execlists_init_reg_state(memset(lrc, POISON_INUSE, PAGE_SIZE),
- engine->kernel_context,
- engine,
- engine->kernel_context->ring,
- true);
-
- dw = 0;
- do {
- u32 lri = hw[dw];
-
- if (lri == 0) {
- dw++;
- continue;
- }
-
- if (lrc[dw] == 0) {
- pr_debug("%s: skipped instruction %x at dword %d\n",
- engine->name, lri, dw);
- dw++;
- continue;
- }
-
- if ((lri & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
- pr_err("%s: Expected LRI command at dword %d, found %08x\n",
- engine->name, dw, lri);
- err = -EINVAL;
- break;
- }
-
- if (lrc[dw] != lri) {
- pr_err("%s: LRI command mismatch at dword %d, expected %08x found %08x\n",
- engine->name, dw, lri, lrc[dw]);
- err = -EINVAL;
- break;
- }
-
- lri &= 0x7f;
- lri++;
- dw++;
-
- while (lri) {
- if (hw[dw] != lrc[dw]) {
- pr_err("%s: Different registers found at dword %d, expected %x, found %x\n",
- engine->name, dw, hw[dw], lrc[dw]);
- err = -EINVAL;
- break;
- }
-
- /*
- * Skip over the actual register value as we
- * expect that to differ.
- */
- dw += 2;
- lri -= 2;
- }
- } while ((lrc[dw] & ~BIT(0)) != MI_BATCH_BUFFER_END);
-
- if (err) {
- pr_info("%s: HW register image:\n", engine->name);
- hexdump(hw, PAGE_SIZE);
-
- pr_info("%s: SW register image:\n", engine->name);
- hexdump(lrc, PAGE_SIZE);
- }
-
- i915_gem_object_unpin_map(engine->default_state);
- if (err)
- break;
- }
-
- kfree(lrc);
- return err;
-}
-
-static int find_offset(const u32 *lri, u32 offset)
-{
- int i;
-
- for (i = 0; i < PAGE_SIZE / sizeof(u32); i++)
- if (lri[i] == offset)
- return i;
-
- return -1;
-}
-
-static int live_lrc_fixed(void *arg)
-{
- struct intel_gt *gt = arg;
- struct intel_engine_cs *engine;
- enum intel_engine_id id;
- int err = 0;
-
- /*
- * Check the assumed register offsets match the actual locations in
- * the context image.
- */
-
- for_each_engine(engine, gt, id) {
- const struct {
- u32 reg;
- u32 offset;
- const char *name;
- } tbl[] = {
- {
- i915_mmio_reg_offset(RING_START(engine->mmio_base)),
- CTX_RING_START - 1,
- "RING_START"
- },
- {
- i915_mmio_reg_offset(RING_CTL(engine->mmio_base)),
- CTX_RING_CTL - 1,
- "RING_CTL"
- },
- {
- i915_mmio_reg_offset(RING_HEAD(engine->mmio_base)),
- CTX_RING_HEAD - 1,
- "RING_HEAD"
- },
- {
- i915_mmio_reg_offset(RING_TAIL(engine->mmio_base)),
- CTX_RING_TAIL - 1,
- "RING_TAIL"
- },
- {
- i915_mmio_reg_offset(RING_MI_MODE(engine->mmio_base)),
- lrc_ring_mi_mode(engine),
- "RING_MI_MODE"
- },
- {
- i915_mmio_reg_offset(RING_BBSTATE(engine->mmio_base)),
- CTX_BB_STATE - 1,
- "BB_STATE"
- },
- { },
- }, *t;
- u32 *hw;
-
- if (!engine->default_state)
- continue;
-
- hw = i915_gem_object_pin_map(engine->default_state,
- I915_MAP_WB);
- if (IS_ERR(hw)) {
- err = PTR_ERR(hw);
- break;
- }
- hw += LRC_STATE_PN * PAGE_SIZE / sizeof(*hw);
-
- for (t = tbl; t->name; t++) {
- int dw = find_offset(hw, t->reg);
-
- if (dw != t->offset) {
- pr_err("%s: Offset for %s [0x%x] mismatch, found %x, expected %x\n",
- engine->name,
- t->name,
- t->reg,
- dw,
- t->offset);
- err = -EINVAL;
- }
- }
-
- i915_gem_object_unpin_map(engine->default_state);
- }
-
- return err;
-}
-
-static int __live_lrc_state(struct intel_engine_cs *engine,
- struct i915_vma *scratch)
-{
- struct intel_context *ce;
- struct i915_request *rq;
- enum {
- RING_START_IDX = 0,
- RING_TAIL_IDX,
- MAX_IDX
- };
- u32 expected[MAX_IDX];
- u32 *cs;
- int err;
- int n;
-
- ce = intel_context_create(engine);
- if (IS_ERR(ce))
- return PTR_ERR(ce);
-
- err = intel_context_pin(ce);
- if (err)
- goto err_put;
-
- rq = i915_request_create(ce);
- if (IS_ERR(rq)) {
- err = PTR_ERR(rq);
- goto err_unpin;
- }
-
- cs = intel_ring_begin(rq, 4 * MAX_IDX);
- if (IS_ERR(cs)) {
- err = PTR_ERR(cs);
- i915_request_add(rq);
- goto err_unpin;
- }
-
- *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
- *cs++ = i915_mmio_reg_offset(RING_START(engine->mmio_base));
- *cs++ = i915_ggtt_offset(scratch) + RING_START_IDX * sizeof(u32);
- *cs++ = 0;
-
- expected[RING_START_IDX] = i915_ggtt_offset(ce->ring->vma);
-
- *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
- *cs++ = i915_mmio_reg_offset(RING_TAIL(engine->mmio_base));
- *cs++ = i915_ggtt_offset(scratch) + RING_TAIL_IDX * sizeof(u32);
- *cs++ = 0;
-
- i915_request_get(rq);
- i915_request_add(rq);
-
- intel_engine_flush_submission(engine);
- expected[RING_TAIL_IDX] = ce->ring->tail;
-
- if (i915_request_wait(rq, 0, HZ / 5) < 0) {
- err = -ETIME;
- goto err_rq;
- }
-
- cs = i915_gem_object_pin_map(scratch->obj, I915_MAP_WB);
- if (IS_ERR(cs)) {
- err = PTR_ERR(cs);
- goto err_rq;
- }
-
- for (n = 0; n < MAX_IDX; n++) {
- if (cs[n] != expected[n]) {
- pr_err("%s: Stored register[%d] value[0x%x] did not match expected[0x%x]\n",
- engine->name, n, cs[n], expected[n]);
- err = -EINVAL;
- break;
- }
- }
-
- i915_gem_object_unpin_map(scratch->obj);
-
-err_rq:
- i915_request_put(rq);
-err_unpin:
- intel_context_unpin(ce);
-err_put:
- intel_context_put(ce);
- return err;
-}
-
-static int live_lrc_state(void *arg)
-{
- struct intel_gt *gt = arg;
- struct intel_engine_cs *engine;
- struct i915_vma *scratch;
- enum intel_engine_id id;
- int err = 0;
-
- /*
- * Check the live register state matches what we expect for this
- * intel_context.
- */
-
- scratch = create_scratch(gt);
- if (IS_ERR(scratch))
- return PTR_ERR(scratch);
-
- for_each_engine(engine, gt, id) {
- err = __live_lrc_state(engine, scratch);
- if (err)
- break;
- }
-
- if (igt_flush_test(gt->i915))
- err = -EIO;
-
- i915_vma_unpin_and_release(&scratch, 0);
- return err;
-}
-
-static int gpr_make_dirty(struct intel_engine_cs *engine)
-{
- struct i915_request *rq;
- u32 *cs;
- int n;
-
- rq = intel_engine_create_kernel_request(engine);
- if (IS_ERR(rq))
- return PTR_ERR(rq);
-
- cs = intel_ring_begin(rq, 2 * NUM_GPR_DW + 2);
- if (IS_ERR(cs)) {
- i915_request_add(rq);
- return PTR_ERR(cs);
- }
-
- *cs++ = MI_LOAD_REGISTER_IMM(NUM_GPR_DW);
- for (n = 0; n < NUM_GPR_DW; n++) {
- *cs++ = CS_GPR(engine, n);
- *cs++ = STACK_MAGIC;
- }
- *cs++ = MI_NOOP;
-
- intel_ring_advance(rq, cs);
- i915_request_add(rq);
-
- return 0;
-}
-
-static int __live_gpr_clear(struct intel_engine_cs *engine,
- struct i915_vma *scratch)
-{
- struct intel_context *ce;
- struct i915_request *rq;
- u32 *cs;
- int err;
- int n;
-
- if (INTEL_GEN(engine->i915) < 9 && engine->class != RENDER_CLASS)
- return 0; /* GPR only on rcs0 for gen8 */
-
- err = gpr_make_dirty(engine);
- if (err)
- return err;
-
- ce = intel_context_create(engine);
- if (IS_ERR(ce))
- return PTR_ERR(ce);
-
- rq = intel_context_create_request(ce);
- if (IS_ERR(rq)) {
- err = PTR_ERR(rq);
- goto err_put;
- }
-
- cs = intel_ring_begin(rq, 4 * NUM_GPR_DW);
- if (IS_ERR(cs)) {
- err = PTR_ERR(cs);
- i915_request_add(rq);
- goto err_put;
- }
-
- for (n = 0; n < NUM_GPR_DW; n++) {
- *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
- *cs++ = CS_GPR(engine, n);
- *cs++ = i915_ggtt_offset(scratch) + n * sizeof(u32);
- *cs++ = 0;
- }
-
- i915_request_get(rq);
- i915_request_add(rq);
-
- if (i915_request_wait(rq, 0, HZ / 5) < 0) {
- err = -ETIME;
- goto err_rq;
- }
-
- cs = i915_gem_object_pin_map(scratch->obj, I915_MAP_WB);
- if (IS_ERR(cs)) {
- err = PTR_ERR(cs);
- goto err_rq;
- }
-
- for (n = 0; n < NUM_GPR_DW; n++) {
- if (cs[n]) {
- pr_err("%s: GPR[%d].%s was not zero, found 0x%08x!\n",
- engine->name,
- n / 2, n & 1 ? "udw" : "ldw",
- cs[n]);
- err = -EINVAL;
- break;
- }
- }
-
- i915_gem_object_unpin_map(scratch->obj);
-
-err_rq:
- i915_request_put(rq);
-err_put:
- intel_context_put(ce);
- return err;
-}
-
-static int live_gpr_clear(void *arg)
-{
- struct intel_gt *gt = arg;
- struct intel_engine_cs *engine;
- struct i915_vma *scratch;
- enum intel_engine_id id;
- int err = 0;
-
- /*
- * Check that GPR registers are cleared in new contexts as we need
- * to avoid leaking any information from previous contexts.
- */
-
- scratch = create_scratch(gt);
- if (IS_ERR(scratch))
- return PTR_ERR(scratch);
-
- for_each_engine(engine, gt, id) {
- err = __live_gpr_clear(engine, scratch);
- if (err)
- break;
- }
-
- if (igt_flush_test(gt->i915))
- err = -EIO;
-
- i915_vma_unpin_and_release(&scratch, 0);
- return err;
-}
-
-int intel_lrc_live_selftests(struct drm_i915_private *i915)
-{
- static const struct i915_subtest tests[] = {
- SUBTEST(live_lrc_layout),
- SUBTEST(live_lrc_fixed),
- SUBTEST(live_lrc_state),
- SUBTEST(live_gpr_clear),
- };
-
- if (!HAS_LOGICAL_RING_CONTEXTS(i915))
- return 0;
-
- return intel_gt_live_subtests(tests, &i915->gt);
-}
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
new file mode 100644
index 000000000000..d367f286db16
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -0,0 +1,544 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2018 Intel Corporation
+ */
+
+#include <linux/prime_numbers.h>
+
+#include "gt/intel_engine_heartbeat.h"
+#include "gt/intel_engine_pm.h"
+#include "gt/intel_reset.h"
+#include "gt/intel_ring.h"
+
+#include "i915_selftest.h"
+#include "selftests/i915_random.h"
+#include "selftests/igt_flush_test.h"
+#include "selftests/igt_live_test.h"
+#include "selftests/igt_spinner.h"
+#include "selftests/lib_sw_fence.h"
+
+#include "gem/selftests/igt_gem_utils.h"
+#include "gem/selftests/mock_context.h"
+
+#define CS_GPR(engine, n) ((engine)->mmio_base + 0x600 + (n) * 4)
+#define NUM_GPR_DW (16 * 2) /* each GPR is 2 dwords */
+
+static struct i915_vma *create_scratch(struct intel_gt *gt)
+{
+ struct drm_i915_gem_object *obj;
+ struct i915_vma *vma;
+ int err;
+
+ obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
+ if (IS_ERR(obj))
+ return ERR_CAST(obj);
+
+ i915_gem_object_set_cache_coherency(obj, I915_CACHING_CACHED);
+
+ vma = i915_vma_instance(obj, >->ggtt->vm, NULL);
+ if (IS_ERR(vma)) {
+ i915_gem_object_put(obj);
+ return vma;
+ }
+
+ err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
+ if (err) {
+ i915_gem_object_put(obj);
+ return ERR_PTR(err);
+ }
+
+ return vma;
+}
+
+static void hexdump(const void *buf, size_t len)
+{
+ const size_t rowsize = 8 * sizeof(u32);
+ const void *prev = NULL;
+ bool skip = false;
+ size_t pos;
+
+ for (pos = 0; pos < len; pos += rowsize) {
+ char line[128];
+
+ if (prev && !memcmp(prev, buf + pos, rowsize)) {
+ if (!skip) {
+ pr_info("*\n");
+ skip = true;
+ }
+ continue;
+ }
+
+ WARN_ON_ONCE(hex_dump_to_buffer(buf + pos, len - pos,
+ rowsize, sizeof(u32),
+ line, sizeof(line),
+ false) >= sizeof(line));
+ pr_info("[%04zx] %s\n", pos, line);
+
+ prev = buf + pos;
+ skip = false;
+ }
+}
+
+static int live_lrc_layout(void *arg)
+{
+ struct intel_gt *gt = arg;
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+ u32 *lrc;
+ int err;
+
+ /*
+	 * Check the register offsets we use to create the initial reg state
+ * match the layout saved by HW.
+ */
+
+ lrc = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!lrc)
+ return -ENOMEM;
+
+ err = 0;
+ for_each_engine(engine, gt, id) {
+ u32 *hw;
+ int dw;
+
+ if (!engine->default_state)
+ continue;
+
+ hw = i915_gem_object_pin_map(engine->default_state,
+ I915_MAP_WB);
+ if (IS_ERR(hw)) {
+ err = PTR_ERR(hw);
+ break;
+ }
+ hw += LRC_STATE_PN * PAGE_SIZE / sizeof(*hw);
+
+ intel_lrc_init_reg_state(memset(lrc, POISON_INUSE, PAGE_SIZE),
+ engine->kernel_context,
+ engine,
+ engine->kernel_context->ring,
+ true);
+
+ dw = 0;
+ do {
+ u32 lri = hw[dw];
+
+ if (lri == 0) {
+ dw++;
+ continue;
+ }
+
+ if (lrc[dw] == 0) {
+ pr_debug("%s: skipped instruction %x at dword %d\n",
+ engine->name, lri, dw);
+ dw++;
+ continue;
+ }
+
+ if ((lri & GENMASK(31, 23)) != MI_INSTR(0x22, 0)) {
+ pr_err("%s: Expected LRI command at dword %d, found %08x\n",
+ engine->name, dw, lri);
+ err = -EINVAL;
+ break;
+ }
+
+ if (lrc[dw] != lri) {
+ pr_err("%s: LRI command mismatch at dword %d, expected %08x found %08x\n",
+ engine->name, dw, lri, lrc[dw]);
+ err = -EINVAL;
+ break;
+ }
+
+ lri &= 0x7f;
+ lri++;
+ dw++;
+
+ while (lri) {
+ if (hw[dw] != lrc[dw]) {
+ pr_err("%s: Different registers found at dword %d, expected %x, found %x\n",
+ engine->name, dw, hw[dw], lrc[dw]);
+ err = -EINVAL;
+ break;
+ }
+
+ /*
+ * Skip over the actual register value as we
+ * expect that to differ.
+ */
+ dw += 2;
+ lri -= 2;
+ }
+ } while ((lrc[dw] & ~BIT(0)) != MI_BATCH_BUFFER_END);
+
+ if (err) {
+ pr_info("%s: HW register image:\n", engine->name);
+ hexdump(hw, PAGE_SIZE);
+
+ pr_info("%s: SW register image:\n", engine->name);
+ hexdump(lrc, PAGE_SIZE);
+ }
+
+ i915_gem_object_unpin_map(engine->default_state);
+ if (err)
+ break;
+ }
+
+ kfree(lrc);
+ return err;
+}
+
+static int find_offset(const u32 *lri, u32 offset)
+{
+ int i;
+
+ for (i = 0; i < PAGE_SIZE / sizeof(u32); i++)
+ if (lri[i] == offset)
+ return i;
+
+ return -1;
+}
+
+static int live_lrc_fixed(void *arg)
+{
+ struct intel_gt *gt = arg;
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+ int err = 0;
+
+ /*
+ * Check the assumed register offsets match the actual locations in
+ * the context image.
+ */
+
+ for_each_engine(engine, gt, id) {
+ const struct {
+ u32 reg;
+ u32 offset;
+ const char *name;
+ } tbl[] = {
+ {
+ i915_mmio_reg_offset(RING_START(engine->mmio_base)),
+ CTX_RING_START - 1,
+ "RING_START"
+ },
+ {
+ i915_mmio_reg_offset(RING_CTL(engine->mmio_base)),
+ CTX_RING_CTL - 1,
+ "RING_CTL"
+ },
+ {
+ i915_mmio_reg_offset(RING_HEAD(engine->mmio_base)),
+ CTX_RING_HEAD - 1,
+ "RING_HEAD"
+ },
+ {
+ i915_mmio_reg_offset(RING_TAIL(engine->mmio_base)),
+ CTX_RING_TAIL - 1,
+ "RING_TAIL"
+ },
+ {
+ i915_mmio_reg_offset(RING_MI_MODE(engine->mmio_base)),
+ intel_lrc_ring_mi_mode(engine),
+ "RING_MI_MODE"
+ },
+ {
+ i915_mmio_reg_offset(RING_BBSTATE(engine->mmio_base)),
+ CTX_BB_STATE - 1,
+ "BB_STATE"
+ },
+ { },
+ }, *t;
+ u32 *hw;
+
+ if (!engine->default_state)
+ continue;
+
+ hw = i915_gem_object_pin_map(engine->default_state,
+ I915_MAP_WB);
+ if (IS_ERR(hw)) {
+ err = PTR_ERR(hw);
+ break;
+ }
+ hw += LRC_STATE_PN * PAGE_SIZE / sizeof(*hw);
+
+ for (t = tbl; t->name; t++) {
+ int dw = find_offset(hw, t->reg);
+
+ if (dw != t->offset) {
+ pr_err("%s: Offset for %s [0x%x] mismatch, found %x, expected %x\n",
+ engine->name,
+ t->name,
+ t->reg,
+ dw,
+ t->offset);
+ err = -EINVAL;
+ }
+ }
+
+ i915_gem_object_unpin_map(engine->default_state);
+ }
+
+ return err;
+}
+
+static int __live_lrc_state(struct intel_engine_cs *engine,
+ struct i915_vma *scratch)
+{
+ struct intel_context *ce;
+ struct i915_request *rq;
+ enum {
+ RING_START_IDX = 0,
+ RING_TAIL_IDX,
+ MAX_IDX
+ };
+ u32 expected[MAX_IDX];
+ u32 *cs;
+ int err;
+ int n;
+
+ ce = intel_context_create(engine);
+ if (IS_ERR(ce))
+ return PTR_ERR(ce);
+
+ err = intel_context_pin(ce);
+ if (err)
+ goto err_put;
+
+ rq = i915_request_create(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err_unpin;
+ }
+
+ cs = intel_ring_begin(rq, 4 * MAX_IDX);
+ if (IS_ERR(cs)) {
+ err = PTR_ERR(cs);
+ i915_request_add(rq);
+ goto err_unpin;
+ }
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = i915_mmio_reg_offset(RING_START(engine->mmio_base));
+ *cs++ = i915_ggtt_offset(scratch) + RING_START_IDX * sizeof(u32);
+ *cs++ = 0;
+
+ expected[RING_START_IDX] = i915_ggtt_offset(ce->ring->vma);
+
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = i915_mmio_reg_offset(RING_TAIL(engine->mmio_base));
+ *cs++ = i915_ggtt_offset(scratch) + RING_TAIL_IDX * sizeof(u32);
+ *cs++ = 0;
+
+ i915_request_get(rq);
+ i915_request_add(rq);
+
+ intel_engine_flush_submission(engine);
+ expected[RING_TAIL_IDX] = ce->ring->tail;
+
+ if (i915_request_wait(rq, 0, HZ / 5) < 0) {
+ err = -ETIME;
+ goto err_rq;
+ }
+
+ cs = i915_gem_object_pin_map(scratch->obj, I915_MAP_WB);
+ if (IS_ERR(cs)) {
+ err = PTR_ERR(cs);
+ goto err_rq;
+ }
+
+ for (n = 0; n < MAX_IDX; n++) {
+ if (cs[n] != expected[n]) {
+ pr_err("%s: Stored register[%d] value[0x%x] did not match expected[0x%x]\n",
+ engine->name, n, cs[n], expected[n]);
+ err = -EINVAL;
+ break;
+ }
+ }
+
+ i915_gem_object_unpin_map(scratch->obj);
+
+err_rq:
+ i915_request_put(rq);
+err_unpin:
+ intel_context_unpin(ce);
+err_put:
+ intel_context_put(ce);
+ return err;
+}
+
+static int live_lrc_state(void *arg)
+{
+ struct intel_gt *gt = arg;
+ struct intel_engine_cs *engine;
+ struct i915_vma *scratch;
+ enum intel_engine_id id;
+ int err = 0;
+
+ /*
+ * Check the live register state matches what we expect for this
+ * intel_context.
+ */
+
+ scratch = create_scratch(gt);
+ if (IS_ERR(scratch))
+ return PTR_ERR(scratch);
+
+ for_each_engine(engine, gt, id) {
+ err = __live_lrc_state(engine, scratch);
+ if (err)
+ break;
+ }
+
+ if (igt_flush_test(gt->i915))
+ err = -EIO;
+
+ i915_vma_unpin_and_release(&scratch, 0);
+ return err;
+}
+
+static int gpr_make_dirty(struct intel_engine_cs *engine)
+{
+ struct i915_request *rq;
+ u32 *cs;
+ int n;
+
+ rq = intel_engine_create_kernel_request(engine);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ cs = intel_ring_begin(rq, 2 * NUM_GPR_DW + 2);
+ if (IS_ERR(cs)) {
+ i915_request_add(rq);
+ return PTR_ERR(cs);
+ }
+
+ *cs++ = MI_LOAD_REGISTER_IMM(NUM_GPR_DW);
+ for (n = 0; n < NUM_GPR_DW; n++) {
+ *cs++ = CS_GPR(engine, n);
+ *cs++ = STACK_MAGIC;
+ }
+ *cs++ = MI_NOOP;
+
+ intel_ring_advance(rq, cs);
+ i915_request_add(rq);
+
+ return 0;
+}
+
+static int __live_gpr_clear(struct intel_engine_cs *engine,
+ struct i915_vma *scratch)
+{
+ struct intel_context *ce;
+ struct i915_request *rq;
+ u32 *cs;
+ int err;
+ int n;
+
+ if (INTEL_GEN(engine->i915) < 9 && engine->class != RENDER_CLASS)
+ return 0; /* GPR only on rcs0 for gen8 */
+
+ err = gpr_make_dirty(engine);
+ if (err)
+ return err;
+
+ ce = intel_context_create(engine);
+ if (IS_ERR(ce))
+ return PTR_ERR(ce);
+
+ rq = intel_context_create_request(ce);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto err_put;
+ }
+
+ cs = intel_ring_begin(rq, 4 * NUM_GPR_DW);
+ if (IS_ERR(cs)) {
+ err = PTR_ERR(cs);
+ i915_request_add(rq);
+ goto err_put;
+ }
+
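+ /* Store every GPR of the new context; each should read back as zero. */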
+ for (n = 0; n < NUM_GPR_DW; n++) {
+ *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+ *cs++ = CS_GPR(engine, n);
+ *cs++ = i915_ggtt_offset(scratch) + n * sizeof(u32);
+ *cs++ = 0;
+ }
+
+ i915_request_get(rq);
+ i915_request_add(rq);
+
+ if (i915_request_wait(rq, 0, HZ / 5) < 0) {
+ err = -ETIME;
+ goto err_rq;
+ }
+
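+ /* Map the scratch page on the CPU and verify all stored GPRs are zero. */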
+ cs = i915_gem_object_pin_map(scratch->obj, I915_MAP_WB);
+ if (IS_ERR(cs)) {
+ err = PTR_ERR(cs);
+ goto err_rq;
+ }
+
+ for (n = 0; n < NUM_GPR_DW; n++) {
+ if (cs[n]) {
+ pr_err("%s: GPR[%d].%s was not zero, found 0x%08x!\n",
+ engine->name,
+ n / 2, n & 1 ? "udw" : "ldw",
+ cs[n]);
+ err = -EINVAL;
+ break;
+ }
+ }
+
+ i915_gem_object_unpin_map(scratch->obj);
+
+err_rq:
+ i915_request_put(rq);
+err_put:
+ intel_context_put(ce);
+ return err;
+}
+
+static int live_gpr_clear(void *arg)
+{
+ struct intel_gt *gt = arg;
+ struct intel_engine_cs *engine;
+ struct i915_vma *scratch;
+ enum intel_engine_id id;
+ int err = 0;
+
+ /*
+ * Check that the GPRs are cleared in new contexts, as we need to
+ * avoid leaking any information from previous contexts.
+ */
+
+ scratch = create_scratch(gt);
+ if (IS_ERR(scratch))
+ return PTR_ERR(scratch);
+
+ for_each_engine(engine, gt, id) {
+ err = __live_gpr_clear(engine, scratch);
+ if (err)
+ break;
+ }
+
+ if (igt_flush_test(gt->i915))
+ err = -EIO;
+
+ i915_vma_unpin_and_release(&scratch, 0);
+ return err;
+}
+
+int intel_lrc_live_selftests(struct drm_i915_private *i915)
+{
+ static const struct i915_subtest tests[] = {
+ SUBTEST(live_lrc_layout),
+ SUBTEST(live_lrc_fixed),
+ SUBTEST(live_lrc_state),
+ SUBTEST(live_gpr_clear),
+ };
+
+ if (!HAS_LOGICAL_RING_CONTEXTS(i915))
+ return 0;
+
+ return intel_gt_live_subtests(tests, &i915->gt);
+}
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
index 101728006ae9..809cd10dc731 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_ads.c
@@ -4,6 +4,7 @@
*/
#include "gt/intel_gt.h"
+#include "gt/intel_lrc.h"
#include "intel_guc_ads.h"
#include "intel_uc.h"
#include "i915_drv.h"
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 9e42324fdecd..b7b4fbc5734e 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -10,6 +10,7 @@
#include "gt/intel_engine_pm.h"
#include "gt/intel_gt.h"
#include "gt/intel_gt_pm.h"
+#include "gt/intel_lrc.h"
#include "gt/intel_lrc_reg.h"
#include "gt/intel_ring.h"
@@ -401,6 +402,39 @@ cancel_port_requests(struct intel_engine_execlists * const execlists)
memset(execlists->inflight, 0, sizeof(execlists->inflight));
}
+static void context_reset(struct intel_engine_cs *engine,
+ struct intel_context *ce,
+ u32 head,
+ bool scrub)
+{
+ GEM_BUG_ON(!intel_context_is_pinned(ce));
+
+ /*
+ * We want a simple context + ring to execute the breadcrumb update.
+ * We cannot rely on the context being intact across the GPU hang,
+ * so clear it and rebuild just what we need for the breadcrumb.
+ * All pending requests for this context will be zapped, and any
+ * future request will be after userspace has had the opportunity
+ * to recreate its own state.
+ */
+ if (scrub) {
+ u32 *regs = ce->lrc_reg_state;
+
+ if (engine->pinned_default_state)
+ memcpy(regs, /* skip restoring the vanilla PPHWSP */
+ engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
+ engine->context_size - PAGE_SIZE);
+
+ intel_lrc_init_reg_state(regs, ce, engine, ce->ring, false);
+ }
+
+ /* Rerun the request; its payload has been neutered (if guilty). */
+ ce->ring->head = head;
+ intel_ring_update_space(ce->ring);
+
+ intel_lrc_update_reg_state(ce, engine);
+}
+
static void guc_reset_rewind(struct intel_engine_cs *engine, bool stalled)
{
struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -420,7 +454,7 @@ static void guc_reset_rewind(struct intel_engine_cs *engine, bool stalled)
stalled = false;
__i915_request_reset(rq, stalled);
- intel_lr_context_reset(engine, rq->context, rq->head, stalled);
+ context_reset(engine, rq->context, rq->head, stalled);
out_unlock:
spin_unlock_irqrestore(&engine->active.lock, flags);
diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.c b/drivers/gpu/drm/i915/gvt/mmio_context.c
index aaf15916d29a..334dcfa2d32f 100644
--- a/drivers/gpu/drm/i915/gvt/mmio_context.c
+++ b/drivers/gpu/drm/i915/gvt/mmio_context.c
@@ -35,6 +35,7 @@
#include "i915_drv.h"
#include "gt/intel_context.h"
+#include "gt/intel_lrc.h"
#include "gt/intel_ring.h"
#include "gvt.h"
#include "trace.h"
diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c b/drivers/gpu/drm/i915/gvt/scheduler.c
index 685d1e04a5ff..378a39aefb60 100644
--- a/drivers/gpu/drm/i915/gvt/scheduler.c
+++ b/drivers/gpu/drm/i915/gvt/scheduler.c
@@ -37,6 +37,7 @@
#include "gem/i915_gem_pm.h"
#include "gt/intel_context.h"
+#include "gt/intel_lrc.h"
#include "gt/intel_ring.h"
#include "i915_drv.h"
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a8a08c63278e..7693d37a6d0e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -77,7 +77,6 @@
#include "gem/i915_gem_shrinker.h"
#include "gem/i915_gem_stolen.h"
-#include "gt/intel_lrc.h"
#include "gt/intel_engine.h"
#include "gt/intel_gt_types.h"
#include "gt/intel_workarounds.h"
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 535a12520dba..e8cf9f2b22cc 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -199,6 +199,7 @@
#include "gt/intel_engine_pm.h"
#include "gt/intel_engine_user.h"
#include "gt/intel_gt.h"
+#include "gt/intel_lrc.h"
#include "gt/intel_lrc_reg.h"
#include "gt/intel_ring.h"