*cs++ = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(base, 0));
*cs++ = upper_32_bits(pd_daddr);
@@ -1014,7 +1014,8 @@ static int emit_ppgtt_update(struct i915_request *rq, void *data)
if (IS_ERR(cs))
return PTR_ERR(cs);
- *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) | MI_LRI_FORCE_POSTED;
+ *cs++ = MI_LOAD_REGISTER_IMM(2 * GEN8_3LVL_PDPES) |
+ MI_LRI_FORCE_POSTED | engine->lri_cmd_mode;
for (i = GEN8_3LVL_PDPES; i--; ) {
const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index f451d5076bde..5aa58e069806 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -236,6 +236,28 @@ static u32 __engine_mmio_base(struct drm_i915_private *i915,
return bases[i].base;
}
+static bool i915_engine_has_relative_lri(const struct intel_engine_cs *engine)
+{
+	/*
+	 * True only for Gen11+ engines that are not in the copy engine class;
+	 * every other engine reports false.
+	 */
+	const bool gen11_plus = INTEL_GEN(engine->i915) >= 11;
+
+	return gen11_plus && engine->class != COPY_ENGINE_CLASS;
+}
+
+static void lri_init(struct intel_engine_cs *engine)
+{
+	const bool relative = i915_engine_has_relative_lri(engine);
+
+	/*
+	 * Relative LRI: set the header flag and use a zero register base.
+	 * Otherwise: no extra header bits, base is the engine's mmio_base.
+	 */
+	engine->lri_cmd_mode = relative ? MI_LRI_ADD_CS_MMIO_START_GEN11 : 0;
+	engine->lri_mmio_base = relative ? 0 : engine->mmio_base;
+}
+
static void __sprint_engine_name(struct intel_engine_cs *engine)
{
/*
@@ -320,6 +342,8 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id)
/* Nothing to do here, execute in order of dependencies */
engine->schedule = NULL;
+ lri_init(engine);
+
seqlock_init(&engine->stats.lock);
ATOMIC_INIT_NOTIFIER_HEAD(&engine->context_status_notifier);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 943f0663837e..38f486f7f7e3 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -306,6 +306,9 @@ struct intel_engine_cs {
u32 context_size;
u32 mmio_base;
+ u32 lri_mmio_base;
+ u32 lri_cmd_mode;
+
u32 uabi_capabilities;
struct rb_node uabi_node;
diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index f78b13d74e17..bfb51d8d7ce2 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -133,11 +133,16 @@
* simply ignores the register load under certain conditions.
* - One can actually load arbitrary many arbitrary registers: Simply issue x
* address/value pairs. Don't overdue it, though, x <= 2^4 must hold!
+ * - Newer hardware supports engine relative addressing but older hardware does
+ *   not. This is required for hw engine load balancing. Code emitting an LRI
+ *   should therefore OR engine->lri_cmd_mode into the MI_LOAD_REGISTER_IMM
+ *   header and compute engine register offsets from engine->lri_mmio_base
+ *   rather than from engine->mmio_base.
*/
#define MI_LOAD_REGISTER_IMM(x) MI_INSTR(0x22, 2*(x)-1)
/* Gen11+. addr = base + (ctx_restore ? offset & GENMASK(12,2) : offset) */
-#define MI_LRI_CS_MMIO (1<<19)
+#define MI_LRI_ADD_CS_MMIO_START_GEN11 BIT(19)
#define MI_LRI_FORCE_POSTED (1<<12)
#define MI_STORE_REGISTER_MEM MI_INSTR(0x24, 1)
#define MI_STORE_REGISTER_MEM_GEN8 MI_INSTR(0x24, 2)
#define MI_SRM_LRM_GLOBAL_GTT (1<<22)
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 6cfdc0f9f2b9..7cd59d29c4e4 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2010,11 +2010,12 @@ struct lri {
u32 value;
};
-static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
+static u32 *emit_lri(struct intel_engine_cs *engine, u32 *batch,
+ const struct lri *lri, unsigned int count)
{
GEM_BUG_ON(!count || count > 63);
- *batch++ = MI_LOAD_REGISTER_IMM(count);
+ *batch++ = MI_LOAD_REGISTER_IMM(count) | engine->lri_cmd_mode;
do {
*batch++ = i915_mmio_reg_offset(lri->reg);
*batch++ = lri->value;
@@ -2054,7 +2055,7 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
batch = gen8_emit_flush_coherentl3_wa(engine, batch);
- batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
+ batch = emit_lri(engine, batch, lri, ARRAY_SIZE(lri));
/* WaMediaPoolStateCmdInWABB:bxt,glk */
if (HAS_POOLED_EU(engine->i915)) {
@@ -3282,7 +3283,7 @@ static void init_common_reg_state(u32 * const regs,
struct intel_engine_cs *engine,
struct intel_ring *ring)
{
- const u32 base = engine->mmio_base;
+ const u32 base = engine->lri_mmio_base;
CTX_REG(regs, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(base),
_MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT) |
@@ -3307,7 +3308,7 @@ static void init_wa_bb_reg_state(u32 * const regs,
u32 pos_bb_per_ctx)
{
struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
- const u32 base = engine->mmio_base;
+ const u32 base = engine->lri_mmio_base;
const u32 pos_indirect_ctx = pos_bb_per_ctx + 2;
const u32 pos_indirect_ctx_offset = pos_indirect_ctx + 2;
@@ -3335,6 +3336,7 @@ static void init_wa_bb_reg_state(u32 * const regs,
}
static void init_ppgtt_reg_state(u32 *regs, u32 base,
+ struct intel_engine_cs *engine,
struct i915_ppgtt *ppgtt)
{
/* PDP values well be assigned later if needed */
@@ -3376,14 +3378,12 @@ static void gen8_init_reg_state(u32 * const regs,
{
struct i915_ppgtt * const ppgtt = vm_alias(ce->vm);
const bool rcs = engine->class == RENDER_CLASS;
- const u32 base = engine->mmio_base;
- const u32 lri_base =
- intel_engine_has_relative_mmio(engine) ? MI_LRI_CS_MMIO : 0;
+ const u32 base = engine->lri_mmio_base;
regs[CTX_LRI_HEADER_0] =
MI_LOAD_REGISTER_IMM(rcs ? 14 : 11) |
MI_LRI_FORCE_POSTED |
- lri_base;
+ engine->lri_cmd_mode;
init_common_reg_state(regs, ppgtt, engine, ring);
CTX_REG(regs, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(base), 0);
@@ -3395,15 +3395,17 @@ static void gen8_init_reg_state(u32 * const regs,
regs[CTX_LRI_HEADER_1] =
MI_LOAD_REGISTER_IMM(9) |
MI_LRI_FORCE_POSTED |
- lri_base;
+ engine->lri_cmd_mode;
CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
- init_ppgtt_reg_state(regs, base, ppgtt);
+ init_ppgtt_reg_state(regs, base, engine, ppgtt);
if (rcs) {
- regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1) | lri_base;
- CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
+ regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1) |
+ engine->lri_cmd_mode;
+ CTX_REG(regs, CTX_R_PWR_CLK_STATE,
+ GEN8_R_PWR_CLK_STATE, 0);
}
regs[CTX_END] = MI_BATCH_BUFFER_END;
@@ -3418,14 +3420,12 @@ static void gen12_init_reg_state(u32 * const regs,
{
struct i915_ppgtt * const ppgtt = i915_vm_to_ppgtt(ce->vm);
const bool rcs = engine->class == RENDER_CLASS;
- const u32 base = engine->mmio_base;
- const u32 lri_base =
- intel_engine_has_relative_mmio(engine) ? MI_LRI_CS_MMIO : 0;
+ const u32 base = engine->lri_mmio_base;
regs[CTX_LRI_HEADER_0] =
MI_LOAD_REGISTER_IMM(rcs ? 11 : 9) |
MI_LRI_FORCE_POSTED |
- lri_base;
+ engine->lri_cmd_mode;
init_common_reg_state(regs, ppgtt, engine, ring);
@@ -3435,15 +3435,15 @@ static void gen12_init_reg_state(u32 * const regs,
regs[CTX_LRI_HEADER_1] =
MI_LOAD_REGISTER_IMM(9) |
MI_LRI_FORCE_POSTED |
- lri_base;
+ engine->lri_cmd_mode;
CTX_REG(regs, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(base), 0);
- init_ppgtt_reg_state(regs, base, ppgtt);
+ init_ppgtt_reg_state(regs, base, engine, ppgtt);
if (rcs) {
regs[GEN12_CTX_LRI_HEADER_3] =
- MI_LOAD_REGISTER_IMM(1) | lri_base;
+ MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE, 0);
/* TODO: oa_init_reg_state ? */
diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c
index 728704bbbe18..8e8fe3deb95c 100644
--- a/drivers/gpu/drm/i915/gt/intel_mocs.c
+++ b/drivers/gpu/drm/i915/gt/intel_mocs.c
@@ -368,9 +368,6 @@ static u32 get_entry_control(const struct drm_i915_mocs_table *table,
/**
* intel_mocs_init_engine() - emit the mocs control table
* @engine: The engine for whom to emit the registers.
- *
- * This function simply emits a MI_LOAD_REGISTER_IMM command for the
- * given table starting at the given address.
*/
void intel_mocs_init_engine(struct intel_engine_cs *engine)
{
@@ -456,7 +453,8 @@ static int emit_mocs_control_table(struct i915_request *rq,
if (IS_ERR(cs))
return PTR_ERR(cs);
- *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries);
+ *cs++ = MI_LOAD_REGISTER_IMM(table->n_entries) |
+ rq->engine->lri_cmd_mode;
for (index = 0; index < table->size; index++) {
u32 value = get_entry_control(table, index);
diff --git a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
index 0747b8c9f768..7027367b1f1a 100644
--- a/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/gt/intel_ringbuffer.c
@@ -1536,12 +1536,13 @@ static int load_pd_dir(struct i915_request *rq, const struct i915_ppgtt *ppgtt)
if (IS_ERR(cs))
return PTR_ERR(cs);
- *cs++ = MI_LOAD_REGISTER_IMM(1);
- *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->mmio_base));
+ /* TODO: could these two LRIs be merged into one MI_LOAD_REGISTER_IMM(2)? */
+ *cs++ = MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
+ *cs++ = i915_mmio_reg_offset(RING_PP_DIR_DCLV(engine->lri_mmio_base));
*cs++ = PP_DIR_DCLV_2G;
- *cs++ = MI_LOAD_REGISTER_IMM(1);
- *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->mmio_base));
+ *cs++ = MI_LOAD_REGISTER_IMM(1) | engine->lri_cmd_mode;
+ *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine->lri_mmio_base));
*cs++ = px_base(ppgtt->pd)->ggtt_offset << 10;
intel_ring_advance(rq, cs);
@@ -1709,7 +1710,8 @@ static int remap_l3_slice(struct i915_request *rq, int slice)
* here because no other code should access these registers other than
* at initialization time.
*/
- *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE/4);
+ *cs++ = MI_LOAD_REGISTER_IMM(GEN7_L3LOG_SIZE / 4) |
+ rq->engine->lri_cmd_mode;
for (i = 0; i < GEN7_L3LOG_SIZE/4; i++) {
*cs++ = i915_mmio_reg_offset(GEN7_L3LOG(slice, i));
*cs++ = remap_info[i];
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index c1b764233761..8b85cdfada21 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1693,6 +1693,11 @@ gen8_update_reg_state_unlocked(struct i915_perf_stream *stream,
};
int i;
+ /*
+ * NB: The LRI instruction is generated by the hardware.
+ * Should we read it in and assert that the offset flag is set?
+ */
+
CTX_REG(reg_state, ctx_oactxctrl, GEN8_OACTXCONTROL,
(stream->period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
(stream->periodic ? GEN8_OA_TIMER_ENABLE : 0) |
@@ -1752,8 +1757,9 @@ gen8_load_flex(struct i915_request *rq,
if (IS_ERR(cs))
return PTR_ERR(cs);
- *cs++ = MI_LOAD_REGISTER_IMM(count);
+ *cs++ = MI_LOAD_REGISTER_IMM(count) | rq->engine->lri_cmd_mode;
do {
+ /* FIXME: flex->reg is emitted unadjusted; is that valid with relative LRI? */
*cs++ = i915_mmio_reg_offset(flex->reg);
*cs++ = flex->value;
} while (flex++, --count);