[RFC] drm/i915/tgl: Advanced preparser support for GPU relocs

Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx> · Thu, 22 Aug 2019 19:09:09 -0700

TGL has an improved CS pre-parser that can now pre-fetch commands across
batch boundaries. This improves performances when lots of small batches
are used, but has an impact on self-modifying code. If we want to modify
the content of a batch from another ring/batch, we need to either
guarantee that the memory location is updated before the pre-parser gets
to it or we need to turn the pre-parser off around the modification.
In i915, we use self-modifying code only for GPU relocations.

The pre-parser fetches across memory synchronization commands as well,
so the only way to guarantee that the writes land before the parser gets
to it is to have more instructions between the sync and the destination
than the parser FIFO depth, which is not an optimal solution.

The parser can be disabled either globally (from GFX_MODE) or at the
context level, using a new flag in the ARB_CHECK command. When
re-enabled, the parser turns on at the next arbitration point (ARB_CHECK
is not an arbitration point when the parser flag is set). The command is
not privileged, so the status can be changed by user batches as well.

To cope with this new HW feature, this patch turns off the parser when
GPU relocs are emitted and it conditionally turns back on after the
emission, before the user batch is started. The original status of the
parser, which is stored in RING_INSTPM, is used to decide whether to
re-enable the parser or not. This ensure that we don't turn the parser
back on if the userspace had decided to disable it.

Note that with this patch, the parser defaults to on for all contexts,
which might regress legacy userspace application that use self-modifying
code. However, if we turn it off by default, e.g. with an ARB_CHECK as
first cmd on the ring, we will regress performance, because even legacy
parsing capabilities are disabled in this scenario.

Signed-off-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@xxxxxxxxx>
Cc: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
Cc: Mika Kuoppala <mika.kuoppala@xxxxxxxxxxxxxxx>
Cc: Tvrtko Ursulin <tvrtko.ursulin@xxxxxxxxxxxxxxx>
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c    | 45 ++++++++++++--
 drivers/gpu/drm/i915/gt/intel_engine.h        |  9 +++
 drivers/gpu/drm/i915/gt/intel_engine_types.h  |  4 ++
 drivers/gpu/drm/i915/gt/intel_gpu_commands.h  | 17 +++++-
 drivers/gpu/drm/i915/gt/intel_lrc.c           | 60 +++++++++++++++++++
 drivers/gpu/drm/i915/gt/intel_timeline.c      | 13 +++-
 drivers/gpu/drm/i915/i915_reg.h               |  1 +
 7 files changed, 141 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index b5f6937369ea..45e84f28276c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -254,6 +254,7 @@ struct i915_execbuffer {
 
 		struct i915_request *rq;
 		u32 *rq_cmd;
+		unsigned int size;
 		unsigned int rq_size;
 	} reloc_cache;
 
@@ -904,6 +905,7 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->needs_unfenced = INTEL_INFO(i915)->unfenced_needs_alignment;
 	cache->node.allocated = false;
 	cache->rq = NULL;
+	cache->size = PAGE_SIZE;
 	cache->rq_size = 0;
 }
 
@@ -928,7 +930,8 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
 
 static void reloc_gpu_flush(struct reloc_cache *cache)
 {
-	GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+	GEM_BUG_ON(cache->rq_size >= cache->size / sizeof(u32));
+
 	cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
 
 	__i915_gem_object_flush_map(cache->rq->batch->obj, 0, cache->rq_size);
@@ -1142,10 +1145,11 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
 	struct intel_engine_pool_node *pool;
 	struct i915_request *rq;
 	struct i915_vma *batch;
+	u32 reserved_size = eb->engine->emit_preparser_enable_size_dw * sizeof(u32);
 	u32 *cmd;
 	int err;
 
-	pool = intel_engine_pool_get(&eb->engine->pool, PAGE_SIZE);
+	pool = intel_engine_pool_get(&eb->engine->pool, cache->size);
 	if (IS_ERR(pool))
 		return PTR_ERR(pool);
 
@@ -1158,6 +1162,9 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
 		goto out_pool;
 	}
 
+	/* we reserve a portion of the batch for the pre-parser re-enabling */
+	cache->size -= reserved_size;
+
 	batch = i915_vma_instance(pool->obj, vma->vm, NULL);
 	if (IS_ERR(batch)) {
 		err = PTR_ERR(batch);
@@ -1182,9 +1189,39 @@ static int __reloc_gpu_alloc(struct i915_execbuffer *eb,
 	if (err)
 		goto err_request;
 
+	if (eb->engine->emit_preparser_disable) {
+		err = eb->engine->emit_preparser_disable(rq,
+							 cmd + cache->size / sizeof(u32));
+		if (err)
+			goto skip_request;
+	}
+
 	err = eb->engine->emit_bb_start(rq,
-					batch->node.start, PAGE_SIZE,
+					batch->node.start, cache->size,
 					cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+
+	/*
+	 * Nothing we can do to fix this if we fail to re-enable, so print an
+	 * error and keep going. The context will still be functional, but
+	 * performance might be reduced.
+	 * We attemt this even if emit_bb_start failed to try and at least
+	 * restore the parser status.
+	 */
+	if (eb->engine->emit_preparser_disable) {
+		int ret;
+		eb->engine->emit_flush(rq, EMIT_FLUSH);
+
+		ret = eb->engine->emit_bb_start(rq,
+			batch->node.start + cache->size, reserved_size,
+			cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+		if (ret)
+			DRM_ERROR("Failed to re-enable pre-parser for "
+				  "ctx=%u on engine%u:%u\n",
+				  rq->gem_context->hw_id,
+				  eb->engine->uabi_class,
+				  eb->engine->uabi_instance);
+	}
+
 	if (err)
 		goto skip_request;
 
@@ -1226,7 +1263,7 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
 	struct reloc_cache *cache = &eb->reloc_cache;
 	u32 *cmd;
 
-	if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
+	if (cache->rq_size > cache->size/sizeof(u32) - (len + 1))
 		reloc_gpu_flush(cache);
 
 	if (unlikely(!cache->rq)) {
diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index d3c6993f4f46..1c9e38586af8 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -185,9 +185,18 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
 #define I915_GEM_HWS_PREEMPT_ADDR	(I915_GEM_HWS_PREEMPT * sizeof(u32))
 #define I915_GEM_HWS_SEQNO		0x40
 #define I915_GEM_HWS_SEQNO_ADDR		(I915_GEM_HWS_SEQNO * sizeof(u32))
+/*
+ * note: we treat the cacheline starting from I915_GEM_HWS_SEQNO the same as a
+ * pre-ctx HSWP. see layout below.
+ */
 #define I915_GEM_HWS_SCRATCH		0x80
 #define I915_GEM_HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))
 
+/* offset in per-context hwsp, 1 cacheline in size */
+#define I915_GEM_CTX_HWS_SEQNO		0x0
+#define I915_GEM_CTX_HWS_PREFETCH_MASK	0x14
+#define I915_GEM_CTX_HWS_PREFETCH_VAL	0x15
+
 #define I915_HWS_CSB_BUF0_INDEX		0x10
 #define I915_HWS_CSB_WRITE_INDEX	0x1f
 #define CNL_HWS_CSB_WRITE_INDEX		0x2f
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index a82cea95c2f2..bf1a2d7b3a4f 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -435,6 +435,10 @@ struct intel_engine_cs {
 						 u32 *cs);
 	unsigned int	emit_fini_breadcrumb_dw;
 
+	int		(*emit_preparser_disable)(struct i915_request *rq,
+						  u32 *batch);
+	unsigned int	emit_preparser_enable_size_dw;
+
 	/* Pass the request to the hardware queue (e.g. directly into
 	 * the legacy ringbuffer or to the end of an execlist).
 	 *
diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 86e00a2db8a4..d9e82a4de854 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -53,6 +53,17 @@
 #define   MI_ARB_ENABLE			(1<<0)
 #define   MI_ARB_DISABLE		(0<<0)
 #define MI_BATCH_BUFFER_END	MI_INSTR(0x0a, 0)
+#define MI_CONDITIONAL_BATCH_BUFFER_END MI_INSTR(0x36, 0)
+#define   MI_CBBEND_GLOBAL_GTT		(1<<22)
+#define   MI_CBBEND_COMPARE_SEMAPHORE	(1<<21) /* Gen12+*/
+#define   MI_CBBEND_COMPARE_MASK	(1<<19) /* Gen9+*/
+#define   MI_CBBEND_END_CURRENT_LEVEL	(1<<18) /* Gen12+*/
+#define   MI_CBBEND_MAD_GT_IDD		(0 << 12) /* Gen12+*/
+#define   MI_CBBEND_MAD_GTE_IDD		(1 << 12) /* Gen12+*/
+#define   MI_CBBEND_MAD_LT_IDD		(2 << 12) /* Gen12+*/
+#define   MI_CBBEND_MAD_LTE_IDD		(3 << 12) /* Gen12+*/
+#define   MI_CBBEND_MAD_EQ_IDD		(4 << 12) /* Gen12+*/
+#define   MI_CBBEND_MAD_NE_IDD		(5 << 12) /* Gen12+*/
 #define MI_SUSPEND_FLUSH	MI_INSTR(0x0b, 0)
 #define   MI_SUSPEND_FLUSH_EN	(1<<0)
 #define MI_SET_APPID		MI_INSTR(0x0e, 0)
@@ -136,6 +147,7 @@
 #define MI_STORE_REGISTER_MEM        MI_INSTR(0x24, 1)
 #define MI_STORE_REGISTER_MEM_GEN8   MI_INSTR(0x24, 2)
 #define   MI_SRM_LRM_GLOBAL_GTT		(1<<22)
+#define   MI_SRM_ADD_CS_MMIO_START	(1<<19) /* gen11+ */
 #define MI_FLUSH_DW		MI_INSTR(0x26, 1) /* for GEN6 */
 #define   MI_FLUSH_DW_STORE_INDEX	(1<<21)
 #define   MI_INVALIDATE_TLB		(1<<18)
@@ -158,6 +170,9 @@
 #define MI_BATCH_BUFFER_START_GEN8	MI_INSTR(0x31, 1)
 #define   MI_BATCH_RESOURCE_STREAMER (1<<10)
 
+#define MI_ARB_CHECK            MI_INSTR(0x05, 0)
+#define   MI_ARB_CHECK_PREPARSER_DISABLE BIT(0)
+#define   MI_ARB_CHECK_PREPARSER_DISABLE_MASK BIT(8)
 /*
  * 3D instructions used by the kernel
  */
@@ -239,7 +254,6 @@
  * Commands used only by the command parser
  */
 #define MI_SET_PREDICATE        MI_INSTR(0x01, 0)
-#define MI_ARB_CHECK            MI_INSTR(0x05, 0)
 #define MI_RS_CONTROL           MI_INSTR(0x06, 0)
 #define MI_URB_ATOMIC_ALLOC     MI_INSTR(0x09, 0)
 #define MI_PREDICATE            MI_INSTR(0x0C, 0)
@@ -255,7 +269,6 @@
 #define MI_RS_STORE_DATA_IMM    MI_INSTR(0x2B, 0)
 #define MI_LOAD_URB_MEM         MI_INSTR(0x2C, 0)
 #define MI_STORE_URB_MEM        MI_INSTR(0x2D, 0)
-#define MI_CONDITIONAL_BATCH_BUFFER_END MI_INSTR(0x36, 0)
 
 #define PIPELINE_SELECT                ((0x3<<29)|(0x1<<27)|(0x1<<24)|(0x4<<16))
 #define GFX_OP_3DSTATE_VF_STATISTICS   ((0x3<<29)|(0x1<<27)|(0x0<<24)|(0xB<<16))
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index d42584439f51..2e560bbb3bdf 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2947,6 +2947,59 @@ static u32 *gen11_emit_fini_breadcrumb_rcs(struct i915_request *request,
 	return gen8_emit_fini_breadcrumb_footer(request, cs);
 }
 
+static u32 *__gen12_emit_preparser_enable(u32 *batch, u32 target_addr)
+{
+	/* return early if the pre-parser was disabled when we started */
+	*batch++ = MI_CONDITIONAL_BATCH_BUFFER_END | 2 |
+			MI_CBBEND_GLOBAL_GTT |
+			MI_CBBEND_COMPARE_SEMAPHORE |
+			MI_CBBEND_END_CURRENT_LEVEL |
+			MI_CBBEND_MAD_NE_IDD; /* return early if EQ */
+	*batch++ = PREFETCH_DISABLE_STATUS;
+	*batch++ = target_addr;
+	*batch++ = 0;
+
+	/* turn the parser back on */
+	*batch++ = MI_ARB_CHECK | MI_ARB_CHECK_PREPARSER_DISABLE_MASK;
+	*batch++ = MI_BATCH_BUFFER_END;
+
+	return batch;
+}
+
+static int gen12_emit_preparser_disable(struct i915_request *rq, u32 *batch)
+{
+	/* 2 dwords, mask and value (in that order) */
+	u32 target_offset = rq->hw_context->timeline->hwsp_offset +
+				I915_GEM_CTX_HWS_PREFETCH_MASK * sizeof(u32);
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, 6);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	/* save the current status of the pre-parser ... */
+	*cs++ = MI_STORE_REGISTER_MEM_GEN8 |
+			MI_SRM_LRM_GLOBAL_GTT |
+			MI_SRM_ADD_CS_MMIO_START;
+	*cs++ = i915_mmio_reg_offset(RING_INSTPM(0));
+	*cs++ = target_offset + sizeof(u32); /* 2nd dword */
+	*cs++ = 0;
+
+	/* ... and turn it off */
+	*cs++ = MI_ARB_CHECK |
+		MI_ARB_CHECK_PREPARSER_DISABLE_MASK |
+		MI_ARB_CHECK_PREPARSER_DISABLE;
+
+	*cs++ = MI_NOOP;
+
+	intel_ring_advance(rq, cs);
+
+	/* now prepare the batch that will re-enable the parser */
+	__gen12_emit_preparser_enable(batch, target_offset);
+
+	return 0;
+}
+
 static void execlists_park(struct intel_engine_cs *engine)
 {
 	del_timer(&engine->execlists.timer);
@@ -3017,6 +3070,13 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 		engine->emit_bb_start = gen8_emit_bb_start;
 	else
 		engine->emit_bb_start = gen9_emit_bb_start;
+
+	if (INTEL_GEN(engine->i915) >= 12) {
+		u32 tmp[16];
+		engine->emit_preparser_disable = gen12_emit_preparser_disable;
+		engine->emit_preparser_enable_size_dw =
+			__gen12_emit_preparser_enable(tmp, 0) - tmp;
+	}
 }
 
 static inline void
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
index 02fbe11b671b..89e1fd0b96ab 100644
--- a/drivers/gpu/drm/i915/gt/intel_timeline.c
+++ b/drivers/gpu/drm/i915/gt/intel_timeline.c
@@ -209,6 +209,7 @@ int intel_timeline_init(struct intel_timeline *timeline,
 			struct i915_vma *hwsp)
 {
 	void *vaddr;
+	u32 *hwsp_map;
 
 	kref_init(&timeline->kref);
 	atomic_set(&timeline->pin_count, 0);
@@ -244,8 +245,16 @@ int intel_timeline_init(struct intel_timeline *timeline,
 			return PTR_ERR(vaddr);
 	}
 
-	timeline->hwsp_seqno =
-		memset(vaddr + timeline->hwsp_offset, 0, CACHELINE_BYTES);
+	hwsp_map = memset(vaddr + timeline->hwsp_offset, 0, CACHELINE_BYTES);
+	timeline->hwsp_seqno = hwsp_map;
+
+	/*
+	 * For checking the status of the pre-fetcher, we need save the value
+	 * of the INSTPM register and then evaluate it against the mask during
+	 * a BBEND. This requires the mask and the register value to be in 2
+	 * consecutive dwords in memory. We use 2 dword in the HWSP for this
+	 */
+	hwsp_map[I915_GEM_CTX_HWS_PREFETCH_MASK] = PREFETCH_DISABLE_STATUS;
 
 	timeline->hwsp_ggtt = i915_vma_get(hwsp);
 	GEM_BUG_ON(timeline->hwsp_offset >= hwsp->size);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index a092b34c269d..504a513cffee 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -2549,6 +2549,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define RING_DMA_FADD(base)	_MMIO((base) + 0x78)
 #define RING_DMA_FADD_UDW(base)	_MMIO((base) + 0x60) /* gen8+ */
 #define RING_INSTPM(base)	_MMIO((base) + 0xc0)
+#define  PREFETCH_DISABLE_STATUS REG_BIT(12) /* gen12+ */
 #define RING_MI_MODE(base)	_MMIO((base) + 0x9c)
 #define INSTPS		_MMIO(0x2070) /* 965+ only */
 #define GEN4_INSTDONE1	_MMIO(0x207c) /* 965+ only, aka INSTDONE_2 on SNB */
-- 
2.23.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx