On 09/05/18 09:59, Chris Wilson wrote:
Quoting Lionel Landwerlin (2018-05-08 19:03:45)
If some of the contexts submitting workloads to the GPU have been
configured to shutdown slices/subslices, we might loose the NOA
configurations written in the NOA muxes. We need to reprogram them
when we detect a powergating configuration change.
In this change i915/perf is responsible for setting up a reprogramming
batchbuffer which we execute just before the userspace submitted
batchbuffer. We do this while preemption is still disable, only if
needed. The decision to execute this reprogramming batchbuffer is made
when we assign a request to an execlist port.
v2: Only reprogram when detecting configuration changes (Chris/Lionel)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@xxxxxxxxx>
---
drivers/gpu/drm/i915/i915_drv.h | 6 ++
drivers/gpu/drm/i915/i915_perf.c | 108 ++++++++++++++++++++++
drivers/gpu/drm/i915/i915_request.h | 6 ++
drivers/gpu/drm/i915/intel_gpu_commands.h | 1 +
drivers/gpu/drm/i915/intel_lrc.c | 59 +++++++++++-
drivers/gpu/drm/i915/intel_ringbuffer.c | 2 +
drivers/gpu/drm/i915/intel_ringbuffer.h | 2 +
7 files changed, 183 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 04e27806e581..07cdbfb4ad7a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1461,6 +1461,12 @@ struct i915_perf_stream {
* @oa_config: The OA configuration used by the stream.
*/
struct i915_oa_config *oa_config;
+
+ /**
+ * @noa_reprogram_vma: A batchbuffer reprogramming the NOA muxes, used
+ * after switching powergating configurations.
+ */
+ struct i915_vma *noa_reprogram_vma;
};
/**
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 5b279a82445a..1b9e3d6a53a2 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1691,6 +1691,96 @@ static int gen8_emit_oa_config(struct i915_request *rq,
return 0;
}
+#define MAX_LRI_SIZE (125U)
+
+static u32 noa_reprogram_bb_size(struct i915_oa_config *oa_config)
+{
+ u32 n_lri;
+
+ /* Very unlikely but possible that we have no muxes to configure. */
+ if (!oa_config->mux_regs_len)
+ return 0;
+
+ n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) +
+ (oa_config->mux_regs_len % MAX_LRI_SIZE) != 0;
+
+ return n_lri * 4 + oa_config->mux_regs_len * 8 + /* MI_LOAD_REGISTER_IMMs */
+ 6 * 4 + /* PIPE_CONTROL */
+ 4; /* MI_BATCH_BUFFER_END */
+}
+
+static int
+alloc_noa_reprogram_bo(struct i915_perf_stream *stream)
+{
+ struct drm_i915_private *dev_priv = stream->dev_priv;
+ struct i915_oa_config *oa_config = stream->oa_config;
+ struct drm_i915_gem_object *bo;
+ struct i915_vma *vma;
+ u32 buffer_size;
+ u32 *cs;
+ int i, ret, n_loaded_regs;
+
+ buffer_size = ALIGN(noa_reprogram_bb_size(oa_config), PAGE_SIZE);
+ if (buffer_size == 0)
+ return 0;
+
+ bo = i915_gem_object_create(dev_priv, buffer_size);
+ if (IS_ERR(bo)) {
+ DRM_ERROR("Failed to allocate NOA reprogramming buffer\n");
+ return PTR_ERR(bo);
+ }
+
+ cs = i915_gem_object_pin_map(bo, I915_MAP_WB);
+ if (IS_ERR(cs)) {
+ ret = PTR_ERR(cs);
+ goto err_unref_bo;
+ }
+
+ n_loaded_regs = 0;
+ for (i = 0; i < oa_config->mux_regs_len; i++) {
+ if ((n_loaded_regs % MAX_LRI_SIZE) == 0) {
+ u32 n_lri = min(oa_config->mux_regs_len - n_loaded_regs,
+ MAX_LRI_SIZE);
+ *cs++ = MI_LOAD_REGISTER_IMM(n_lri);
+ }
+
+ *cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr);
+ *cs++ = oa_config->mux_regs[i].value;
+ n_loaded_regs++;
+ }
+
+ cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_MMIO_WRITE, 0);
What's the pc for? Isn't it a bit dangerous to request a mmio write to
to reg 0?
Oops, I've been using this wrong :(
I thought it would flush pending MMIO writes.
+
+ *cs++ = MI_BATCH_BUFFER_END;
+
+ i915_gem_object_unpin_map(bo);
+
+ ret = i915_gem_object_set_to_gtt_domain(bo, false);
+ if (ret)
+ goto err_unref_bo;
+
+ vma = i915_vma_instance(bo, &dev_priv->ggtt.base, NULL);
+ if (IS_ERR(vma)) {
+ ret = PTR_ERR(vma);
+ goto err_unref_bo;
+ }
+
+ ret = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
+ if (ret)
+ goto err_unref_vma;
No GGTT access, so just PIN_USER for GPU access.
Okay. I just don't seem to understand any of the vma stuff...
+ stream->noa_reprogram_vma = vma;
+
+ return 0;
+
+err_unref_vma:
+ i915_vma_put(vma);
+err_unref_bo:
+ i915_gem_object_put(bo);
+
+ return ret;
+}
+
static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv,
const struct i915_oa_config *oa_config)
{
@@ -2102,6 +2192,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
goto err_config;
}
+ ret = alloc_noa_reprogram_bo(stream);
+ if (ret) {
+ DRM_DEBUG("Enable to alloc NOA reprogramming BO\n");
+ goto err_config;
+ }
+
/* PRM - observability performance counters:
*
* OACONTROL, performance counter enable, note:
@@ -2136,6 +2232,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream,
dev_priv->perf.oa.exclusive_stream = stream;
+ ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv,
+ stream->oa_config);
+ if (ret)
+ goto err_enable;
+
+ stream->ops = &i915_oa_stream_ops;
+
mutex_unlock(&dev_priv->drm.struct_mutex);
return 0;
@@ -2497,6 +2600,11 @@ static void i915_perf_destroy_locked(struct i915_perf_stream *stream)
if (stream->ctx)
i915_gem_context_put(stream->ctx);
+ if (stream->noa_reprogram_vma) {
+ i915_vma_unpin(stream->noa_reprogram_vma);
+ i915_gem_object_put(stream->noa_reprogram_vma->obj);
This will be unhappy due to the lack of active tracking.
i915_vma_unpin_and_release(&stream->noa_reprogram_vma).
Hmm, worse than that. It's not being tracked at all, so expect it to be
freed while still in use.
Moving to engine->noa_reprogram_vma.
I should be able to set this from i915/perf as there is a window where
it idle the GPU before editing the window
+ }
+
kfree(stream);
}
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index eddbd4245cb3..3357ceb4bdb1 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -149,6 +149,12 @@ struct i915_request {
/** Preallocate space in the ring for the emitting the request */
u32 reserved_space;
+ /*
+ * Pointer in the batchbuffer to where the i915/perf NOA reprogramming
+ * can be inserted just before HW submission.
+ */
+ u32 *perf_prog_start;
Just u32 offset, no need for the extra space of a pointer here.
(Probably should just limit rings to 64k and pack the offsets into u16.)
Or have them as offset from head, and u8 dwords? Hmm.
I have cold shivers from the thought of more special locations inside
the batch. Not the first, and not the last, so I feel like there should
be a better way (or at least more consistent).
Switching to an offset.
+
/** Batch buffer related to this request if any (used for
* error state dump only).
*/
diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
index 105e2a9e874a..09b0fded8b17 100644
--- a/drivers/gpu/drm/i915/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
@@ -146,6 +146,7 @@
#define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */
#define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1)
#define MI_BATCH_RESOURCE_STREAMER (1<<10)
+#define MI_BATCH_SECOND_LEVEL (1<<22)
/*
* 3D instructions used by the kernel
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 1c08bd2be6c3..df4a7bb07eae 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -510,6 +510,38 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
return true;
}
+static void maybe_enable_noa_repgoram(struct i915_request *rq)
+{
+ struct intel_engine_cs *engine = rq->engine;
+ struct drm_i915_private *dev_priv = engine->i915;
+ struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream;
Could there be any more pointer chasing? </chandler>
Putting this into the engine.
+ struct intel_context *ce;
+ u32 *cs = rq->perf_prog_start;
+
+ /* Slice/subslice/EU powergating only matters on the RCS. */
+ if (engine->id != RCS)
+ return;
+
+ /* Nothing need to reprogram when perf isn't enabled. */
+ if (!stream)
+ return;
+
+ ce = &rq->ctx->__engine[engine->id];
rq->hw_context; but to_intel_context() until that patch lands.
Moving sseu to the request so I can do the rpcs config through MI_LRI.
+
+ /*
+ * If the powergating configuration doesn't change, no need to
+ * reprogram.
+ */
+ if (engine->last_sseu.value == ce->sseu.value)
+ return;
+
+ *cs++ = MI_BATCH_BUFFER_START_GEN8 | MI_BATCH_SECOND_LEVEL;
You are not a second level batch. You are calling from the ring to a
global address of _0_.
+ *cs++ = 0;
low 32bits = 0
+ *cs++ = i915_ggtt_offset(stream->noa_reprogram_vma);
high 32bits = address you wanted. (order reversed)
Fixing....
+
+ engine->last_sseu = ce->sseu;
+}
+
static void port_assign(struct execlist_port *port, struct i915_request *rq)
{
GEM_BUG_ON(rq == port_request(port));
@@ -517,6 +549,8 @@ static void port_assign(struct execlist_port *port, struct i915_request *rq)
if (port_isset(port))
i915_request_put(port_request(port));
+ maybe_enable_noa_repgoram(rq);
How should this work for the guc? How should this work for amalgamated
contexts? Should this only be done for the first request in the sequence
and definitely not done for lite-restore.
I'm assuming that GuC does not reorder requests submitted by i915.
+
port_set(port, port_pack(i915_request_get(rq), port_count(port)));
}
@@ -1047,6 +1081,13 @@ static void execlists_submission_tasklet(unsigned long data)
buf[2*head + 1] == execlists->preempt_complete_status) {
GEM_TRACE("%s preempt-idle\n", engine->name);
+ /*
+ * Clear out the state of the sseu on the
+ * engine, as it's not clear what it will be
+ * after preemption.
+ */
+ memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));
Oh, you are missing complete_preempt_context. But you also need to reset
after reset, so cancel_port_requests is also sensisble.
Thanks, putting it there.
+
execlists_cancel_port_requests(execlists);
execlists_unwind_incomplete_requests(execlists);
@@ -1937,10 +1978,26 @@ static int gen8_emit_bb_start(struct i915_request *rq,
rq->ctx->ppgtt->pd_dirty_rings &= ~intel_engine_flag(rq->engine);
}
- cs = intel_ring_begin(rq, 6);
+ cs = intel_ring_begin(rq, rq->engine->id == RCS ? 10 : 6);
if (IS_ERR(cs))
return PTR_ERR(cs);
+ if (rq->engine->id == RCS) {
+ /*
+ * Leave some instructions to be written with an
+ * MI_BATCH_BUFFER_START to the i915/perf NOA reprogramming
+ * batchbuffer. We only turn those MI_NOOP into
+ * MI_BATCH_BUFFER_START when we detect a SSEU powergating
+ * configuration change that might affect NOA. This is only
+ * for the RCS.
+ */
+ rq->perf_prog_start = cs;
+ *cs++ = MI_NOOP;
+ *cs++ = MI_NOOP;
+ *cs++ = MI_NOOP;
+ *cs++ = MI_NOOP; /* Aligning to 2 dwords */
+ }
+
/*
* WaDisableCtxRestoreArbitration:bdw,chv
*
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 8f19349a6055..cde791234397 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -525,6 +525,8 @@ static int init_ring_common(struct intel_engine_cs *engine)
if (INTEL_GEN(dev_priv) > 2)
I915_WRITE_MODE(engine, _MASKED_BIT_DISABLE(STOP_RING));
+ memset(&engine->last_sseu, 0, sizeof(engine->last_sseu));
Why only legacy submission?
Fixing...
+
out:
intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 010750e8ee44..2f8518e1a49f 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -343,6 +343,8 @@ struct intel_engine_cs {
struct drm_i915_gem_object *default_state;
+ union i915_gem_sseu last_sseu;
The struct here is internal; debatable if it wants the GEM moniker for
being part of the GEM user interface.
Looks like this wanted a sefltest.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx