On Gen11 powergating half the execution units is a functional
requirement when using the VME samplers. Not fullfilling this
requirement can lead to hangs.
This unfortunately plays fairly poorly with the NOA requirements. NOA
requires a stable power configuration to maintain its configuration.
As a result using OA (and NOA feeding into it) so far has required us
to use a power configuration that can work for all contexts. The only
power configuration fullfilling this is powergating half the execution
units.
This makes performance analysis for 3D workloads somewhat pointless.
Failing to find a solution that would work for everybody, this change
introduces a new i915-perf stream open parameter that punts the
decision off to userspace. If this parameter is omitted, the existing
Gen11 behavior remains (half EU array powergating).
This change takes the initiative to move all perf related sseu
configuration into i915_perf.c
v2: Make parameter priviliged if different from default
v3: Fix context modifying its sseu config while i915-perf is enabled
v4: Always consider global sseu a privileged operation (Tvrtko)
Override req_sseu point in intel_sseu_make_rpcs() (Tvrtko)
Remove unrelated changes (Tvrtko)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@xxxxxxxxx>
---
drivers/gpu/drm/i915/gem/i915_gem_context.c | 10 +--
drivers/gpu/drm/i915/gem/i915_gem_context.h | 4 +
drivers/gpu/drm/i915/gt/intel_sseu.c | 33 ++------
drivers/gpu/drm/i915/i915_perf.c | 84 ++++++++++++++++++++-
drivers/gpu/drm/i915/i915_perf_types.h | 7 ++
include/uapi/drm/i915_drm.h | 11 +++
6 files changed, 116 insertions(+), 33 deletions(-)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c
b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index cb6b6be48978..edcbc7eef716 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1358,10 +1358,10 @@ static int get_ringsize(struct
i915_gem_context *ctx,
return 0;
}
-static int
-user_to_context_sseu(struct drm_i915_private *i915,
- const struct drm_i915_gem_context_param_sseu *user,
- struct intel_sseu *context)
+int
+i915_gem_user_to_context_sseu(struct drm_i915_private *i915,
+ const struct drm_i915_gem_context_param_sseu *user,
+ struct intel_sseu *context)
{
const struct sseu_dev_info *device = &RUNTIME_INFO(i915)->sseu;
@@ -1496,7 +1496,7 @@ static int set_sseu(struct i915_gem_context
*ctx,
goto out_ce;
}
- ret = user_to_context_sseu(i915, &user_sseu, &sseu);
+ ret = i915_gem_user_to_context_sseu(i915, &user_sseu, &sseu);
if (ret)
goto out_ce;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h
b/drivers/gpu/drm/i915/gem/i915_gem_context.h
index 57b7ae2893e1..f37c36719b04 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
@@ -221,4 +221,8 @@ i915_gem_engines_iter_next(struct
i915_gem_engines_iter *it);
struct i915_lut_handle *i915_lut_handle_alloc(void);
void i915_lut_handle_free(struct i915_lut_handle *lut);
+int i915_gem_user_to_context_sseu(struct drm_i915_private *i915,
+ const struct drm_i915_gem_context_param_sseu *user,
+ struct intel_sseu *context);
+
#endif /* !__I915_GEM_CONTEXT_H__ */
diff --git a/drivers/gpu/drm/i915/gt/intel_sseu.c
b/drivers/gpu/drm/i915/gt/intel_sseu.c
index 74f793423231..d173271c7397 100644
--- a/drivers/gpu/drm/i915/gt/intel_sseu.c
+++ b/drivers/gpu/drm/i915/gt/intel_sseu.c
@@ -65,7 +65,6 @@ u32 intel_sseu_make_rpcs(struct drm_i915_private
*i915,
{
const struct sseu_dev_info *sseu = &RUNTIME_INFO(i915)->sseu;
bool subslice_pg = sseu->has_subslice_pg;
- struct intel_sseu ctx_sseu;
u8 slices, subslices;
u32 rpcs = 0;
@@ -78,31 +77,13 @@ u32 intel_sseu_make_rpcs(struct
drm_i915_private *i915,
/*
* If i915/perf is active, we want a stable powergating
configuration
- * on the system.
- *
- * We could choose full enablement, but on ICL we know there are
use
- * cases which disable slices for functional, apart for performance
- * reasons. So in this case we select a known stable subset.
+ * on the system. Use the configuration pinned by i915/perf.
*/
- if (!i915->perf.exclusive_stream) {
- ctx_sseu = *req_sseu;
- } else {
- ctx_sseu = intel_sseu_from_device_info(sseu);
-
- if (IS_GEN(i915, 11)) {
- /*
- * We only need subslice count so it doesn't matter
- * which ones we select - just turn off low bits in the
- * amount of half of all available subslices per slice.
- */
- ctx_sseu.subslice_mask =
- ~(~0 << (hweight8(ctx_sseu.subslice_mask) / 2));
- ctx_sseu.slice_mask = 0x1;
- }
- }
+ if (i915->perf.exclusive_stream)
+ req_sseu = &i915->perf.sseu;
- slices = hweight8(ctx_sseu.slice_mask);
- subslices = hweight8(ctx_sseu.subslice_mask);
+ slices = hweight8(req_sseu->slice_mask);
+ subslices = hweight8(req_sseu->subslice_mask);
/*
* Since the SScount bitfield in GEN8_R_PWR_CLK_STATE is only
three bits
@@ -175,13 +156,13 @@ u32 intel_sseu_make_rpcs(struct
drm_i915_private *i915,
if (sseu->has_eu_pg) {
u32 val;
- val = ctx_sseu.min_eus_per_subslice <<
GEN8_RPCS_EU_MIN_SHIFT;
+ val = req_sseu->min_eus_per_subslice << GEN8_RPCS_EU_MIN_SHIFT;
GEM_BUG_ON(val & ~GEN8_RPCS_EU_MIN_MASK);
val &= GEN8_RPCS_EU_MIN_MASK;
rpcs |= val;
- val = ctx_sseu.max_eus_per_subslice <<
GEN8_RPCS_EU_MAX_SHIFT;
+ val = req_sseu->max_eus_per_subslice << GEN8_RPCS_EU_MAX_SHIFT;
GEM_BUG_ON(val & ~GEN8_RPCS_EU_MAX_MASK);
val &= GEN8_RPCS_EU_MAX_MASK;
diff --git a/drivers/gpu/drm/i915/i915_perf.c
b/drivers/gpu/drm/i915/i915_perf.c
index 86c6abaa3e0e..36e46f3fbdb5 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -344,6 +344,11 @@ static const struct i915_oa_format
gen12_oa_formats[I915_OA_FORMAT_MAX] = {
* @oa_periodic: Whether to enable periodic OA unit sampling
* @oa_period_exponent: The OA unit sampling period is derived from
this
* @engine: The engine (typically rcs0) being monitored by the OA unit
+ * @sseu: Selected sseu configuration for recording
+ * @has_sseu: Whther @drm_sseu was specified by userspace
*
* As read_properties_unlocked() enumerates and validates the
properties given
* to open a stream of metrics the configuration is built up in the
structure
@@ -363,6 +368,10 @@ struct perf_open_properties {
int oa_period_exponent;
struct intel_engine_cs *engine;
+
+ bool has_sseu;
+ struct drm_i915_gem_context_param_sseu drm_sseu;
+ struct intel_sseu sseu;
};
struct i915_oa_config_bo {
@@ -2720,6 +2729,47 @@ static int i915_perf_stream_enable_sync(struct
i915_perf_stream *stream)
return 0;
}
+static void
+get_default_sseu_config(struct intel_sseu *out_sseu,
+ struct intel_engine_cs *engine)
+{
+ const struct sseu_dev_info *devinfo_sseu =
+ &RUNTIME_INFO(engine->i915)->sseu;
+
+ *out_sseu = intel_sseu_from_device_info(devinfo_sseu);
+
+ if (IS_GEN(engine->i915, 11)) {
+ /*
+ * We only need subslice count so it doesn't matter which ones
+ * we select - just turn off low bits in the amount of half of
+ * all available subslices per slice.
+ */
+ out_sseu->subslice_mask =
+ ~(~0 << (hweight8(out_sseu->subslice_mask) / 2));
+ out_sseu->slice_mask = 0x1;
+ }
+}
+
+static int
+get_sseu_config(struct intel_sseu *out_sseu,
+ struct intel_engine_cs *engine,
+ const struct drm_i915_gem_context_param_sseu *drm_sseu)
+{
+ struct intel_engine_cs *user_engine;
+
+ user_engine = intel_engine_lookup_user(
+ engine->i915,
+ drm_sseu->engine.engine_class,
+ drm_sseu->engine.engine_instance);
+ if (!user_engine)
+ return -EINVAL;
+
+ if (user_engine != engine)
+ return -EINVAL;