Re: [PATCH 2/2] drm/i915/perf: fix ctx_id read with GuC & ICL

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 31/05/18 21:46, Michel Thierry wrote:
On 5/31/2018 12:56 PM, Lionel Landwerlin wrote:
One thing we didn't really understand about the OA report is that the
ContextID field (dword 2) is copy of the context descriptor (dword 1).

On Gen8->10 and without using GuC we didn't notice the issue because
we only checked the 21bits of the ContextID field in the OA reports
which matches exactly the hw_id stored into the context descriptor.

When using GuC submission we have an issue of a non matching hw_id
because GuC uses bit 20 of the hw_id to signal proxy submission. This
change makes introduces a mask to compare only the relevant bits.
Choose one: makes or introduces ;)


On ICL the context descriptor format has changed and we failed to
address this. On top of using a mask we also need to shift the bits
properly.


Someone is going to complain this should be two patches (one to address the GuC-ness and one for Gen11), but not me.

Reviewed-by: Michel Thierry <michel.thierry@xxxxxxxxx>


Kind of agree, but that's a pain.
We can put it as a fix of the first commit that enabled gen8 and I'll do the backport in stable versions.

Cheers,

-
Lionel


Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@xxxxxxxxx>
Fixes: 1de401c08fa805 ("drm/i915/perf: enable perf support on ICL")
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104252
BSpec: 1237
Testcase: igt/perf/gen8-unprivileged-single-ctx-counters
---
  drivers/gpu/drm/i915/i915_drv.h  |   1 +
  drivers/gpu/drm/i915/i915_perf.c | 123 ++++++++++++++++++++++++-------
  drivers/gpu/drm/i915/intel_lrc.c |   5 ++
  3 files changed, 101 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 58ab9259fb73..0ccda488a8db 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1953,6 +1953,7 @@ struct drm_i915_private {
                struct intel_context *pinned_ctx;
              u32 specific_ctx_id;
+            u32 specific_ctx_id_mask;
                struct hrtimer poll_check_timer;
              wait_queue_head_t poll_wq;
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 4a62024cbf85..d5e5d4635f1f 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -738,12 +738,7 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream,
              continue;
          }
  -        /*
-         * XXX: Just keep the lower 21 bits for now since I'm not
-         * entirely sure if the HW touches any of the higher bits in
-         * this field
-         */
-        ctx_id = report32[2] & 0x1fffff;
+        ctx_id = report32[2] & dev_priv->perf.oa.specific_ctx_id_mask;
            /*
           * Squash whatever is in the CTX_ID field if it's marked as
@@ -1204,6 +1199,36 @@ static int i915_oa_read(struct i915_perf_stream *stream,
      return dev_priv->perf.oa.ops.read(stream, buf, count, offset);
  }
  +static int oa_get_render_lrca(struct drm_i915_private *i915,
+                  struct i915_gem_context *ctx,
+                  u32 *lrca)
+{
+    struct intel_engine_cs *engine = i915->engine[RCS];
+    struct intel_context *ce;
+    int ret;
+
+    ret = i915_mutex_lock_interruptible(&i915->drm);
+    if (ret)
+        return ret;
+
+    /*
+     * As the ID is the gtt offset of the context's vma we
+     * pin the vma to ensure the ID remains fixed.
+     *
+     * NB: implied RCS engine...
+     */
+    ce = intel_context_pin(ctx, engine);
+    mutex_unlock(&i915->drm.struct_mutex);
+    if (IS_ERR(ce))
+        return PTR_ERR(ce);
+
+    i915->perf.oa.pinned_ctx = ce;
+
+    *lrca = i915_ggtt_offset(ce->state);
+
+    return 0;
+}
+
  /**
   * oa_get_render_ctx_id - determine and hold ctx hw id
   * @stream: An i915-perf stream opened for OA metrics
@@ -1216,40 +1241,81 @@ static int i915_oa_read(struct i915_perf_stream *stream,
   */
  static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
  {
-    struct drm_i915_private *dev_priv = stream->dev_priv;
+    struct drm_i915_private *i915 = stream->dev_priv;
  -    if (HAS_LOGICAL_RING_CONTEXTS(dev_priv)) {
-        dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
-    } else {
-        struct intel_engine_cs *engine = dev_priv->engine[RCS];
-        struct intel_context *ce;
+    switch (INTEL_GEN(i915)) {
+    case 7: {
          int ret;
  -        ret = i915_mutex_lock_interruptible(&dev_priv->drm);
+        ret = oa_get_render_lrca(i915, stream->ctx,
+                     &i915->perf.oa.specific_ctx_id);
          if (ret)
              return ret;
            /*
-         * As the ID is the gtt offset of the context's vma we
-         * pin the vma to ensure the ID remains fixed.
-         *
-         * NB: implied RCS engine...
+         * On Haswell we don't do any post processing of the reports
+         * and don't need to use the mask.
           */
-        ce = intel_context_pin(stream->ctx, engine);
-        mutex_unlock(&dev_priv->drm.struct_mutex);
-        if (IS_ERR(ce))
-            return PTR_ERR(ce);
+        i915->perf.oa.specific_ctx_id_mask = 0;
+        break;
+    }
  -        dev_priv->perf.oa.pinned_ctx = ce;
+    case 8:
+    case 9:
+    case 10:
+        if (USES_GUC_SUBMISSION(i915)) {
+            u32 lrca;
+            int ret;
  -        /*
-         * Explicitly track the ID (instead of calling
-         * i915_ggtt_offset() on the fly) considering the difference
-         * with gen8+ and execlists
-         */
-        dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(ce->state);
+            ret = oa_get_render_lrca(i915, stream->ctx, &lrca);
+            if (ret)
+                return ret;
+
+            /*
+             * The LRCA is aligned to a page. As a result the
+             * lower 12bits are always at 0 and reused in the
+             * context descriptor for some flags. They won't be
+             * part of the context ID in the OA reports, so squash
+             * those lower bits.
+             */
+            i915->perf.oa.specific_ctx_id =
+                (lrca + LRC_HEADER_PAGES * PAGE_SIZE) >> 12;
+
+            /*
+             * GuC uses the top bit to signal proxy submission, so
+             * ignore that bit if using GuC.
+             */
+            i915->perf.oa.specific_ctx_id_mask =
+                (1U << (GEN8_CTX_ID_WIDTH - 1)) - 1;
+        } else {
+            i915->perf.oa.specific_ctx_id = stream->ctx->hw_id;
+            i915->perf.oa.specific_ctx_id_mask =
+                (1U << GEN8_CTX_ID_WIDTH) - 1;
+        }
+        break;
+
+    case 11: {
+        struct intel_engine_cs *engine = i915->engine[RCS];
+
+        i915->perf.oa.specific_ctx_id =
+            stream->ctx->hw_id << (GEN11_SW_CTX_ID_SHIFT - 32) |
+            engine->instance << (GEN11_ENGINE_INSTANCE_SHIFT - 32) |
+            engine->class << (GEN11_ENGINE_INSTANCE_SHIFT - 32);
+        i915->perf.oa.specific_ctx_id_mask =
+            ((1U << GEN11_SW_CTX_ID_WIDTH) - 1) << (GEN11_SW_CTX_ID_SHIFT - 32) | +            ((1U << GEN11_ENGINE_INSTANCE_WIDTH) - 1) << (GEN11_ENGINE_INSTANCE_SHIFT - 32) | +            ((1 << GEN11_ENGINE_CLASS_WIDTH) - 1) << (GEN11_ENGINE_CLASS_SHIFT - 32);
+        break;
      }
  +    default:
+        MISSING_CASE(INTEL_GEN(i915));
+    }
+
+    DRM_DEBUG_DRIVER("filtering on ctx_id=0x%x ctx_id_mask=0x%x\n",
+             i915->perf.oa.specific_ctx_id,
+             i915->perf.oa.specific_ctx_id_mask);
+
      return 0;
  }
  @@ -1266,6 +1332,7 @@ static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
      struct intel_context *ce;
        dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
+    dev_priv->perf.oa.specific_ctx_id_mask = 0;
        ce = fetch_and_zero(&dev_priv->perf.oa.pinned_ctx);
      if (ce) {
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 94f9c4795190..7d63c6d2f687 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -233,6 +233,11 @@ intel_lr_context_descriptor_update(struct i915_gem_context *ctx,
                                  /* bits 12-31 */
      GEM_BUG_ON(desc & GENMASK_ULL(63, 32));
  +    /*
+     * The following 32bits are copied into the OA reports (dword 2).
+     * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
+     * anything below.
+     */
      if (INTEL_GEN(ctx->i915) >= 11) {
          GEM_BUG_ON(ctx->hw_id >= BIT(GEN11_SW_CTX_ID_WIDTH));
          desc |= (u64)ctx->hw_id << GEN11_SW_CTX_ID_SHIFT;



_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx




[Index of Archives]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux