Re: [Intel-gfx] [PATCH] drm/i915: avoid concurrent writes to aux_inv

Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> · Wed, 02 Mar 2022 20:50:19 +0000

Quoting fei.yang@xxxxxxxxx (2022-03-02 18:26:57)
> From: Fei Yang <fei.yang@xxxxxxxxx>
> 
> GPU hangs have been observed when multiple engines write to the
> same aux_inv register at the same time. To avoid this each engine
> should only invalidate its own auxiliary table. The function
> gen12_emit_flush_xcs() currently invalidate the auxiliary table for
> all engines because the rq->engine is not necessarily the engine
> eventually carrying out the request, and potentially the engine
> could even be a virtual one (with engine->instance being -1).
> With this patch, auxiliary table invalidation is done only for the
> engine executing the request. And the mmio address for the aux_inv
> register is set after the engine instance becomes certain.
> 
> Signed-off-by: Chris Wilson <chris.p.wilson@xxxxxxxxx>
> Signed-off-by: Fei Yang <fei.yang@xxxxxxxxx>
> ---
>  drivers/gpu/drm/i915/gt/gen8_engine_cs.c      | 41 ++++---------------
>  .../drm/i915/gt/intel_execlists_submission.c  | 38 +++++++++++++++++
>  drivers/gpu/drm/i915/i915_request.h           |  2 +
>  3 files changed, 47 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> index b1b9c3fd7bf9..af62e2bc2c9b 100644
> --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c
> @@ -165,30 +165,6 @@ static u32 preparser_disable(bool state)
>         return MI_ARB_CHECK | 1 << 8 | state;
>  }
>  
> -static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
> -{
> -       static const i915_reg_t vd[] = {
> -               GEN12_VD0_AUX_NV,
> -               GEN12_VD1_AUX_NV,
> -               GEN12_VD2_AUX_NV,
> -               GEN12_VD3_AUX_NV,
> -       };
> -
> -       static const i915_reg_t ve[] = {
> -               GEN12_VE0_AUX_NV,
> -               GEN12_VE1_AUX_NV,
> -       };
> -
> -       if (engine->class == VIDEO_DECODE_CLASS)
> -               return vd[engine->instance];
> -
> -       if (engine->class == VIDEO_ENHANCEMENT_CLASS)
> -               return ve[engine->instance];
> -
> -       GEM_BUG_ON("unknown aux_inv reg\n");
> -       return INVALID_MMIO_REG;
> -}
> -
>  static u32 *gen12_emit_aux_table_inv(const i915_reg_t inv_reg, u32 *cs)
>  {
>         *cs++ = MI_LOAD_REGISTER_IMM(1);
> @@ -288,7 +264,7 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
>         if (mode & EMIT_INVALIDATE)
>                 aux_inv = rq->engine->mask & ~BIT(BCS0);
>         if (aux_inv)
> -               cmd += 2 * hweight32(aux_inv) + 2;
> +               cmd += 4;
>  
>         cs = intel_ring_begin(rq, cmd);
>         if (IS_ERR(cs))
> @@ -319,16 +295,13 @@ int gen12_emit_flush_xcs(struct i915_request *rq, u32 mode)
>         *cs++ = 0; /* value */
>  
>         if (aux_inv) { /* hsdes: 1809175790 */
> -               struct intel_engine_cs *engine;
> -               unsigned int tmp;
> -
> -               *cs++ = MI_LOAD_REGISTER_IMM(hweight32(aux_inv));
> -               for_each_engine_masked(engine, rq->engine->gt, aux_inv, tmp) {
> -                       *cs++ = i915_mmio_reg_offset(aux_inv_reg(engine));
> -                       *cs++ = AUX_INV;
> -               }
> +               *cs++ = MI_LOAD_REGISTER_IMM(1);
> +               rq->vd_ve_aux_inv = cs;
> +               *cs++ = 0; /* address to be set at submission to HW */
> +               *cs++ = AUX_INV;
>                 *cs++ = MI_NOOP;
> -       }
> +       } else
> +               rq->vd_ve_aux_inv = NULL;
>  
>         if (mode & EMIT_INVALIDATE)
>                 *cs++ = preparser_disable(false);
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 1c602d4ae297..a018de6dcac5 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -1258,6 +1258,34 @@ static bool completed(const struct i915_request *rq)
>         return __i915_request_is_complete(rq);
>  }
>  
> +static i915_reg_t aux_inv_reg(const struct intel_engine_cs *engine)
> +{
> +       static const i915_reg_t vd[] = {
> +               GEN12_VD0_AUX_NV,
> +               GEN12_VD1_AUX_NV,
> +               GEN12_VD2_AUX_NV,
> +               GEN12_VD3_AUX_NV,
> +       };
> +
> +       static const i915_reg_t ve[] = {
> +               GEN12_VE0_AUX_NV,
> +               GEN12_VE1_AUX_NV,
> +       };
> +
> +       if (engine->class == VIDEO_DECODE_CLASS) {
> +               GEM_BUG_ON(engine->instance >= ARRAY_SIZE(vd));
> +               return vd[engine->instance];
> +       }
> +
> +       if (engine->class == VIDEO_ENHANCEMENT_CLASS) {
> +               GEM_BUG_ON(engine->instance >= ARRAY_SIZE(ve));
> +               return ve[engine->instance];
> +       }
> +
> +       GEM_BUG_ON("unknown aux_inv reg\n");
> +       return INVALID_MMIO_REG;
> +}
> +
>  static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
>         struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -1538,6 +1566,16 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>                         }
>  
>                         if (__i915_request_submit(rq)) {
> +                               /* hsdes: 1809175790 */
> +                               if ((GRAPHICS_VER(engine->i915) == 12) &&
> +                                   rq->vd_ve_aux_inv &&
> +                                   (engine->class == VIDEO_DECODE_CLASS ||
> +                                    engine->class == VIDEO_ENHANCEMENT_CLASS)) {

We do not need the extra checks here; we just do as we are told. We only
tell ourselves to apply the fixup when required.

> +                                       *rq->vd_ve_aux_inv = i915_mmio_reg_offset

Likewise, vd_ve is overspecific, aux_inv_fixup or aux_inv_wa (or
wa_aux_iv, fixup_aux_inv).

> +                                               (aux_inv_reg(engine));
> +                                       rq->vd_ve_aux_inv = NULL;

Move this to i915_request initialisation so that we only set aux_inv
when required, which probably explains the extra defence.

> +                                       rq->execution_mask = engine->mask;
> +                               }
>                                 if (!merge) {
>                                         *port++ = i915_request_get(last);
>                                         last = NULL;
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 28b1f9db5487..69de32e5e15d 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -350,6 +350,8 @@ struct i915_request {
>                 struct list_head link;
>                 unsigned long delay;
>         } mock;)
> +
> +       u32 *vd_ve_aux_inv;

Not at the end of the struct; that's where we put things in the dungeon.
The selftest struct should be last; I do hope no one has been putting
things at random places in the struct without considering the layout and
semantics. From the flow, this is akin to batch, capture_list; before
emitted_jiffies would be a good spot.
-Chris