Quoting Lionel Landwerlin (2017-08-30 19:20:06) > If some of the contexts submitting workloads to the GPU have been > configured to shutdown slices/subslices, we might loose the NOA > configurations written in the NOA muxes. We need to reprogram then at > context switch. > > Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@xxxxxxxxx> > --- > drivers/gpu/drm/i915/i915_drv.h | 2 ++ > drivers/gpu/drm/i915/i915_perf.c | 77 ++++++++++++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/intel_lrc.c | 64 ++++++++++++++++++++++++++++++--- > drivers/gpu/drm/i915/intel_lrc.h | 1 + > 4 files changed, 140 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 0003b46b6840..d4b3e5da9009 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -3685,6 +3685,8 @@ int i915_perf_remove_config_ioctl(struct drm_device *dev, void *data, > void i915_oa_init_reg_state(struct intel_engine_cs *engine, > struct i915_gem_context *ctx, > uint32_t *reg_state); > +u32 i915_oa_get_perctx_bb_size(struct drm_i915_private *dev_priv); > +u32 *i915_oa_emit_perctx_bb(struct intel_engine_cs *engine, u32 *batch); > > /* i915_gem_evict.c */ > int __must_check i915_gem_evict_something(struct i915_address_space *vm, > diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > index 94185d610673..b74ffbb47879 100644 > --- a/drivers/gpu/drm/i915/i915_perf.c > +++ b/drivers/gpu/drm/i915/i915_perf.c > @@ -1687,6 +1687,74 @@ static int gen8_emit_oa_config(struct drm_i915_gem_request *req, > return 0; > } > > +#define MAX_LRI_SIZE (125U) > + > +u32 i915_oa_get_perctx_bb_size(struct drm_i915_private *dev_priv) > +{ > + struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream; > + > + lockdep_assert_held(&dev_priv->drm.struct_mutex); Still not happy by this coupling to struct_mutex. :-p Missed RCS check. > + > + /* Perf not supported. 
*/ > + if (!dev_priv->perf.initialized) > + return 0; > + > + /* OA not currently configured. */ > + if (!stream) > + return 0; > + > + /* Very unlikely but possible that we have no muxes to configure. */ > + if (!stream->oa_config->mux_regs_len) > + return 0; > + > + /* Return the size of MI_LOAD_REGISTER_IMMs. */ > + return (stream->oa_config->mux_regs_len / MAX_LRI_SIZE) * 4 + 4 + > + stream->oa_config->mux_regs_len * 8; > +} > + > +u32 *i915_oa_emit_perctx_bb(struct intel_engine_cs *engine, u32 *batch) > +{ > + struct drm_i915_private *dev_priv = engine->i915; > + struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream; > + u32 n_lri, n_mux_regs; > + u32 i; > + > + lockdep_assert_held(&dev_priv->drm.struct_mutex); > + > + /* We only care about RCS. */ > + if (engine->id != RCS) > + return batch; > + > + /* Perf not supported. */ > + if (!dev_priv->perf.initialized) > + return batch; > + > + /* OA not currently configured. */ > + if (!stream) > + return batch; > + > + /* It's very unlikely, but possible that we're dealing with a config > + * with no mux to configure. > + */ > + if (!stream->oa_config->mux_regs_len) > + return batch; The above could be condensed into if (i915_oa_get_perctx_bb_size(dev_priv) == 0) return batch; > + > + n_mux_regs = stream->oa_config->mux_regs_len; > + n_lri = (n_mux_regs / MAX_LRI_SIZE) + (n_mux_regs % MAX_LRI_SIZE) != 0; > + > + for (i = 0; i < n_mux_regs; i++) { > + if ((i % MAX_LRI_SIZE) == 0) { > + n_lri = min(n_mux_regs - i, MAX_LRI_SIZE); > + *batch++ = MI_LOAD_REGISTER_IMM(n_lri); > + } > + > + *batch++ = i915_mmio_reg_offset(stream->oa_config->mux_regs[i].addr); > + *batch++ = stream->oa_config->mux_regs[i].value; > + } I would have personally used a double loop. But at least kill that first n_lri, that was a moment of confusion spent trying to work out what you were using it for. 
> + > + return batch; > +} > /** > * intel_sanitize_enable_execlists() - sanitize i915.enable_execlists > @@ -1055,6 +1057,8 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) > */ > static u32 *gen8_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch) > { > + batch = i915_oa_emit_perctx_bb(engine, batch); > + > /* WaDisableCtxRestoreArbitration:bdw,chv */ > *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; > *batch++ = MI_BATCH_BUFFER_END; > @@ -1118,21 +1122,27 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) > > static u32 *gen9_init_perctx_bb(struct intel_engine_cs *engine, u32 *batch) > { > + batch = i915_oa_emit_perctx_bb(engine, batch); Wrong wa_bb. This is emitted at the start of every bb, you want indirectctx_bb which is emitted after a context switch. Or at least I hope you don't need such a heavy handed approach. If you do, can you convince me that you are really not in conflict with the application? (Earlier you said it only needs ctx switch.) > +int logical_render_ring_reload_wa_bb(struct intel_engine_cs *engine) intel_lrc_update_wa_bb(). > +{ > + struct drm_i915_private *dev_priv = engine->i915; > + struct i915_ctx_workarounds new_wa_ctx; > + struct i915_gem_context *ctx; > + int ret; > + > + if (WARN_ON(engine->id != RCS)) > + return -EINVAL; > + > + memset(&new_wa_ctx, 0, sizeof(new_wa_ctx)); > + ret = intel_init_workaround_bb(engine, &new_wa_ctx); > + if (ret) > + return ret; > + > + if (engine->wa_ctx.vma) > + lrc_destroy_wa_ctx(engine); Couldn't we at least try to reuse the existing vma first? _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx