Quoting Chris Wilson (2018-05-09 09:59:09) > Quoting Lionel Landwerlin (2018-05-08 19:03:45) > > If some of the contexts submitting workloads to the GPU have been > > configured to shutdown slices/subslices, we might lose the NOA > > configurations written in the NOA muxes. We need to reprogram them > > when we detect a powergating configuration change. > > > > In this change i915/perf is responsible for setting up a reprogramming > > batchbuffer which we execute just before the userspace submitted > > batchbuffer. We do this while preemption is still disabled, only if > > needed. The decision to execute this reprogramming batchbuffer is made > > when we assign a request to an execlist port. > > > > v2: Only reprogram when detecting configuration changes (Chris/Lionel) > > > > Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@xxxxxxxxx> > > --- > > drivers/gpu/drm/i915/i915_drv.h | 6 ++ > > drivers/gpu/drm/i915/i915_perf.c | 108 ++++++++++++++++++++++ > > drivers/gpu/drm/i915/i915_request.h | 6 ++ > > drivers/gpu/drm/i915/intel_gpu_commands.h | 1 + > > drivers/gpu/drm/i915/intel_lrc.c | 59 +++++++++++- > > drivers/gpu/drm/i915/intel_ringbuffer.c | 2 + > > drivers/gpu/drm/i915/intel_ringbuffer.h | 2 + > > 7 files changed, 183 insertions(+), 1 deletion(-) > > > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > > index 04e27806e581..07cdbfb4ad7a 100644 > > --- a/drivers/gpu/drm/i915/i915_drv.h > > +++ b/drivers/gpu/drm/i915/i915_drv.h > > @@ -1461,6 +1461,12 @@ struct i915_perf_stream { > > * @oa_config: The OA configuration used by the stream. > > */ > > struct i915_oa_config *oa_config; > > + > > + /** > > + * @noa_reprogram_vma: A batchbuffer reprogramming the NOA muxes, used > > + * after switching powergating configurations. 
> > + */ > > + struct i915_vma *noa_reprogram_vma; > > }; > > > > /** > > diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c > > index 5b279a82445a..1b9e3d6a53a2 100644 > > --- a/drivers/gpu/drm/i915/i915_perf.c > > +++ b/drivers/gpu/drm/i915/i915_perf.c > > @@ -1691,6 +1691,96 @@ static int gen8_emit_oa_config(struct i915_request *rq, > > return 0; > > } > > > > +#define MAX_LRI_SIZE (125U) > > + > > +static u32 noa_reprogram_bb_size(struct i915_oa_config *oa_config) > > +{ > > + u32 n_lri; > > + > > + /* Very unlikely but possible that we have no muxes to configure. */ > > + if (!oa_config->mux_regs_len) > > + return 0; > > + > > + n_lri = (oa_config->mux_regs_len / MAX_LRI_SIZE) + > > + (oa_config->mux_regs_len % MAX_LRI_SIZE) != 0; > > + > > + return n_lri * 4 + oa_config->mux_regs_len * 8 + /* MI_LOAD_REGISTER_IMMs */ > > + 6 * 4 + /* PIPE_CONTROL */ > > + 4; /* MI_BATCH_BUFFER_END */ > > +} > > + > > +static int > > +alloc_noa_reprogram_bo(struct i915_perf_stream *stream) > > +{ > > + struct drm_i915_private *dev_priv = stream->dev_priv; > > + struct i915_oa_config *oa_config = stream->oa_config; > > + struct drm_i915_gem_object *bo; > > + struct i915_vma *vma; > > + u32 buffer_size; > > + u32 *cs; > > + int i, ret, n_loaded_regs; > > + > > + buffer_size = ALIGN(noa_reprogram_bb_size(oa_config), PAGE_SIZE); > > + if (buffer_size == 0) > > + return 0; > > + > > + bo = i915_gem_object_create(dev_priv, buffer_size); > > + if (IS_ERR(bo)) { > > + DRM_ERROR("Failed to allocate NOA reprogramming buffer\n"); > > + return PTR_ERR(bo); > > + } > > + > > + cs = i915_gem_object_pin_map(bo, I915_MAP_WB); > > + if (IS_ERR(cs)) { > > + ret = PTR_ERR(cs); > > + goto err_unref_bo; > > + } > > + > > + n_loaded_regs = 0; > > + for (i = 0; i < oa_config->mux_regs_len; i++) { > > + if ((n_loaded_regs % MAX_LRI_SIZE) == 0) { > > + u32 n_lri = min(oa_config->mux_regs_len - n_loaded_regs, > > + MAX_LRI_SIZE); > > + *cs++ = MI_LOAD_REGISTER_IMM(n_lri); 
> > + } > > + > > + *cs++ = i915_mmio_reg_offset(oa_config->mux_regs[i].addr); > > + *cs++ = oa_config->mux_regs[i].value; > > + n_loaded_regs++; > > + } > > + > > + cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_MMIO_WRITE, 0); > > What's the pc for? Isn't it a bit dangerous to request a mmio write to > to reg 0? > > > + > > + *cs++ = MI_BATCH_BUFFER_END; > > + > > + i915_gem_object_unpin_map(bo); > > + > > + ret = i915_gem_object_set_to_gtt_domain(bo, false); > > + if (ret) > > + goto err_unref_bo; > > + > > + vma = i915_vma_instance(bo, &dev_priv->ggtt.base, NULL); > > + if (IS_ERR(vma)) { > > + ret = PTR_ERR(vma); > > + goto err_unref_bo; > > + } > > + > > + ret = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL); > > + if (ret) > > + goto err_unref_vma; > > No GGTT access, so just PIN_USER for GPU access. > > > + stream->noa_reprogram_vma = vma; > > + > > + return 0; > > + > > +err_unref_vma: > > + i915_vma_put(vma); > > +err_unref_bo: > > + i915_gem_object_put(bo); > > + > > + return ret; > > +} > > + > > static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv, > > const struct i915_oa_config *oa_config) > > { > > @@ -2102,6 +2192,12 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, > > goto err_config; > > } > > > > + ret = alloc_noa_reprogram_bo(stream); > > + if (ret) { > > + DRM_DEBUG("Enable to alloc NOA reprogramming BO\n"); > > + goto err_config; > > + } > > + > > /* PRM - observability performance counters: > > * > > * OACONTROL, performance counter enable, note: > > @@ -2136,6 +2232,13 @@ static int i915_oa_stream_init(struct i915_perf_stream *stream, > > > > dev_priv->perf.oa.exclusive_stream = stream; > > > > + ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv, > > + stream->oa_config); > > + if (ret) > > + goto err_enable; > > + > > + stream->ops = &i915_oa_stream_ops; > > + > > mutex_unlock(&dev_priv->drm.struct_mutex); > > > > return 0; > > @@ -2497,6 +2600,11 @@ static void 
i915_perf_destroy_locked(struct i915_perf_stream *stream) > > if (stream->ctx) > > i915_gem_context_put(stream->ctx); > > > > + if (stream->noa_reprogram_vma) { > > + i915_vma_unpin(stream->noa_reprogram_vma); > > + i915_gem_object_put(stream->noa_reprogram_vma->obj); > > This will be unhappy due to the lack of active tracking. > i915_vma_unpin_and_release(&stream->noa_reprogram_vma). > > Hmm, worse than that. It's not being tracked at all, so expect it to be > freed while still in use. > > > + } > > + > > kfree(stream); > > } > > > > diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h > > index eddbd4245cb3..3357ceb4bdb1 100644 > > --- a/drivers/gpu/drm/i915/i915_request.h > > +++ b/drivers/gpu/drm/i915/i915_request.h > > @@ -149,6 +149,12 @@ struct i915_request { > > /** Preallocate space in the ring for the emitting the request */ > > u32 reserved_space; > > > > + /* > > + * Pointer in the batchbuffer to where the i915/perf NOA reprogramming > > + * can be inserted just before HW submission. > > + */ > > + u32 *perf_prog_start; > > Just u32 offset, no need for the extra space of a pointer here. > (Probably should just limit rings to 64k and pack the offsets into u16.) > Or have them as offset from head, and u8 dwords? Hmm. > > I have cold shivers from the thought of more special locations inside > the batch. Not the first, and not the last, so I feel like there should > be a better way (or at least more consistent). > > > + > > /** Batch buffer related to this request if any (used for > > * error state dump only). 
> > */ > > diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h > > index 105e2a9e874a..09b0fded8b17 100644 > > --- a/drivers/gpu/drm/i915/intel_gpu_commands.h > > +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h > > @@ -146,6 +146,7 @@ > > #define MI_BATCH_GTT (2<<6) /* aliased with (1<<7) on gen4 */ > > #define MI_BATCH_BUFFER_START_GEN8 MI_INSTR(0x31, 1) > > #define MI_BATCH_RESOURCE_STREAMER (1<<10) > > +#define MI_BATCH_SECOND_LEVEL (1<<22) > > > > /* > > * 3D instructions used by the kernel > > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c > > index 1c08bd2be6c3..df4a7bb07eae 100644 > > --- a/drivers/gpu/drm/i915/intel_lrc.c > > +++ b/drivers/gpu/drm/i915/intel_lrc.c > > @@ -510,6 +510,38 @@ static bool can_merge_ctx(const struct i915_gem_context *prev, > > return true; > > } > > > > +static void maybe_enable_noa_repgoram(struct i915_request *rq) > > +{ > > + struct intel_engine_cs *engine = rq->engine; > > + struct drm_i915_private *dev_priv = engine->i915; > > + struct i915_perf_stream *stream = dev_priv->perf.oa.exclusive_stream; > > Could there be any more pointer chasing? </chandler> Thinking about more about how to make this part cleaner, could we not store the engine->noa_batch and then this all becomes vma = engine->noa_batch; if (vma) return; Locking! Missed it in the first pass, but we are unlocked here... That also ties into lifetimes. I'm thinking more like ctx->noa_batch with that being pinned along with the context. We need something along those lines to track the locking/lifetime. -Chris _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx