From: Sourab Gupta <sourab.gupta@xxxxxxxxx> With this patch, for RCS, timestamps and OA reports can be collected together, and provided to userspace in separate sample fields. For other engines, the capability to collect timestamps is added. Note that still only a single stream instance can be opened at any particular time, though that stream may now be opened for any GPU engine, for collection of timestamp samples. So, this patch doesn't add the support to open multiple concurrent streams, as yet. Signed-off-by: Sourab Gupta <sourab.gupta@xxxxxxxxx> Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.h | 25 ++++- drivers/gpu/drm/i915/i915_perf.c | 229 ++++++++++++++++++++++++++++++++------- drivers/gpu/drm/i915/i915_reg.h | 2 + include/uapi/drm/i915_drm.h | 7 ++ 4 files changed, 222 insertions(+), 41 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c4f7462..0763280 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2182,11 +2182,30 @@ struct i915_perf_cs_sample { struct drm_i915_gem_request *request; /** - * @offset: Offset into ``&stream->cs_buffer`` - * where the perf metrics will be collected, when the commands inserted + * @start_offset: Offset into ``&stream->cs_buffer`` + * where the metrics will be collected, when the commands inserted * into the command stream are executed by GPU. */ - u32 offset; + u32 start_offset; + + /** + * @oa_offset: Offset into ``&stream->cs_buffer`` + * where the OA report will be collected (if the stream is configured + * for collection of OA samples). 
+ */ + u32 oa_offset; + + /** + * @ts_offset: Offset into ``&stream->cs_buffer`` + * where the timestamps will be collected (if the stream is configured + * for collection of timestamp data) + */ + u32 ts_offset; + + /** + * @size: buffer size corresponding to this perf sample + */ + u32 size; /** * @ctx_id: Context ID associated with this perf sample diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index c7f8e7f..2c7ab98 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -289,12 +289,17 @@ #define OAREPORT_REASON_CTX_SWITCH (1<<3) #define OAREPORT_REASON_CLK_RATIO (1<<5) -/* Data common to periodic and RCS based OA samples */ +#define OA_ADDR_ALIGN 64 +#define TS_ADDR_ALIGN 8 +#define I915_PERF_TS_SAMPLE_SIZE 8 + +/* Data common to perf samples (periodic OA / CS based OA / Timestamps) */ struct i915_perf_sample_data { u64 source; u64 ctx_id; u64 pid; u64 tag; + u64 ts; const u8 *report; }; @@ -352,6 +357,7 @@ struct i915_perf_sample_data { #define SAMPLE_CTX_ID (1<<2) #define SAMPLE_PID (1<<3) #define SAMPLE_TAG (1<<4) +#define SAMPLE_TS (1<<5) /** * struct perf_open_properties - for validated properties given to open a stream @@ -446,14 +452,12 @@ void i915_perf_emit_sample_capture(struct drm_i915_gem_request *request, static void release_perf_samples(struct i915_perf_stream *stream, u32 target_size) { - struct drm_i915_private *dev_priv = stream->dev_priv; struct i915_perf_cs_sample *sample, *next; - u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size; u32 size = 0; list_for_each_entry_safe (sample, next, &stream->cs_samples, link) { - size += sample_size; + size += sample->size; i915_gem_request_put(sample->request); list_del(&sample->link); kfree(sample); @@ -478,15 +482,24 @@ static void insert_perf_sample(struct i915_perf_stream *stream, struct drm_i915_private *dev_priv = stream->dev_priv; struct i915_perf_cs_sample *first, *last; int max_offset = stream->cs_buffer.vma->obj->base.size; - 
u32 sample_size = dev_priv->perf.oa.oa_buffer.format_size; unsigned long flags; + u32 offset, sample_size = 0; + + if (stream->sample_flags & SAMPLE_OA_REPORT) + sample_size += dev_priv->perf.oa.oa_buffer.format_size; + else if (stream->sample_flags & SAMPLE_TS) { + /* + * XXX: Since TS data can anyways be derived from OA report, so + * no need to capture it for RCS engine, if capture oa data is + * called already. + */ + sample_size += I915_PERF_TS_SAMPLE_SIZE; + } spin_lock_irqsave(&stream->cs_samples_lock, flags); if (list_empty(&stream->cs_samples)) { - sample->offset = 0; - list_add_tail(&sample->link, &stream->cs_samples); - spin_unlock_irqrestore(&stream->cs_samples_lock, flags); - return; + offset = 0; + goto out; } first = list_first_entry(&stream->cs_samples, typeof(*first), @@ -494,41 +507,61 @@ static void insert_perf_sample(struct i915_perf_stream *stream, last = list_last_entry(&stream->cs_samples, typeof(*last), link); - if (last->offset >= first->offset) { + if (last->start_offset >= first->start_offset) { /* Sufficient space available at the end of buffer? */ - if (last->offset + 2*sample_size < max_offset) - sample->offset = last->offset + sample_size; + if (last->start_offset + last->size + sample_size < max_offset) + offset = last->start_offset + last->size; /* * Wraparound condition. Is sufficient space available at * beginning of buffer? */ - else if (sample_size < first->offset) - sample->offset = 0; + else if (sample_size < first->start_offset) + offset = 0; /* Insufficient space. Overwrite existing old entries */ else { - u32 target_size = sample_size - first->offset; + u32 target_size = sample_size - first->start_offset; stream->cs_buffer.status |= I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; release_perf_samples(stream, target_size); - sample->offset = 0; + offset = 0; } } else { /* Sufficient space available? 
*/ - if (last->offset + 2*sample_size < first->offset) - sample->offset = last->offset + sample_size; + if (last->start_offset + last->size + sample_size + < first->start_offset) + offset = last->start_offset + last->size; + /* Insufficient space. Overwrite existing old entries */ else { u32 target_size = sample_size - - (first->offset - last->offset - - sample_size); + (first->start_offset - last->start_offset - + last->size); stream->cs_buffer.status |= I915_PERF_CMD_STREAM_BUF_STATUS_OVERFLOW; release_perf_samples(stream, target_size); - sample->offset = last->offset + sample_size; + offset = last->start_offset + sample_size; } } + +out: + sample->start_offset = offset; + sample->size = sample_size; + if (stream->sample_flags & SAMPLE_OA_REPORT) { + sample->oa_offset = offset; + /* Ensure 64 byte alignment of oa_offset */ + sample->oa_offset = ALIGN(sample->oa_offset, OA_ADDR_ALIGN); + offset = sample->oa_offset + + dev_priv->perf.oa.oa_buffer.format_size; + } + if (stream->sample_flags & SAMPLE_TS) { + sample->ts_offset = offset; + /* Ensure 8 byte alignment of ts_offset */ + sample->ts_offset = ALIGN(sample->ts_offset, TS_ADDR_ALIGN); + offset = sample->ts_offset + I915_PERF_TS_SAMPLE_SIZE; + } + list_add_tail(&sample->link, &stream->cs_samples); spin_unlock_irqrestore(&stream->cs_samples_lock, flags); } @@ -591,6 +624,82 @@ static int i915_emit_oa_report_capture( } /** + * i915_emit_ts_capture - Insert the commands to capture timestamp + * data into the GPU command stream + * @request: request in whose context the timestamps are being collected. + * @preallocate: allocate space in ring for related sample. 
+ * @offset: command stream buffer offset where the timestamp data needs to be + * collected + */ +static int i915_emit_ts_capture(struct drm_i915_gem_request *request, + bool preallocate, + u32 offset) +{ + struct drm_i915_private *dev_priv = request->i915; + struct intel_engine_cs *engine = request->engine; + struct i915_perf_stream *stream; + u32 addr = 0; + u32 cmd, len = 6, *cs; + int idx; + + if (preallocate) + request->reserved_space += len; + else + request->reserved_space -= len; + + cs = intel_ring_begin(request, 6); + if (IS_ERR(cs)) + return PTR_ERR(cs); + + idx = srcu_read_lock(&engine->perf_srcu); + stream = rcu_dereference(engine->exclusive_stream); + addr = stream->cs_buffer.vma->node.start + offset; + srcu_read_unlock(&engine->perf_srcu, idx); + + if (request->engine->id == RCS) { + if (INTEL_GEN(dev_priv) >= 8) + cmd = GFX_OP_PIPE_CONTROL(6); + else + cmd = GFX_OP_PIPE_CONTROL(5); + + *cs++ = cmd; + *cs++ = PIPE_CONTROL_GLOBAL_GTT_IVB | + PIPE_CONTROL_TIMESTAMP_WRITE; + *cs++ = addr | PIPE_CONTROL_GLOBAL_GTT; + *cs++ = 0; + *cs++ = 0; + + if (INTEL_GEN(dev_priv) >= 8) + *cs++ = 0; + else + *cs++ = MI_NOOP; + } else { + uint32_t cmd; + + cmd = MI_FLUSH_DW + 1; + if (INTEL_GEN(dev_priv) >= 8) + cmd += 1; + + cmd |= MI_FLUSH_DW_OP_STAMP; + + *cs++ = cmd; + *cs++ = addr | MI_FLUSH_DW_USE_GTT; + *cs++ = 0; + *cs++ = 0; + + if (INTEL_GEN(dev_priv) >= 8) + *cs++ = 0; + else + *cs++ = MI_NOOP; + *cs++ = MI_NOOP; + } + + intel_ring_advance(request, cs); + + return 0; +} + +/** * i915_perf_stream_emit_sample_capture - Insert the commands to capture perf * metrics into the GPU command stream * @stream: An i915-perf stream opened for GPU metrics @@ -625,7 +734,17 @@ static void i915_perf_stream_emit_sample_capture( if (stream->sample_flags & SAMPLE_OA_REPORT) { ret = i915_emit_oa_report_capture(request, preallocate, - sample->offset); + sample->oa_offset); + if (ret) + goto err_unref; + } else if (stream->sample_flags & SAMPLE_TS) { + /* + * XXX: Since TS 
data can anyways be derived from OA report, so + * no need to capture it for RCS engine, if capture oa data is + * called already. + */ + ret = i915_emit_ts_capture(request, preallocate, + sample->ts_offset); if (ret) goto err_unref; } @@ -947,6 +1066,12 @@ static int append_perf_sample(struct i915_perf_stream *stream, buf += 8; } + if (sample_flags & SAMPLE_TS) { + if (copy_to_user(buf, &data->ts, I915_PERF_TS_SAMPLE_SIZE)) + return -EFAULT; + buf += I915_PERF_TS_SAMPLE_SIZE; + } + if (sample_flags & SAMPLE_OA_REPORT) { if (copy_to_user(buf, data->report, report_size)) return -EFAULT; @@ -990,6 +1115,12 @@ static int append_oa_buffer_sample(struct i915_perf_stream *stream, if (sample_flags & SAMPLE_TAG) data.tag = stream->last_tag; + /* TODO: Derive timestamp from OA report, + * after scaling with the ts base + */ + if (sample_flags & SAMPLE_TS) + data.ts = 0; + if (sample_flags & SAMPLE_OA_REPORT) data.report = report; @@ -1565,7 +1696,8 @@ static int append_cs_buffer_sample(struct i915_perf_stream *stream, int ret = 0; if (sample_flags & SAMPLE_OA_REPORT) { - const u8 *report = stream->cs_buffer.vaddr + node->offset; + const u8 *report = stream->cs_buffer.vaddr + node->oa_offset; + u32 sample_ts = *(u32 *)(report + 4); data.report = report; @@ -1597,6 +1729,19 @@ static int append_cs_buffer_sample(struct i915_perf_stream *stream, stream->last_tag = node->tag; } + if (sample_flags & SAMPLE_TS) { + /* For RCS, if OA samples are also being collected, derive the + * timestamp from OA report, after scaling with the TS base. + * Else, forward the timestamp collected via command stream. 
+ */ + /* TODO: derive the timestamp from OA report */ + if (sample_flags & SAMPLE_OA_REPORT) + data.ts = 0; + else + data.ts = *(u64 *) (stream->cs_buffer.vaddr + + node->ts_offset); + } + return append_perf_sample(stream, buf, count, offset, &data); } @@ -2760,7 +2905,8 @@ static int i915_perf_stream_init(struct i915_perf_stream *stream, SAMPLE_OA_SOURCE); bool require_cs_mode = props->sample_flags & (SAMPLE_PID | SAMPLE_TAG); - bool cs_sample_data = props->sample_flags & SAMPLE_OA_REPORT; + bool cs_sample_data = props->sample_flags & (SAMPLE_OA_REPORT | + SAMPLE_TS); struct i915_perf_stream *curr_stream; struct intel_engine_cs *engine = NULL; int idx; @@ -2917,8 +3063,21 @@ static int i915_perf_stream_init(struct i915_perf_stream *stream, require_cs_mode = true; } + if (props->sample_flags & SAMPLE_TS) { + stream->sample_flags |= SAMPLE_TS; + stream->sample_size += I915_PERF_TS_SAMPLE_SIZE; + + /* + * NB: it's meaningful to request SAMPLE_TS with just CS + * mode or periodic OA mode sampling but we don't allow + * SAMPLE_TS without either mode + */ + if (!require_oa_unit) + require_cs_mode = true; + } + if (require_cs_mode && !props->cs_mode) { - DRM_ERROR("PID/TAG sampling requires a ring to be specified"); + DRM_ERROR("PID/TAG/TS sampling requires engine to be specified"); ret = -EINVAL; goto err_enable; } @@ -2932,11 +3091,11 @@ static int i915_perf_stream_init(struct i915_perf_stream *stream, /* * The only time we should allow enabling CS mode if it's not - * strictly required, is if SAMPLE_CTX_ID has been requested - * as it's usable with periodic OA or CS sampling. + * strictly required, is if SAMPLE_CTX_ID/SAMPLE_TS has been + * requested as they're usable with periodic OA or CS sampling. 
*/ if (!require_cs_mode && - !(props->sample_flags & SAMPLE_CTX_ID)) { + !(props->sample_flags & (SAMPLE_CTX_ID | SAMPLE_TS))) { DRM_ERROR("Stream engine given without requesting any CS specific property\n"); ret = -EINVAL; goto err_enable; @@ -3646,21 +3805,12 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, case DRM_I915_PERF_PROP_ENGINE: { unsigned int user_ring_id = value & I915_EXEC_RING_MASK; - enum intel_engine_id engine; if (user_ring_id > I915_USER_RINGS) return -EINVAL; - /* XXX: Currently only RCS is supported. - * Remove this check when support for other - * engines is added - */ - engine = user_ring_map[user_ring_id]; - if (engine != RCS) - return -EINVAL; - props->cs_mode = true; - props->engine = engine; + props->engine = user_ring_map[user_ring_id]; } break; case DRM_I915_PERF_PROP_SAMPLE_CTX_ID: @@ -3672,6 +3822,9 @@ static int read_properties_unlocked(struct drm_i915_private *dev_priv, case DRM_I915_PERF_PROP_SAMPLE_TAG: props->sample_flags |= SAMPLE_TAG; break; + case DRM_I915_PERF_PROP_SAMPLE_TS: + props->sample_flags |= SAMPLE_TS; + break; case DRM_I915_PERF_PROP_MAX: MISSING_CASE(id); return -EINVAL; diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 1dc7e7a..ecd5794 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -434,6 +434,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define MI_FLUSH_DW_STORE_INDEX (1<<21) #define MI_INVALIDATE_TLB (1<<18) #define MI_FLUSH_DW_OP_STOREDW (1<<14) +#define MI_FLUSH_DW_OP_STAMP (3<<14) #define MI_FLUSH_DW_OP_MASK (3<<14) #define MI_FLUSH_DW_NOTIFY (1<<8) #define MI_INVALIDATE_BSD (1<<7) @@ -517,6 +518,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define PIPE_CONTROL_TLB_INVALIDATE (1<<18) #define PIPE_CONTROL_MEDIA_STATE_CLEAR (1<<16) #define PIPE_CONTROL_QW_WRITE (1<<14) +#define PIPE_CONTROL_TIMESTAMP_WRITE (3<<14) #define PIPE_CONTROL_POST_SYNC_OP_MASK (3<<14) #define 
PIPE_CONTROL_DEPTH_STALL (1<<13) #define PIPE_CONTROL_WRITE_FLUSH (1<<12) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 0e522d4..4d27075 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -1425,6 +1425,12 @@ enum drm_i915_perf_property_id { */ DRM_I915_PERF_PROP_SAMPLE_TAG, + /** + * The value of this property set to 1 requests inclusion of timestamp + * in the perf sample data. + */ + DRM_I915_PERF_PROP_SAMPLE_TS, + DRM_I915_PERF_PROP_MAX /* non-ABI */ }; @@ -1494,6 +1500,7 @@ enum drm_i915_perf_record_type { * { u64 ctx_id; } && DRM_I915_PERF_PROP_SAMPLE_CTX_ID * { u64 pid; } && DRM_I915_PERF_PROP_SAMPLE_PID * { u64 tag; } && DRM_I915_PERF_PROP_SAMPLE_TAG + * { u64 timestamp; } && DRM_I915_PERF_PROP_SAMPLE_TS * { u32 oa_report[]; } && DRM_I915_PERF_PROP_SAMPLE_OA * }; */ -- 1.9.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx