From: Sourab Gupta <sourab.gupta@xxxxxxxxx> This patch adds support for retrieving MMIO register values along with timestamps and forwarding them to userspace through perf. The userspace can request up to 8 MMIO register values to be dumped. The addresses of up to 8 MMIO registers can be passed through perf attr config. The registers are checked against a whitelist before passing them on. The commands to dump the values of these MMIO registers are then inserted into the ring along with commands to dump the timestamps. v2: Implement suggestions by Chris, pertaining to code restructuring, using BUILD_BUG_ON etc. Signed-off-by: Sourab Gupta <sourab.gupta@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.h | 2 + drivers/gpu/drm/i915/i915_oa_perf.c | 99 ++++++++++++++++++++++++++++++++++--- include/uapi/drm/i915_drm.h | 11 ++++- 3 files changed, 104 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index c3e823f..5c6e37a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2022,7 +2022,9 @@ struct drm_i915_private { #define I915_GEN_PMU_SAMPLE_RING (1<<0) #define I915_GEN_PMU_SAMPLE_PID (1<<1) #define I915_GEN_PMU_SAMPLE_TAG (1<<2) +#define I915_GEN_PMU_SAMPLE_MMIO (1<<3) int sample_info_flags; + u32 mmio_list[I915_PMU_MMIO_NUM]; } gen_pmu; void (*emit_profiling_data[I915_PROFILE_MAX]) diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c index e065e06..4197dbd 100644 --- a/drivers/gpu/drm/i915/i915_oa_perf.c +++ b/drivers/gpu/drm/i915/i915_oa_perf.c @@ -12,6 +12,7 @@ #define PERIOD max_t(u64, 10000, NSEC_PER_SEC / FREQUENCY) #define TS_DATA_SIZE sizeof(struct drm_i915_ts_data) +#define MMIO_DATA_SIZE sizeof(struct drm_i915_mmio_data) #define CTX_INFO_SIZE sizeof(struct drm_i915_ts_node_ctx_id) #define RING_INFO_SIZE sizeof(struct drm_i915_ts_node_ring_id) #define PID_INFO_SIZE sizeof(struct drm_i915_ts_node_pid) @@ -129,8 +130,8 @@ static void 
i915_gen_emit_ts_data(struct drm_i915_gem_request *req, struct drm_i915_private *dev_priv = ring->dev->dev_private; struct drm_i915_gem_object *obj = dev_priv->gen_pmu.buffer.obj; struct i915_gen_pmu_node *entry; - u32 addr = 0; - int ret; + u32 mmio_addr, addr = 0; + int ret, i, count = 0; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (entry == NULL) { @@ -138,7 +139,12 @@ static void i915_gen_emit_ts_data(struct drm_i915_gem_request *req, return; } - ret = intel_ring_begin(ring, 6); + for (count = 0; count < I915_PMU_MMIO_NUM; count++) { + if (0 == dev_priv->gen_pmu.mmio_list[count]) + break; + } + + ret = intel_ring_begin(ring, 6 + 4*count); if (ret) { kfree(entry); return; @@ -173,6 +179,7 @@ static void i915_gen_emit_ts_data(struct drm_i915_gem_request *req, spin_unlock(&dev_priv->gen_pmu.lock); addr = dev_priv->gen_pmu.buffer.gtt_offset + entry->offset; + mmio_addr = addr + TS_DATA_SIZE; if (ring->id == RCS) { intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5)); @@ -192,6 +199,32 @@ static void i915_gen_emit_ts_data(struct drm_i915_gem_request *req, intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_NOOP); } + + /* + * Note: + * 1) The optimization to store the register array with a single + * command doesn't seem to be working with SRM commands. Hence, have a + * loop with a single SRM command repeated. Missing anything here? + * 2) This fn is presently called before and after batch buffer. As + * such, there should already be the CS stall commands called after BB. + * Is there a need/necessity for a command barrier to be inserted in + * ring here? If so, which commands? (CS Stall?) 
+ */ + for (i = 0; i < I915_PMU_MMIO_NUM; i++) { + if (0 == dev_priv->gen_pmu.mmio_list[i]) + break; + + addr = mmio_addr + + i * sizeof(dev_priv->gen_pmu.mmio_list[i]); + + intel_ring_emit(ring, + MI_STORE_REGISTER_MEM(1) | + MI_SRM_LRM_GLOBAL_GTT); + intel_ring_emit(ring, dev_priv->gen_pmu.mmio_list[i]); + intel_ring_emit(ring, addr); + intel_ring_emit(ring, MI_NOOP); + } + intel_ring_advance(ring); obj->base.write_domain = I915_GEM_DOMAIN_RENDER; @@ -553,7 +586,7 @@ static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv, { struct perf_sample_data data; struct perf_event *event = dev_priv->gen_pmu.exclusive_event; - int snapshot_size; + int snapshot_size, mmio_size; u8 *snapshot, *current_ptr; struct drm_i915_ts_node_ctx_id *ctx_info; struct drm_i915_ts_node_ring_id *ring_info; @@ -565,10 +598,16 @@ static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv, (RING_INFO_SIZE != 8) || (PID_INFO_SIZE != 8) || (TAG_INFO_SIZE != 8)); + if (dev_priv->gen_pmu.sample_info_flags & I915_GEN_PMU_SAMPLE_MMIO) + mmio_size = MMIO_DATA_SIZE; + else + mmio_size = 0; + snapshot = dev_priv->gen_pmu.buffer.addr + node->offset; - snapshot_size = TS_DATA_SIZE + CTX_INFO_SIZE; + snapshot_size = TS_DATA_SIZE + mmio_size + CTX_INFO_SIZE; - ctx_info = (struct drm_i915_ts_node_ctx_id *)(snapshot + TS_DATA_SIZE); + ctx_info = (struct drm_i915_ts_node_ctx_id *) + (snapshot + mmio_size + TS_DATA_SIZE); ctx_info->ctx_id = node->ctx_id; current_ptr = snapshot + snapshot_size; @@ -1046,6 +1085,9 @@ static int init_gen_pmu_buffer(struct perf_event *event) if (dev_priv->gen_pmu.sample_info_flags & I915_GEN_PMU_SAMPLE_TAG) node_size += TAG_INFO_SIZE; + if (dev_priv->gen_pmu.sample_info_flags & I915_GEN_PMU_SAMPLE_MMIO) + node_size += MMIO_DATA_SIZE; + /* size has to be aligned to 8 bytes */ node_size = ALIGN(node_size, 8); dev_priv->gen_pmu.buffer.node_size = node_size; @@ -1641,6 +1683,40 @@ err_size: goto out; } + +static int check_mmio_whitelist(struct 
drm_i915_private *dev_priv, + struct drm_i915_gen_pmu_attr *gen_attr) +{ + +#define GEN_RANGE(l, h) GENMASK(h, l) + static const struct register_whitelist { + uint64_t offset; + uint32_t size; + /* supported gens, 0x10 for 4, 0x30 for 4 and 5, etc. */ + uint32_t gen_bitmask; + } whitelist[] = { + { GEN6_GT_GFX_RC6, 4, GEN_RANGE(7, 9) }, + { GEN6_GT_GFX_RC6p, 4, GEN_RANGE(7, 9) }, + }; + int i, count; + + for (count = 0; count < I915_PMU_MMIO_NUM; count++) { + if (!gen_attr->mmio_list[count]) + break; + + for (i = 0; i < ARRAY_SIZE(whitelist); i++) { + if (whitelist[i].offset == gen_attr->mmio_list[count] && + (1 << INTEL_INFO(dev_priv)->gen & + whitelist[i].gen_bitmask)) + break; + } + + if (i == ARRAY_SIZE(whitelist)) + return -EINVAL; + } + return 0; +} + static int i915_gen_event_init(struct perf_event *event) { struct drm_i915_private *dev_priv = @@ -1670,6 +1746,17 @@ static int i915_gen_event_init(struct perf_event *event) if (gen_attr.sample_tag) dev_priv->gen_pmu.sample_info_flags |= I915_GEN_PMU_SAMPLE_TAG; + if (gen_attr.sample_mmio) { + ret = check_mmio_whitelist(dev_priv, &gen_attr); + if (ret) + return ret; + + dev_priv->gen_pmu.sample_info_flags |= + I915_GEN_PMU_SAMPLE_MMIO; + memcpy(dev_priv->gen_pmu.mmio_list, gen_attr.mmio_list, + sizeof(dev_priv->gen_pmu.mmio_list)); + } + /* To avoid the complexity of having to accurately filter * data and marshal to the appropriate client * we currently only allow exclusive access */ diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index db91098..4153cdf 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -81,7 +81,7 @@ #define I915_OA_ATTR_SIZE_VER0 32 /* sizeof first published struct */ -#define I915_GEN_PMU_ATTR_SIZE_VER0 8 /* sizeof first published struct */ +#define I915_GEN_PMU_ATTR_SIZE_VER0 40 /* sizeof first published struct */ typedef struct _drm_i915_oa_attr { __u32 size; @@ -105,7 +105,9 @@ struct drm_i915_gen_pmu_attr { __u32 sample_ring:1, 
sample_pid:1, sample_tag:1, - __reserved_1:29; + sample_mmio:1, + __reserved_1:28; + __u32 mmio_list[8]; }; /* Header for PERF_RECORD_DEVICE type events */ @@ -155,6 +157,11 @@ struct drm_i915_ts_data { __u32 ts_high; }; +struct drm_i915_mmio_data { +#define I915_PMU_MMIO_NUM 8 + __u32 mmio[I915_PMU_MMIO_NUM]; +}; + struct drm_i915_ts_node_ctx_id { __u32 ctx_id; __u32 pad; -- 1.8.5.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx