From: Sourab Gupta <sourab.gupta@xxxxxxxxx> This patch adds the mechanism for forwarding the asynchronous OA snapshots through the perf event interface. Each node of data collected is forwarded as a separate perf sample. A single snapshot has two fields: the first is the raw report, and the second is a footer with metadata corresponding to the snapshot, such as ctx_id and pid. The size of the raw report is the one specified during event init. The samples will be forwarded in a workqueue, which is scheduled when the hrtimer triggers. In the workqueue, each node of data collected will be forwarded as a separate perf sample. Signed-off-by: Sourab Gupta <sourab.gupta@xxxxxxxxx> --- drivers/gpu/drm/i915/i915_drv.h | 5 +- drivers/gpu/drm/i915/i915_oa_perf.c | 158 +++++++++++++++++++++++++++++++++++- 2 files changed, 161 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index da150bc..d738f7a 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1691,7 +1691,8 @@ struct drm_i915_oa_async_queue_header { struct drm_i915_oa_async_node_info { __u32 pid; __u32 ctx_id; - __u32 pad[14]; + struct drm_i915_gem_request *req; + __u32 pad[12]; }; struct drm_i915_oa_async_node { @@ -1975,7 +1976,9 @@ struct drm_i915_private { u32 tail; int format; int format_size; + u8 *snapshot; } oa_async_buffer; + struct work_struct work_timer; } oa_pmu; #endif diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c index 419b6a5..3bf4c47 100644 --- a/drivers/gpu/drm/i915/i915_oa_perf.c +++ b/drivers/gpu/drm/i915/i915_oa_perf.c @@ -25,6 +25,128 @@ static int hsw_perf_format_sizes[] = { 64 /* C4_B8_HSW */ }; +static void init_oa_async_buf_queue(struct drm_i915_private *dev_priv) +{ + struct drm_i915_oa_async_queue_header *hdr = + (struct drm_i915_oa_async_queue_header *) + dev_priv->oa_pmu.oa_async_buffer.addr; + void *data_ptr; + + hdr->size_in_bytes = 
dev_priv->oa_pmu.oa_async_buffer.obj->base.size; + /* 64 bit alignment for OA node address */ + data_ptr = PTR_ALIGN((void *)(hdr + 1), 64); + hdr->data_offset = (__u64)(data_ptr - (void *)hdr); + + hdr->node_count = 0; + hdr->wrap_count = 0; +} + +static void forward_one_oa_async_sample(struct drm_i915_private *dev_priv, + struct drm_i915_oa_async_node *node) +{ + struct perf_sample_data data; + struct perf_event *event = dev_priv->oa_pmu.exclusive_event; + int format_size, snapshot_size; + u8 *snapshot; + struct perf_raw_record raw; + + format_size = dev_priv->oa_pmu.oa_async_buffer.format_size; + snapshot_size = format_size + + sizeof(struct drm_i915_oa_async_node_footer); + snapshot = dev_priv->oa_pmu.oa_async_buffer.snapshot; + + memcpy(snapshot, node, format_size); + memcpy(snapshot + format_size, &node->node_info, + sizeof(struct drm_i915_oa_async_node_footer)); + + perf_sample_data_init(&data, 0, event->hw.last_period); + + /* Note: the combined u32 raw->size member + raw data itself must be 8 + * byte aligned. 
(See note in init_oa_buffer for more details) */ + raw.size = snapshot_size + 4; + raw.data = snapshot; + + data.raw = &raw; + + perf_event_overflow(event, &data, &dev_priv->oa_pmu.dummy_regs); +} + +void i915_oa_async_wait_gpu(struct drm_i915_private *dev_priv) +{ + struct drm_i915_oa_async_queue_header *hdr = + (struct drm_i915_oa_async_queue_header *) + dev_priv->oa_pmu.oa_async_buffer.addr; + struct drm_i915_oa_async_node *first_node, *node; + int ret, head, tail, num_nodes; + struct drm_i915_gem_request *req; + + first_node = (struct drm_i915_oa_async_node *) + ((char *)hdr + hdr->data_offset); + num_nodes = (hdr->size_in_bytes - hdr->data_offset) / + sizeof(*node); + + + tail = hdr->node_count; + head = dev_priv->oa_pmu.oa_async_buffer.head; + + /* wait for all requests to complete*/ + while ((head % num_nodes) != (tail % num_nodes)) { + node = &first_node[head % num_nodes]; + req = node->node_info.req; + if (req) { + if (!i915_gem_request_completed(req, true)) { + ret = i915_wait_request(req); + if (ret) + DRM_DEBUG_DRIVER( + "oa async: failed to wait\n"); + } + i915_gem_request_assign(&node->node_info.req, NULL); + } + head++; + } +} + +void forward_oa_async_snapshots_work(struct work_struct *__work) +{ + struct drm_i915_private *dev_priv = + container_of(__work, typeof(*dev_priv), + oa_pmu.work_timer); + struct drm_i915_oa_async_queue_header *hdr = + (struct drm_i915_oa_async_queue_header *) + dev_priv->oa_pmu.oa_async_buffer.addr; + struct drm_i915_oa_async_node *first_node, *node; + int ret, head, tail, num_nodes; + struct drm_i915_gem_request *req; + + first_node = (struct drm_i915_oa_async_node *) + ((char *)hdr + hdr->data_offset); + num_nodes = (hdr->size_in_bytes - hdr->data_offset) / + sizeof(*node); + + ret = i915_mutex_lock_interruptible(dev_priv->dev); + if (ret) + return; + + tail = hdr->node_count; + head = dev_priv->oa_pmu.oa_async_buffer.head; + + while ((head % num_nodes) != (tail % num_nodes)) { + node = &first_node[head % num_nodes]; + 
req = node->node_info.req; + if (req && i915_gem_request_completed(req, true)) { + forward_one_oa_async_sample(dev_priv, node); + i915_gem_request_assign(&node->node_info.req, NULL); + head++; + } else + break; + } + + dev_priv->oa_pmu.oa_async_buffer.tail = tail; + dev_priv->oa_pmu.oa_async_buffer.head = head; + + mutex_unlock(&dev_priv->dev->struct_mutex); +} + static void forward_one_oa_snapshot_to_event(struct drm_i915_private *dev_priv, u8 *snapshot, struct perf_event *event) @@ -58,6 +180,14 @@ static u32 forward_oa_snapshots(struct drm_i915_private *dev_priv, u8 *snapshot; u32 taken; + /* + * Schedule a wq to forward the async samples collected. We schedule + * wq here, since it requires device mutex to be taken which can't be + * done here because of atomic context + */ + if (dev_priv->oa_pmu.async_sample_mode) + schedule_work(&dev_priv->oa_pmu.work_timer); + head -= dev_priv->oa_pmu.oa_buffer.gtt_offset; tail -= dev_priv->oa_pmu.oa_buffer.gtt_offset; @@ -176,6 +306,8 @@ oa_async_buffer_destroy(struct drm_i915_private *i915) i915->oa_pmu.oa_async_buffer.obj = NULL; i915->oa_pmu.oa_async_buffer.addr = NULL; + kfree(i915->oa_pmu.oa_async_buffer.snapshot); + mutex_unlock(&i915->dev->struct_mutex); } @@ -358,7 +490,7 @@ static int init_async_oa_buffer(struct perf_event *event) struct drm_i915_private *dev_priv = container_of(event->pmu, typeof(*dev_priv), oa_pmu.pmu); struct drm_i915_gem_object *bo; - int ret; + int snapshot_size, ret; BUG_ON(!IS_HASWELL(dev_priv->dev)); BUG_ON(dev_priv->oa_pmu.oa_async_buffer.obj); @@ -374,6 +506,12 @@ static int init_async_oa_buffer(struct perf_event *event) dev_priv->oa_pmu.oa_async_buffer.obj = bo; dev_priv->oa_pmu.oa_async_buffer.addr = vmap_oa_buffer(bo); + init_oa_async_buf_queue(dev_priv); + + snapshot_size = dev_priv->oa_pmu.oa_async_buffer.format_size + + sizeof(struct drm_i915_oa_async_node_footer); + dev_priv->oa_pmu.oa_async_buffer.snapshot = + kmalloc(snapshot_size, GFP_KERNEL); DRM_DEBUG_DRIVER("OA Async Buffer 
initialized, vaddr = %p", dev_priv->oa_pmu.oa_async_buffer.addr); @@ -814,6 +952,11 @@ static void i915_oa_event_stop(struct perf_event *event, int flags) flush_oa_snapshots(dev_priv, false); } + if (dev_priv->oa_pmu.async_sample_mode) { + dev_priv->oa_pmu.oa_async_buffer.tail = 0; + dev_priv->oa_pmu.oa_async_buffer.head = 0; + } + event->hw.state = PERF_HES_STOPPED; } @@ -844,7 +987,15 @@ static int i915_oa_event_flush(struct perf_event *event) if (event->attr.sample_period) { struct drm_i915_private *i915 = container_of(event->pmu, typeof(*i915), oa_pmu.pmu); + int ret; + if (i915->oa_pmu.async_sample_mode) { + ret = i915_mutex_lock_interruptible(i915->dev); + if (ret) + return ret; + i915_oa_async_wait_gpu(i915); + mutex_unlock(&i915->dev->struct_mutex); + } flush_oa_snapshots(i915, true); } @@ -940,6 +1091,8 @@ void i915_oa_pmu_register(struct drm_device *dev) hrtimer_init(&i915->oa_pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); i915->oa_pmu.timer.function = hrtimer_sample; + INIT_WORK(&i915->oa_pmu.work_timer, forward_oa_async_snapshots_work); + spin_lock_init(&i915->oa_pmu.lock); i915->oa_pmu.pmu.capabilities = PERF_PMU_CAP_IS_DEVICE; @@ -969,6 +1122,9 @@ void i915_oa_pmu_unregister(struct drm_device *dev) if (i915->oa_pmu.pmu.event_init == NULL) return; + if (i915->oa_pmu.async_sample_mode) + cancel_work_sync(&i915->oa_pmu.work_timer); + unregister_sysctl_table(i915->oa_pmu.sysctl_header); perf_pmu_unregister(&i915->oa_pmu.pmu); -- 1.8.5.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/intel-gfx