On Tue, Jul 14, 2020 at 12:22:39AM -0700, Umesh Nerlige Ramappa wrote:
From: Piotr Maciejewski <piotr.maciejewski@xxxxxxxxx>

i915 currently supports a time-based sampling mode, which is good for overall system monitoring but is not sufficient for the query mode used to measure a single draw call or dispatch. Gen9-Gen11 use the current i915 perf implementation for queries, but Gen12+ requires a new approach based on triggered reports within the OA buffer.

To enable this feature, two changes are required:

1. Whitelist update:
   - enable triggered reports within the OA buffer
   - allow reading the OA buffer head/tail/status information
   - allow reading the GPU ticks counter

2. Map the OA buffer at the UMD (user-mode driver) level to address the following limitations of the time-based sampling interface:
   - it takes longer to access reports collected in the OA buffer
   - browsing OA reports is slow because the OA buffer is large
   - there is no OA report index, so a query cannot jump to a report directly

   In addition, with direct access to the OA buffer a query can extract other useful reports, such as the context switch information needed to calculate correct performance counter values.
Signed-off-by: Piotr Maciejewski <piotr.maciejewski@xxxxxxxxx> --- drivers/gpu/drm/i915/gt/intel_workarounds.c | 54 ++++++++ drivers/gpu/drm/i915/i915_perf.c | 130 +++++++++++++++++++- drivers/gpu/drm/i915/i915_perf_types.h | 13 ++ drivers/gpu/drm/i915/i915_reg.h | 14 +++ include/uapi/drm/i915_drm.h | 19 +++ 5 files changed, 227 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 5726cd0a37e0..cf89928fc3a5 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -1365,6 +1365,48 @@ whitelist_reg(struct i915_wa_list *wal, i915_reg_t reg) whitelist_reg_ext(wal, reg, RING_FORCE_TO_NONPRIV_ACCESS_RW); } +static void gen9_whitelist_build_performance_counters(struct i915_wa_list *w) +{ + /* OA buffer trigger report 2/6 used by performance query */ + whitelist_reg(w, OAREPORTTRIG2); + whitelist_reg(w, OAREPORTTRIG6); + + /* Performance counters A18-20 used by tbs marker query */ + whitelist_reg_ext(w, OA_PERF_COUNTER_A18, + RING_FORCE_TO_NONPRIV_ACCESS_RW | + RING_FORCE_TO_NONPRIV_RANGE_16);
The whitelist entry above (RANGE_16) should be split up so that it covers only A18-A20:

whitelist_reg_ext(w, OA_PERF_COUNTER_A18, RING_FORCE_TO_NONPRIV_ACCESS_RW | RING_FORCE_TO_NONPRIV_RANGE_4);
whitelist_reg(w, OA_PERF_COUNTER_A20);
whitelist_reg(w, OA_PERF_COUNTER_A20_UPPER);
+ + /* Read access to gpu ticks */ + whitelist_reg_ext(w, GEN8_GPU_TICKS, + RING_FORCE_TO_NONPRIV_ACCESS_RD); + + /* Read access to: oa status, head, tail, buffer settings */ + whitelist_reg_ext(w, GEN8_OASTATUS, + RING_FORCE_TO_NONPRIV_ACCESS_RD | + RING_FORCE_TO_NONPRIV_RANGE_4); +} + +static void gen12_whitelist_build_performance_counters(struct i915_wa_list *w) +{ + /* OA buffer trigger report 2/6 used by performance query */ + whitelist_reg(w, GEN12_OAG_OAREPORTTRIG2); + whitelist_reg(w, GEN12_OAG_OAREPORTTRIG6); + + /* Performance counters A18-20 used by tbs marker query */ + whitelist_reg_ext(w, GEN12_OAG_PERF_COUNTER_A18, + RING_FORCE_TO_NONPRIV_ACCESS_RW | + RING_FORCE_TO_NONPRIV_RANGE_16);
Same as the comment above: split this GEN12_OAG_PERF_COUNTER_A18 RANGE_16 entry in the same way as the gen9 one.
+ + /* Read access to gpu ticks */ + whitelist_reg_ext(w, GEN12_OAG_GPU_TICKS, + RING_FORCE_TO_NONPRIV_ACCESS_RD); + + /* Read access to: oa status, head, tail, buffer settings */ + whitelist_reg_ext(w, GEN12_OAG_OASTATUS, + RING_FORCE_TO_NONPRIV_ACCESS_RD | + RING_FORCE_TO_NONPRIV_RANGE_4); +} + static void gen9_whitelist_build(struct i915_wa_list *w) { /* WaVFEStateAfterPipeControlwithMediaStateClear:skl,bxt,glk,cfl */ @@ -1378,6 +1420,9 @@ static void gen9_whitelist_build(struct i915_wa_list *w) /* WaSendPushConstantsFromMMIO:skl,bxt */ whitelist_reg(w, COMMON_SLICE_CHICKEN2); + + /* Performance counters support */ + gen9_whitelist_build_performance_counters(w); } static void skl_whitelist_build(struct intel_engine_cs *engine) @@ -1471,6 +1516,9 @@ static void cnl_whitelist_build(struct intel_engine_cs *engine) /* WaEnablePreemptionGranularityControlByUMD:cnl */ whitelist_reg(w, GEN8_CS_CHICKEN1); + + /* Performance counters support */ + gen9_whitelist_build_performance_counters(w); } static void icl_whitelist_build(struct intel_engine_cs *engine) @@ -1500,6 +1548,9 @@ static void icl_whitelist_build(struct intel_engine_cs *engine) whitelist_reg_ext(w, PS_INVOCATION_COUNT, RING_FORCE_TO_NONPRIV_ACCESS_RD | RING_FORCE_TO_NONPRIV_RANGE_4); + + /* Performance counters support */ + gen9_whitelist_build_performance_counters(w); break; case VIDEO_DECODE_CLASS: @@ -1550,6 +1601,9 @@ static void tgl_whitelist_build(struct intel_engine_cs *engine) /* Wa_1806527549:tgl */ whitelist_reg(w, HIZ_CHICKEN); + + /* Performance counters support */ + gen12_whitelist_build_performance_counters(w); break; default: whitelist_reg_ext(w, diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index c6f6370283cf..06a3fff52dfa 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -192,6 +192,7 @@ */ #include <linux/anon_inodes.h> +#include <linux/mman.h> #include <linux/sizes.h> #include <linux/uuid.h> @@ -434,6 +435,30 @@ 
static u32 gen7_oa_hw_tail_read(struct i915_perf_stream *stream) return oastatus1 & GEN7_OASTATUS1_TAIL_MASK; } +static u32 gen12_oa_hw_head_read(struct i915_perf_stream *stream) +{ + struct intel_uncore *uncore = stream->uncore; + + return intel_uncore_read(uncore, GEN12_OAG_OAHEADPTR) & + GEN12_OAG_OAHEADPTR_MASK; +} + +static u32 gen8_oa_hw_head_read(struct i915_perf_stream *stream) +{ + struct intel_uncore *uncore = stream->uncore; + + return intel_uncore_read(uncore, GEN8_OAHEADPTR) & + GEN8_OAHEADPTR_MASK; +} + +static u32 gen7_oa_hw_head_read(struct i915_perf_stream *stream) +{ + struct intel_uncore *uncore = stream->uncore; + u32 oastatus2 = intel_uncore_read(uncore, GEN7_OASTATUS2); + + return oastatus2 & GEN7_OASTATUS2_HEAD_MASK; +} + /** * oa_buffer_check_unlocked - check for data and update tail ptr state * @stream: i915 stream instance @@ -1328,6 +1353,7 @@ free_oa_buffer(struct i915_perf_stream *stream) i915_vma_unpin_and_release(&stream->oa_buffer.vma, I915_VMA_RELEASE_MAP); + stream->oa_buffer.cpu_address = 0; stream->oa_buffer.vaddr = NULL; } @@ -1448,7 +1474,8 @@ static void gen8_init_oa_buffer(struct i915_perf_stream *stream) * bit." */ intel_uncore_write(uncore, GEN8_OABUFFER, gtt_offset | - OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT); + OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT | + GEN7_OABUFFER_EDGE_TRIGGER); intel_uncore_write(uncore, GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK); /* Mark that we need updated tail pointers to read from... */ @@ -1501,7 +1528,8 @@ static void gen12_init_oa_buffer(struct i915_perf_stream *stream) * bit." 
*/ intel_uncore_write(uncore, GEN12_OAG_OABUFFER, gtt_offset | - OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT); + OABUFFER_SIZE_16M | GEN8_OABUFFER_MEM_SELECT_GGTT | + GEN7_OABUFFER_EDGE_TRIGGER); intel_uncore_write(uncore, GEN12_OAG_OATAILPTR, gtt_offset & GEN12_OAG_OATAILPTR_MASK); @@ -1562,6 +1590,7 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream) goto err_unref; } stream->oa_buffer.vma = vma; + stream->oa_buffer.cpu_address = 0; stream->oa_buffer.vaddr = i915_gem_object_pin_map(bo, I915_MAP_WB); @@ -1584,6 +1613,52 @@ static int alloc_oa_buffer(struct i915_perf_stream *stream) return ret; } +static int map_oa_buffer(struct i915_perf_stream *stream) +{ + unsigned long address = 0; + const u64 size = OA_BUFFER_SIZE; + struct i915_vma *oabuffer_vma = stream->oa_buffer.vma; + struct drm_i915_gem_object *oabuffer_obj = oabuffer_vma->obj; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + + if(stream->oa_buffer.cpu_address != 0) + return 0; + + if (!boot_cpu_has(X86_FEATURE_PAT)) + return -ENODEV; + + if (!oabuffer_obj || !oabuffer_vma) + return -ENOENT; + + if (!oabuffer_obj->base.filp) + return -ENXIO; + + if (range_overflows_t(u64, 0, size, oabuffer_obj->base.size)) + return -EINVAL; + + address = vm_mmap(oabuffer_obj->base.filp, 0, size, + PROT_READ, MAP_SHARED, 0); + + if (IS_ERR_VALUE(address)) + return address; + + if (mmap_write_lock_killable(mm)) + return -EINTR; + + vma = find_vma(mm, address); + if (vma) { + vma->vm_page_prot = + pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); + + stream->oa_buffer.cpu_address = address; + } + + mmap_write_unlock(mm); + + return vma ? 
0 : -ENOMEM; +} + static u32 *save_restore_register(struct i915_perf_stream *stream, u32 *cs, bool save, i915_reg_t reg, u32 offset, u32 dword_count) @@ -2493,6 +2568,13 @@ gen12_enable_metric_set(struct i915_perf_stream *stream, (period_exponent << GEN12_OAG_OAGLBCTXCTRL_TIMER_PERIOD_SHIFT)) : 0); + /* + * Initialize Super Queue Internal Cnt Register + * BIT(30) - PMON Enable - set in order to collect valid metrics. + */ + intel_uncore_write(uncore, GEN12_SQCNT1, + intel_uncore_read(uncore, GEN12_SQCNT1) | BIT(30)); + /* * Update all contexts prior writing the mux configurations as we need * to make sure all slices/subslices are ON before writing to NOA @@ -3199,6 +3281,39 @@ static long i915_perf_config_locked(struct i915_perf_stream *stream, return ret; } +/** + * i915_perf_get_oa_buffer_info_locked - Properties of the i915-perf OA buffer + * @arg: pointer to oa buffer info populated by this function. + */ +static int i915_perf_get_oa_buffer_info_locked(struct i915_perf_stream *stream, + unsigned long arg) +{ + struct drm_i915_perf_oa_buffer_info info; + void __user *output = (void __user *) arg; + int ret; + + if (!output) + return -EINVAL; + + memset(&info, 0, sizeof(info)); + + info.size = stream->oa_buffer.vma->size; + info.head = stream->perf->ops.oa_hw_head_read(stream); + info.tail = stream->perf->ops.oa_hw_tail_read(stream); + info.gpu_address = i915_ggtt_offset(stream->oa_buffer.vma); + + ret = map_oa_buffer(stream); + if (ret) + return ret; + + info.cpu_address = stream->oa_buffer.cpu_address; + + if (copy_to_user(output, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + /** * i915_perf_ioctl - support ioctl() usage with i915 perf stream FDs * @stream: An i915 perf stream @@ -3224,6 +3339,8 @@ static long i915_perf_ioctl_locked(struct i915_perf_stream *stream, return 0; case I915_PERF_IOCTL_CONFIG: return i915_perf_config_locked(stream, arg); + case I915_PERF_IOCTL_GET_OA_BUFFER_INFO: + return i915_perf_get_oa_buffer_info_locked(stream, arg); 
} return -EINVAL; @@ -4245,6 +4362,7 @@ void i915_perf_init(struct drm_i915_private *i915) perf->ops.oa_disable = gen7_oa_disable; perf->ops.read = gen7_oa_read; perf->ops.oa_hw_tail_read = gen7_oa_hw_tail_read; + perf->ops.oa_hw_head_read = gen7_oa_hw_head_read; perf->oa_formats = hsw_oa_formats; } else if (HAS_LOGICAL_RING_CONTEXTS(i915)) { @@ -4276,6 +4394,7 @@ void i915_perf_init(struct drm_i915_private *i915) perf->ops.enable_metric_set = gen8_enable_metric_set; perf->ops.disable_metric_set = gen8_disable_metric_set; perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read; + perf->ops.oa_hw_head_read = gen8_oa_hw_head_read; if (IS_GEN(i915, 8)) { perf->ctx_oactxctrl_offset = 0x120; @@ -4303,6 +4422,7 @@ void i915_perf_init(struct drm_i915_private *i915) perf->ops.enable_metric_set = gen8_enable_metric_set; perf->ops.disable_metric_set = gen10_disable_metric_set; perf->ops.oa_hw_tail_read = gen8_oa_hw_tail_read; + perf->ops.oa_hw_head_read = gen8_oa_hw_head_read; if (IS_GEN(i915, 10)) { perf->ctx_oactxctrl_offset = 0x128; @@ -4327,6 +4447,7 @@ void i915_perf_init(struct drm_i915_private *i915) perf->ops.enable_metric_set = gen12_enable_metric_set; perf->ops.disable_metric_set = gen12_disable_metric_set; perf->ops.oa_hw_tail_read = gen12_oa_hw_tail_read; + perf->ops.oa_hw_head_read = gen12_oa_hw_head_read; perf->ctx_flexeu0_offset = 0; perf->ctx_oactxctrl_offset = 0x144; @@ -4432,8 +4553,11 @@ int i915_perf_ioctl_version(void) * * 5: Add DRM_I915_PERF_PROP_POLL_OA_PERIOD parameter that controls the * interval for the hrtimer used to check for OA data. + * + * 6: Added an option to map oa buffer at umd driver level and trigger + * oa reports within oa buffer from command buffer. 
*/ - return 5; + return 6; } #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) diff --git a/drivers/gpu/drm/i915/i915_perf_types.h b/drivers/gpu/drm/i915/i915_perf_types.h index a36a455ae336..5b40e20c2aa9 100644 --- a/drivers/gpu/drm/i915/i915_perf_types.h +++ b/drivers/gpu/drm/i915/i915_perf_types.h @@ -251,6 +251,14 @@ struct i915_perf_stream { int format_size; int size_exponent; + /** + * @cpu_address: OA buffer cpu address. + * + * Needed to map OA buffer at umd driver level + * to obtain cpu pointer and browse reports. + */ + u64 cpu_address; + /** * @ptr_lock: Locks reads and writes to all head/tail state * @@ -377,6 +385,11 @@ struct i915_oa_ops { * generations. */ u32 (*oa_hw_tail_read)(struct i915_perf_stream *stream); + + /** + * @oa_hw_head_read: read the OA head pointer register + */ + u32 (*oa_hw_head_read)(struct i915_perf_stream *stream); }; struct i915_perf { diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 86a23ced051b..2e3d264339e0 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -675,6 +675,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define GEN7_OASTATUS2_HEAD_MASK 0xffffffc0 #define GEN7_OASTATUS2_MEM_SELECT_GGTT (1 << 0) /* 0: PPGTT, 1: GGTT */ +#define GEN8_GPU_TICKS _MMIO(0x2910) #define GEN8_OASTATUS _MMIO(0x2b08) #define GEN8_OASTATUS_OVERRUN_STATUS (1 << 3) #define GEN8_OASTATUS_COUNTER_OVERFLOW (1 << 2) @@ -696,6 +697,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define OABUFFER_SIZE_16M (7 << 3) #define GEN12_OA_TLB_INV_CR _MMIO(0xceec) +#define GEN12_SQCNT1 _MMIO(0x8718) /* Gen12 OAR unit */ #define GEN12_OAR_OACONTROL _MMIO(0x2960) @@ -731,6 +733,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define GEN12_OAG_OA_DEBUG_DISABLE_GO_1_0_REPORTS (1 << 2) #define GEN12_OAG_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1 << 1) +#define GEN12_OAG_GPU_TICKS _MMIO(0xda90) #define GEN12_OAG_OASTATUS _MMIO(0xdafc) #define 
GEN12_OAG_OASTATUS_COUNTER_OVERFLOW (1 << 2) #define GEN12_OAG_OASTATUS_BUFFER_OVERFLOW (1 << 1) @@ -972,6 +975,17 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define OAREPORTTRIG8_NOA_SELECT_6_SHIFT 24 #define OAREPORTTRIG8_NOA_SELECT_7_SHIFT 28 +/* Performance counters registers */ +#define OA_PERF_COUNTER_A18 _MMIO(0x2890) +#define OA_PERF_COUNTER_A19 _MMIO(0x2898) +#define OA_PERF_COUNTER_A20 _MMIO(0x28A0) + +/* Gen12 Performance counters registers */ +#define GEN12_OAG_PERF_COUNTER_A16 _MMIO(0xDA00)
GEN12_OAG_PERF_COUNTER_A16 (the define just above) is unused in this patch; please remove it.
+#define GEN12_OAG_PERF_COUNTER_A18 _MMIO(0xDA10) +#define GEN12_OAG_PERF_COUNTER_A19 _MMIO(0xDA18) +#define GEN12_OAG_PERF_COUNTER_A20 _MMIO(0xDA20) + /* Same layout as OASTARTTRIGX */ #define GEN12_OAG_OASTARTTRIG1 _MMIO(0xd900) #define GEN12_OAG_OASTARTTRIG2 _MMIO(0xd904) diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h index 14b67cd6b54b..62b88c0123c8 100644 --- a/include/uapi/drm/i915_drm.h +++ b/include/uapi/drm/i915_drm.h @@ -2048,6 +2048,25 @@ struct drm_i915_perf_open_param { */ #define I915_PERF_IOCTL_CONFIG _IO('i', 0x2) +/** + * Returns OA buffer properties. + * + * This ioctl is available in perf revision 6. + */ +#define I915_PERF_IOCTL_GET_OA_BUFFER_INFO _IO('i', 0x3) + +/** + * OA buffer information structure. + */ +struct drm_i915_perf_oa_buffer_info { + __u32 size; + __u32 head; + __u32 tail; + __u32 gpu_address; + __u64 cpu_address; + __u64 reserved[4]; +}; + /** * Common to all i915 perf records */ -- 2.20.1 _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx