Re: [PATCH 24/24] RFC drm/i915: Expose a PMU interface for perf queries

Dmitry Rogozhkin <dmitry.v.rogozhkin@xxxxxxxxx> · Fri, 19 May 2017 07:54:10 -0700

On 5/19/2017 1:01 AM, Chris Wilson wrote:
On Thu, May 18, 2017 at 04:48:47PM -0700, Dmitry Rogozhkin wrote:

On 5/18/2017 2:46 AM, Chris Wilson wrote:
The first goal is to be able to measure GPU (and invidual ring) busyness
without having to poll registers from userspace. (Which not only incurs
holding the forcewake lock indefinitely, perturbing the system, but also
runs the risk of hanging the machine.) As an alternative we can use the
perf event counter interface to sample the ring registers periodically
and send those results to userspace.

To be able to do so, we need to export the two symbols from
kernel/events/core.c to register and unregister a PMU device.

v2: Use a common timer for the ring sampling.

Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx>
---
  drivers/gpu/drm/i915/Makefile           |   1 +
  drivers/gpu/drm/i915/i915_drv.c         |   2 +
  drivers/gpu/drm/i915/i915_drv.h         |  23 ++
  drivers/gpu/drm/i915/i915_pmu.c         | 449 ++++++++++++++++++++++++++++++++
  drivers/gpu/drm/i915/intel_ringbuffer.h |   2 +
  include/uapi/drm/i915_drm.h             |  40 +++
  kernel/events/core.c                    |   1 +
  7 files changed, 518 insertions(+)
  create mode 100644 drivers/gpu/drm/i915/i915_pmu.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 7b05fb802f4c..ca88e6e67910 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -26,6 +26,7 @@ i915-y := i915_drv.o \
  i915-$(CONFIG_COMPAT)   += i915_ioc32.o
  i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o intel_pipe_crc.o
+i915-$(CONFIG_PERF_EVENTS) += i915_pmu.o
  # GEM code
  i915-y += i915_cmd_parser.o \
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 2d2fb4327f97..e3c6d052d1c9 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1144,6 +1144,7 @@ static void i915_driver_register(struct drm_i915_private *dev_priv)
  	struct drm_device *dev = &dev_priv->drm;
  	i915_gem_shrinker_init(dev_priv);
+	i915_pmu_register(dev_priv);
  	/*
  	 * Notify a valid surface after modesetting,
@@ -1197,6 +1198,7 @@ static void i915_driver_unregister(struct drm_i915_private *dev_priv)
  	intel_opregion_unregister(dev_priv);
  	i915_perf_unregister(dev_priv);
+	i915_pmu_unregister(dev_priv);
  	i915_teardown_sysfs(dev_priv);
  	i915_guc_log_unregister(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1fa1e7d48f02..10beae1a13c8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -40,6 +40,7 @@
  #include <linux/hash.h>
  #include <linux/intel-iommu.h>
  #include <linux/kref.h>
+#include <linux/perf_event.h>
  #include <linux/pm_qos.h>
  #include <linux/reservation.h>
  #include <linux/shmem_fs.h>
@@ -2075,6 +2076,12 @@ struct intel_cdclk_state {
  	unsigned int cdclk, vco, ref;
  };
+enum {
+	__I915_SAMPLE_FREQ_ACT = 0,
+	__I915_SAMPLE_FREQ_REQ,
+	__I915_NUM_PMU_SAMPLERS
+};
+
  struct drm_i915_private {
  	struct drm_device drm;
@@ -2564,6 +2571,13 @@ struct drm_i915_private {
  		int	irq;
  	} lpe_audio;
+	struct {
+		struct pmu base;
+		struct hrtimer timer;
+		u64 enable;
+		u64 sample[__I915_NUM_PMU_SAMPLERS];
+	} pmu;
+
  	/*
  	 * NOTE: This is the dri1/ums dungeon, don't add stuff here. Your patch
  	 * will be rejected. Instead look for a better place.
@@ -3681,6 +3695,15 @@ extern void i915_perf_fini(struct drm_i915_private *dev_priv);
  extern void i915_perf_register(struct drm_i915_private *dev_priv);
  extern void i915_perf_unregister(struct drm_i915_private *dev_priv);
+/* i915_pmu.c */
+#ifdef CONFIG_PERF_EVENTS
+extern void i915_pmu_register(struct drm_i915_private *i915);
+extern void i915_pmu_unregister(struct drm_i915_private *i915);
+#else
+static inline void i915_pmu_register(struct drm_i915_private *i915) {}
+static inline void i915_pmu_unregister(struct drm_i915_private *i915) {}
+#endif
+
  /* i915_suspend.c */
  extern int i915_save_state(struct drm_i915_private *dev_priv);
  extern int i915_restore_state(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
new file mode 100644
index 000000000000..80e1c07841ac
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -0,0 +1,449 @@
+#include <linux/perf_event.h>
+#include <linux/pm_runtime.h>
+
+#include "i915_drv.h"
+#include "intel_ringbuffer.h"
+
+#define FREQUENCY 200
+#define PERIOD max_t(u64, 10000, NSEC_PER_SEC / FREQUENCY)
+
+#define RING_MASK 0xffffffff
+#define RING_MAX 32
+
+static void engines_sample(struct drm_i915_private *dev_priv)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	bool fw = false;
+
+	if ((dev_priv->pmu.enable & RING_MASK) == 0)
+		return;
+
+	if (!dev_priv->gt.awake)
+		return;
+
+	if (!intel_runtime_pm_get_if_in_use(dev_priv))
+		return;
+
+	for_each_engine(engine, dev_priv, id) {
+		u32 val;
+
+		if ((dev_priv->pmu.enable & (0x7 << (4*id))) == 0)
+			continue;
+
+		if (i915_seqno_passed(intel_engine_get_seqno(engine),
+				      intel_engine_last_submit(engine)))
+			continue;
+
+		if (!fw) {
+			intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+			fw = true;
+		}
+
+		val = I915_READ_FW(RING_MI_MODE(engine->mmio_base));
+		if (!(val & MODE_IDLE))
+			engine->pmu_sample[I915_SAMPLE_BUSY] += PERIOD;
Could you, please, check that I understand correctly what you
attempt to do? Each PERIOD=10ms you check the status of the engines.
If engine is in a busy state you count the PERIOD time as if the
engine was busy during this period. If engine was in some other
state you count this toward this state (wait or sema below as I
understand).

If that's what you are trying to do, your solution can be somewhat
used only for the cases where tasks are executed on the engines more
then 2x PERIOD time, i.e. longer than 20ms.
It's a statistical sampling, same as any other. You have control over
the period, or at least perf does give you control (I may have not
hooked it up correctly). I set a totally arbitrary cap of 20us.
So, I understand correctly. Statistics where you really have all mighty 
to provide non-statistic metrics or at least provide both. That's great 
that perf permits to control sampling period, but smaller period will 
come with the higher cpu% which will lead to the gating this usage in 
the production environment. Besides in production environment you need 
to get busy clocks metrics over pretty big period of time close to 1 second.
Which is not the case
for a lot of tasks submitted by media with the usual execution time
of just few milliseconds.
If you are only issuing one batch lasting 1ms every second, the load is
more or less immaterial.
I did not say that. And I don't understand where you took this from. Sorry.
Considering that i915 currently tracks
when batches were scheduled and when they are ready, you can easily
calculate very strong precise metric of busy clocks for each engine
from the perspective of i915, i.e. that will not be precise time of
how long engine calculated things because it will include scheduling
latency, but that is exactly what end-user requires: end-user does
not care whether there is a latency or there is not, for him engine
is busy all that time.
Which you can do yourself. I am not that interested in repeating
metrics already available to userspace.
Repeating metrics?! Here are metrics which are available for centuries: 
cat /proc/stat. Here I can get busy clocks information for the CPUs and 
much more. Please, provide me a similar path for the GPUs if I am not 
aware of its existence.
Furthermore, coupled in with the
statiscal sampling of perf, perf also provides access to the timestamps
of when each request is constructed, queued, executed, completed and
retired; and waited upon along with a host of other tracepoints.
So, you suggest to track each request from the userspace and calculate 
on our own. You care so much about performance and additional heavy code 
in the driver especially in the interrupt handlers, why I should not 
care about userspace performance. What you incline and suggest is a 
no-go. As I said, monitoring metrics are required, not debugging 
metrics. Monitoring metrics should correctly work with the long sampling 
periods like 1s and more giving smallest possible overhead, i.e. no polling.
This is done in the alternate solution given
by Tvrtko in "drm/i915: Export engine busy stats in debugfs" patch.
Why you do not take this as a basis? Why this patch containing few
arithmetic operations to calculate engines busy clocks is ignored?
Because this approach is independent of platform and plugs into the
existing perf infrastructure. And you need a far more convincing
argument to have an execlist-only code running inside interrupt
handlers. Tvrtko said himself that maintainability is paramount.
Why execlists only? Introduce the same for ringbuffers, GuC... Step 
higher if needed and introduce counters to the scheduler routines which 
also should track requests. After all couple this with the requests 
tracking you expose to perf system as you noted above. Is that not possible?
Perf infrastructure is good thing. But it is really needed to be able to 
easily get certain engines metrics (busy clocks at the first place) with 
simple 'cat gpu/stat'. Please, understand, these metrics are required in 
production to give customers opportunity to dynamically load balance N 
media servers from the host.
-Chris


_______________________________________________
Intel-gfx mailing list
Intel-gfx@xxxxxxxxxxxxxxxxxxxxx
https://lists.freedesktop.org/mailman/listinfo/intel-gfx