[tip:perf/core] perf/x86/intel: Rework the large PEBS setup code

tip-bot for Peter Zijlstra <tipbot@xxxxxxxxx> · Wed, 10 Aug 2016 10:57:33 -0700

Commit-ID:  09e61b4f78498bd9f213b0a536e80b79507ea89f
Gitweb:     http://git.kernel.org/tip/09e61b4f78498bd9f213b0a536e80b79507ea89f
Author:     Peter Zijlstra <peterz@xxxxxxxxxxxxx>
AuthorDate: Wed, 6 Jul 2016 18:02:43 +0200
Committer:  Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Wed, 10 Aug 2016 13:13:24 +0200

perf/x86/intel: Rework the large PEBS setup code

In order to allow optimizing perf_pmu_sched_task() we must ensure
perf_sched_cb_{inc,dec}() are no longer called from NMI context; this
means that pmu::{start,stop}() can no longer use them.

Prepare for this by reworking the whole large PEBS setup code.

The current code relied on the cpuc->pebs_enabled state, however since
that reflects the current active state as per pmu::{start,stop}() we
can no longer rely on this.

Introduce two counters: cpuc->n_pebs and cpuc->n_large_pebs which
count the total number of PEBS events and the number of PEBS events
that have FREERUNNING set, resp.. With this we can tell if the current
setup requires a single record interrupt threshold or can use a larger
buffer.

This also improves the code in that it re-enables the large threshold
once the PEBS event that required single record gets removed.

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
Cc: Alexander Shishkin <alexander.shishkin@xxxxxxxxxxxxxxx>
Cc: Arnaldo Carvalho de Melo <acme@xxxxxxxxxx>
Cc: Jiri Olsa <jolsa@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Stephane Eranian <eranian@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Vince Weaver <vincent.weaver@xxxxxxxxx>
Cc: linux-kernel@xxxxxxxxxxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
 arch/x86/events/intel/ds.c   | 102 ++++++++++++++++++++++++++++---------------
 arch/x86/events/perf_event.h |   2 +
 kernel/events/core.c         |   4 ++
 3 files changed, 73 insertions(+), 35 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 7ce9f3f..c791ff9 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -806,9 +806,55 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 	return &emptyconstraint;
 }
 
-static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc)
+/*
+ * We need the sched_task callback even for per-cpu events when we use
+ * the large interrupt threshold, such that we can provide PID and TID
+ * to PEBS samples.
+ */
+static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc)
+{
+	return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs);
+}
+
+static inline void pebs_update_threshold(struct cpu_hw_events *cpuc)
+{
+	struct debug_store *ds = cpuc->ds;
+	u64 threshold;
+
+	if (cpuc->n_pebs == cpuc->n_large_pebs) {
+		threshold = ds->pebs_absolute_maximum -
+			x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
+	} else {
+		threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+	}
+
+	ds->pebs_interrupt_threshold = threshold;
+}
+
+static void
+pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu)
+{
+	if (needed_cb != pebs_needs_sched_cb(cpuc)) {
+		if (!needed_cb)
+			perf_sched_cb_inc(pmu);
+		else
+			perf_sched_cb_dec(pmu);
+
+		pebs_update_threshold(cpuc);
+	}
+}
+
+static void intel_pmu_pebs_add(struct perf_event *event)
 {
-	return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1));
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+	cpuc->n_pebs++;
+	if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+		cpuc->n_large_pebs++;
+
+	pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
 }
 
 void intel_pmu_pebs_enable(struct perf_event *event)
@@ -816,12 +862,11 @@ void intel_pmu_pebs_enable(struct perf_event *event)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	struct debug_store *ds = cpuc->ds;
-	bool first_pebs;
-	u64 threshold;
+
+	intel_pmu_pebs_add(event);
 
 	hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
 
-	first_pebs = !pebs_is_enabled(cpuc);
 	cpuc->pebs_enabled |= 1ULL << hwc->idx;
 
 	if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT)
@@ -830,46 +875,34 @@ void intel_pmu_pebs_enable(struct perf_event *event)
 		cpuc->pebs_enabled |= 1ULL << 63;
 
 	/*
-	 * When the event is constrained enough we can use a larger
-	 * threshold and run the event with less frequent PMI.
+	 * Use auto-reload if possible to save a MSR write in the PMI.
+	 * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD.
 	 */
-	if (hwc->flags & PERF_X86_EVENT_FREERUNNING) {
-		threshold = ds->pebs_absolute_maximum -
-			x86_pmu.max_pebs_events * x86_pmu.pebs_record_size;
-
-		if (first_pebs)
-			perf_sched_cb_inc(event->ctx->pmu);
-	} else {
-		threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size;
-
-		/*
-		 * If not all events can use larger buffer,
-		 * roll back to threshold = 1
-		 */
-		if (!first_pebs &&
-		    (ds->pebs_interrupt_threshold > threshold))
-			perf_sched_cb_dec(event->ctx->pmu);
-	}
-
-	/* Use auto-reload if possible to save a MSR write in the PMI */
 	if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) {
 		ds->pebs_event_reset[hwc->idx] =
 			(u64)(-hwc->sample_period) & x86_pmu.cntval_mask;
 	}
+}
+
+static void intel_pmu_pebs_del(struct perf_event *event)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	bool needed_cb = pebs_needs_sched_cb(cpuc);
+
+	cpuc->n_pebs--;
+	if (hwc->flags & PERF_X86_EVENT_FREERUNNING)
+		cpuc->n_large_pebs--;
 
-	if (first_pebs || ds->pebs_interrupt_threshold > threshold)
-		ds->pebs_interrupt_threshold = threshold;
+	pebs_update_state(needed_cb, cpuc, event->ctx->pmu);
 }
 
 void intel_pmu_pebs_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
-	struct debug_store *ds = cpuc->ds;
-	bool large_pebs = ds->pebs_interrupt_threshold >
-		ds->pebs_buffer_base + x86_pmu.pebs_record_size;
 
-	if (large_pebs)
+	if (cpuc->n_pebs == cpuc->n_large_pebs)
 		intel_pmu_drain_pebs_buffer();
 
 	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
@@ -879,13 +912,12 @@ void intel_pmu_pebs_disable(struct perf_event *event)
 	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled &= ~(1ULL << 63);
 
-	if (large_pebs && !pebs_is_enabled(cpuc))
-		perf_sched_cb_dec(event->ctx->pmu);
-
 	if (cpuc->enabled)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
 
 	hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+
+	intel_pmu_pebs_del(event);
 }
 
 void intel_pmu_pebs_enable_all(void)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 8c4a477..94b8f27 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -194,6 +194,8 @@ struct cpu_hw_events {
 	 */
 	struct debug_store	*ds;
 	u64			pebs_enabled;
+	int			n_pebs;
+	int			n_large_pebs;
 
 	/*
 	 * Intel LBR bits
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 11f6bbe..57aff71 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2818,6 +2818,10 @@ void perf_sched_cb_inc(struct pmu *pmu)
 /*
  * This function provides the context switch callback to the lower code
  * layer. It is invoked ONLY when the context switch callback is enabled.
+ *
+ * This callback is relevant even to per-cpu events; for example multi event
+ * PEBS requires this to provide PID/TID information. This requires we flush
+ * all queued PEBS records before we context switch to a new task.
  */
 static void perf_pmu_sched_task(struct task_struct *prev,
 				struct task_struct *next,
--
To unsubscribe from this list: send the line "unsubscribe linux-tip-commits" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html