Previously, when performance counters were per-core rather than per-thread,
the number of counters available was divided by 2 on detection, and the
counters used by each thread in a core were "swizzled" to ensure
separation. However, this solution is suboptimal since it relies on a
couple of assumptions:

 a) Always having 2 VPEs / core (hence the number of counters being
    divided by 2).
 b) Always having a number of counters implemented in the core that is
    divisible by 2.

For instance, if an SoC implementation had a single counter and 2 VPEs per
core, this logic would fail and no performance counters would be
available. The mechanism also does not allow one VPE in a core to use more
than its allocation of the per-core counters to count multiple events,
even though the other VPEs may not be using them.

Fix this situation by instead allocating (and releasing) per-core
performance counters when they are requested. This approach removes the
above assumptions and fixes the shortcomings.

In order to do this:
 - Add additional logic to mipsxx_pmu_alloc_counter() to detect whether a
   sibling is using a per-core counter, and to allocate a per-core counter
   in all sibling CPUs.
 - Similarly, add a mipsxx_pmu_free_counter() function to release a
   per-core counter in all sibling CPUs when it is no longer needed.
 - Introduce a new spinlock, core_counters_lock, to ensure exclusivity
   when allocating and releasing per-core counters.

Since counters are now allocated per-core on demand, rather than being
reserved per-thread at boot, all of the "swizzling" of counters is
removed.

The upshot is that in an SoC with 2 counters / thread, counters are
reported as:

Performance counters: mips/interAptiv PMU enabled, 2 32-bit counters available to each CPU, irq 18

Running an instance of a test program on each of 2 threads in a core, both
threads can use their 2 counters to count 2 events:

taskset 4 perf stat -e instructions:u,branches:u ./test_prog &
taskset 8 perf stat -e instructions:u,branches:u ./test_prog

 Performance counter stats for './test_prog':

             30002      instructions:u
             10000      branches:u

       0.005164264 seconds time elapsed

 Performance counter stats for './test_prog':

             30002      instructions:u
             10000      branches:u

       0.006139975 seconds time elapsed
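The ./test_prog binary itself is not part of this patch; a minimal
stand-in of roughly the right shape (illustrative only, not the actual
program behind the numbers above) is just a fixed-length user-space loop,
for example:

/*
 * Hypothetical stand-in for ./test_prog: a fixed-length loop gives a
 * stable number of user-mode instructions and branches to compare
 * between runs.
 */
int main(void)
{
	volatile int sink = 0;
	int i;

	for (i = 0; i < 10000; i++)	/* roughly 10000 conditional branches */
		sink += i;

	return 0;
}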
In an SoC with 2 counters / core (which can be forced by setting
cpu_has_mipsmt_pertccounters = 0), counters are reported as:

Performance counters: mips/interAptiv PMU enabled, 2 32-bit counters available to each core, irq 18

Running an instance of a test program on each of 2 threads in a core, now
only one thread manages to secure the performance counters to count 2
events; the other thread does not get any counters:

taskset 4 perf stat -e instructions:u,branches:u ./test_prog &
taskset 8 perf stat -e instructions:u,branches:u ./test_prog

 Performance counter stats for './test_prog':

     <not counted>      instructions:u
     <not counted>      branches:u

       0.005179533 seconds time elapsed

 Performance counter stats for './test_prog':

             30002      instructions:u
             10000      branches:u

       0.005179467 seconds time elapsed

Signed-off-by: Matt Redfearn <matt.redfearn@xxxxxxxx>
---
 arch/mips/kernel/perf_event_mipsxx.c | 130 ++++++++++++++++++++++++-----------
 1 file changed, 88 insertions(+), 42 deletions(-)

diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c
index 782a1b6c6352..bedb0d5ff3f2 100644
--- a/arch/mips/kernel/perf_event_mipsxx.c
+++ b/arch/mips/kernel/perf_event_mipsxx.c
@@ -131,6 +131,8 @@ static struct mips_pmu mipspmu;
 #ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
 static int cpu_has_mipsmt_pertccounters;
 
+static DEFINE_SPINLOCK(core_counters_lock);
+
 static DEFINE_RWLOCK(pmuint_rwlock);
 
 #if defined(CONFIG_CPU_BMIPS5000)
@@ -141,20 +143,6 @@ static DEFINE_RWLOCK(pmuint_rwlock);
 			 0 : cpu_vpe_id(&current_cpu_data))
 #endif
 
-/* Copied from op_model_mipsxx.c */
-static unsigned int vpe_shift(void)
-{
-	if (num_possible_cpus() > 1)
-		return 1;
-
-	return 0;
-}
-
-static unsigned int counters_total_to_per_cpu(unsigned int counters)
-{
-	return counters >> vpe_shift();
-}
-
 #else /* !CONFIG_MIPS_PERF_SHARED_TC_COUNTERS */
 
 #define vpe_id()	0
@@ -165,17 +153,8 @@ static void pause_local_counters(void);
 static irqreturn_t mipsxx_pmu_handle_irq(int, void *);
 static int mipsxx_pmu_handle_shared_irq(void);
 
-static unsigned int mipsxx_pmu_swizzle_perf_idx(unsigned int idx)
-{
-	if (vpe_id() == 1)
-		idx = (idx + 2) & 3;
-	return idx;
-}
-
 static u64 mipsxx_pmu_read_counter(unsigned int idx)
 {
-	idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
 	switch (idx) {
 	case 0:
 		/*
@@ -197,8 +176,6 @@ static u64 mipsxx_pmu_read_counter(unsigned int idx)
 
 static u64 mipsxx_pmu_read_counter_64(unsigned int idx)
 {
-	idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
 	switch (idx) {
 	case 0:
 		return read_c0_perfcntr0_64();
@@ -216,8 +193,6 @@ static u64 mipsxx_pmu_read_counter_64(unsigned int idx)
 
 static void mipsxx_pmu_write_counter(unsigned int idx, u64 val)
 {
-	idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
 	switch (idx) {
 	case 0:
 		write_c0_perfcntr0(val);
@@ -236,8 +211,6 @@ static void mipsxx_pmu_write_counter(unsigned int idx, u64 val)
 
 static void mipsxx_pmu_write_counter_64(unsigned int idx, u64 val)
 {
-	idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
 	switch (idx) {
 	case 0:
 		write_c0_perfcntr0_64(val);
@@ -256,8 +229,6 @@ static void mipsxx_pmu_write_counter_64(unsigned int idx, u64 val)
 
 static unsigned int mipsxx_pmu_read_control(unsigned int idx)
 {
-	idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
 	switch (idx) {
 	case 0:
 		return read_c0_perfctrl0();
@@ -275,8 +246,6 @@ static unsigned int mipsxx_pmu_read_control(unsigned int idx)
 
 static void mipsxx_pmu_write_control(unsigned int idx, unsigned int val)
 {
-	idx = mipsxx_pmu_swizzle_perf_idx(idx);
-
 	switch (idx) {
 	case 0:
 		write_c0_perfctrl0(val);
@@ -296,7 +265,7 @@ static void mipsxx_pmu_write_control(unsigned int idx, unsigned int val)
 static int mipsxx_pmu_alloc_counter(struct cpu_hw_events *cpuc,
 				    struct hw_perf_event *hwc)
 {
-	int i;
+	int i, cpu = smp_processor_id();
 
 	/*
 	 * We only need to care the counter mask. The range has been
@@ -315,14 +284,88 @@ static int mipsxx_pmu_alloc_counter(struct cpu_hw_events *cpuc,
 		 * they can be dynamically swapped, they both feel happy.
 		 * But here we leave this issue alone for now.
 		 */
-		if (test_bit(i, &cntr_mask) &&
-		    !test_and_set_bit(i, cpuc->used_mask))
+		if (!test_bit(i, &cntr_mask))
+			continue;
+
+#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
+		/*
+		 * When counters are per-core, check for use and allocate
+		 * them in all sibling CPUs.
+		 */
+		if (!cpu_has_mipsmt_pertccounters) {
+			struct cpu_hw_events *sibling_cpuc;
+			int sibling_cpu, allocated = 0;
+			unsigned long flags;
+
+			spin_lock_irqsave(&core_counters_lock, flags);
+
+			for_each_cpu(sibling_cpu, &cpu_sibling_map[cpu]) {
+				sibling_cpuc = per_cpu_ptr(&cpu_hw_events,
+							   sibling_cpu);
+
+				if (test_bit(i, sibling_cpuc->used_mask)) {
+					pr_debug("CPU%d already using core counter %d\n",
+						 sibling_cpu, i);
+					goto next_counter;
+				}
+			}
+
+			/* Counter i is not used by any siblings - use it */
+			allocated = 1;
+			for_each_cpu(sibling_cpu, &cpu_sibling_map[cpu]) {
+				sibling_cpuc = per_cpu_ptr(&cpu_hw_events,
+							   sibling_cpu);
+
+				set_bit(i, sibling_cpuc->used_mask);
+				/* sibling is using the counter */
+				pr_debug("CPU%d now using core counter %d\n",
+					 sibling_cpu, i);
+			}
+next_counter:
+			spin_unlock_irqrestore(&core_counters_lock, flags);
+			if (allocated)
+				return i;
+		}
+		else
+#endif
+		if (!test_and_set_bit(i, cpuc->used_mask)) {
+			pr_debug("CPU%d now using counter %d\n", cpu, i);
 			return i;
+		}
 	}
 
 	return -EAGAIN;
 }
 
+static void mipsxx_pmu_free_counter(struct cpu_hw_events *cpuc,
+				    struct hw_perf_event *hwc)
+{
+#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
+	int sibling_cpu, cpu = smp_processor_id();
+
+	/* When counters are per-core, free them in all sibling CPUs */
+	if (!cpu_has_mipsmt_pertccounters) {
+		struct cpu_hw_events *sibling_cpuc;
+		unsigned long flags;
+
+		spin_lock_irqsave(&core_counters_lock, flags);
+
+		for_each_cpu(sibling_cpu, &cpu_sibling_map[cpu]) {
+			sibling_cpuc = per_cpu_ptr(&cpu_hw_events, sibling_cpu);
+
+			clear_bit(hwc->idx, sibling_cpuc->used_mask);
+			pr_debug("CPU%d released core counter %d\n",
+				 sibling_cpu, hwc->idx);
+		}
+
+		spin_unlock_irqrestore(&core_counters_lock, flags);
+		return;
+	}
+#endif
+	pr_debug("CPU%d released counter %d\n", cpu, hwc->idx);
+	clear_bit(hwc->idx, cpuc->used_mask);
+}
+
 static void mipsxx_pmu_enable_event(struct hw_perf_event *evt, int idx)
 {
 	struct perf_event *event = container_of(evt, struct perf_event, hw);
@@ -510,7 +553,7 @@ static void mipspmu_del(struct perf_event *event, int flags)
 
 	mipspmu_stop(event, PERF_EF_UPDATE);
 	cpuc->events[idx] = NULL;
-	clear_bit(idx, cpuc->used_mask);
+	mipsxx_pmu_free_counter(cpuc, hwc);
 
 	perf_event_update_userpage(event);
 }
@@ -1735,8 +1778,6 @@ init_hw_perf_events(void)
 
 #ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
 	cpu_has_mipsmt_pertccounters = probe_mipsmt_pertccounters();
-	if (!cpu_has_mipsmt_pertccounters)
-		counters = counters_total_to_per_cpu(counters);
 #endif
 
 	if (get_c0_perfcount_int)
@@ -1860,9 +1901,14 @@ init_hw_perf_events(void)
 
 	on_each_cpu(reset_counters, (void *)(long)counters, 1);
 
-	pr_cont("%s PMU enabled, %d %d-bit counters available to each "
-		"CPU, irq %d%s\n", mipspmu.name, counters, counter_bits, irq,
-		irq < 0 ? " (share with timer interrupt)" : "");
+	pr_cont("%s PMU enabled, %d %d-bit counters available to each %s, irq %d%s\n",
+		mipspmu.name, counters, counter_bits,
+#ifdef CONFIG_MIPS_PERF_SHARED_TC_COUNTERS
+		cpu_has_mipsmt_pertccounters ? "CPU" : "core",
+#else
+		"CPU",
+#endif
+		irq, irq < 0 ? " (share with timer interrupt)" : "");
 
 	perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
 
-- 
2.7.4