Per the SDM, reference cycles (as a general purpose event) are
currently implemented using the core crystal clock, TSC, or bus
clock. Therefore, it's unreasonable to expect reference cycles for
the measured loop to fall within some constant multiples of the
number of loop iterations, unless those bounds are set so wide as to
be pointless.

The bounds initially established when this test was written were
broadened in commit 4779578b24b3 ("make PMU test to pass on more cpu
types"), but even the new bounds are too narrow to accommodate a
2.6GHz Ice Lake, with the TSC frequency at 104 times the reference
cycle (for this implementation, the core crystal clock) frequency.

Restore the initial (tighter) bounds, calculate the ratio of TSC
frequency to reference cycle frequency, and then scale the bounds
accordingly.

Tested on several generations of Xeon parts: Ice Lake, Cascade Lake,
Skylake, Broadwell, and Haswell.

Opportunistically fixed a spelling error and a commented-out printf
format string.

Fixes: 4779578b24b3 ("make PMU test to pass on more cpu types")
Reported-by: Jacob Xu <jacobhxu@xxxxxxxxx>
Signed-off-by: Jim Mattson <jmattson@xxxxxxxxxx>
---
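To make the new scaling concrete, here is the arithmetic for the Ice
Lake case above (the numbers are illustrative; the measured ratio
varies by part and by run). With a measured guest TSC : ref cycles
ratio of about 105, i.e. tsc_delta / cnt.count ~= 105, the restored
"ref cycles" bounds of {1*N, 30*N} scale to roughly:

  min = (1*N  * cnt.count) / tsc_delta  ~=  0.0095 * N
  max = (30*N * cnt.count) / tsc_delta  ~=  0.29 * N

Both bounds shrink by the measured ratio, so the expectations track
the core crystal clock frequency instead of assuming that reference
cycles tick at some fixed fraction of the TSC.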
: "+c"((int){1000000000ull})); + + t2 = fenced_rdtsc(); + stop_event(&cnt); + t3 = fenced_rdtsc(); + + tsc_delta = ((t2 - t1) + (t3 - t0)) / 2; + + if (!tsc_delta) + return; + + gp_events[2].min = (gp_events[2].min * cnt.count) / tsc_delta; + gp_events[2].max = (gp_events[2].max * cnt.count) / tsc_delta; +} + int main(int ac, char **av) { struct cpuid id = cpuid(10); @@ -627,6 +675,8 @@ int main(int ac, char **av) return report_summary(); } + set_ref_cycle_expectations(); + printf("PMU version: %d\n", eax.split.version_id); printf("GP counters: %d\n", eax.split.num_counters); printf("GP counter width: %d\n", eax.split.bit_width); -- 2.35.1.265.g69c8d7142f-goog