Expand the PMU counters test to verify that LLC references and misses have non-zero counts when the code being executed while the LLC event(s) is active is evicted via CFLUSH{,OPT}. Note, CLFLUSH{,OPT} requires a fence of some kind to ensure the cache lines are flushed before execution continues. Use MFENCE for simplicity (performance is not a concern). Suggested-by: Jim Mattson <jmattson@xxxxxxxxxx> Reviewed-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx> Signed-off-by: Sean Christopherson <seanjc@xxxxxxxxxx> --- .../selftests/kvm/x86_64/pmu_counters_test.c | 59 +++++++++++++------ 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c index f5dedd112471..4c7133ddcda8 100644 --- a/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c +++ b/tools/testing/selftests/kvm/x86_64/pmu_counters_test.c @@ -14,9 +14,9 @@ /* * Number of "extra" instructions that will be counted, i.e. the number of * instructions that are needed to set up the loop and then disabled the - * counter. 2 MOV, 2 XOR, 1 WRMSR. + * counter. 1 CLFLUSH/CLFLUSHOPT/NOP, 1 MFENCE, 2 MOV, 2 XOR, 1 WRMSR. */ -#define NUM_EXTRA_INSNS 5 +#define NUM_EXTRA_INSNS 7 #define NUM_INSNS_RETIRED (NUM_BRANCHES + NUM_EXTRA_INSNS) static uint8_t kvm_pmu_version; @@ -107,6 +107,12 @@ static void guest_assert_event_count(uint8_t idx, case INTEL_ARCH_BRANCHES_RETIRED_INDEX: GUEST_ASSERT_EQ(count, NUM_BRANCHES); break; + case INTEL_ARCH_LLC_REFERENCES_INDEX: + case INTEL_ARCH_LLC_MISSES_INDEX: + if (!this_cpu_has(X86_FEATURE_CLFLUSHOPT) && + !this_cpu_has(X86_FEATURE_CLFLUSH)) + break; + fallthrough; case INTEL_ARCH_CPU_CYCLES_INDEX: case INTEL_ARCH_REFERENCE_CYCLES_INDEX: GUEST_ASSERT_NE(count, 0); @@ -123,29 +129,44 @@ static void guest_assert_event_count(uint8_t idx, GUEST_ASSERT_EQ(_rdpmc(pmc), 0xdead); } +/* + * Enable and disable the PMC in a monolithic asm blob to ensure that the + * compiler can't insert _any_ code into the measured sequence. Note, ECX + * doesn't need to be clobbered as the input value, @pmc_msr, is restored + * before the end of the sequence. + * + * If CLFUSH{,OPT} is supported, flush the cacheline containing (at least) the + * start of the loop to force LLC references and misses, i.e. to allow testing + * that those events actually count. + */ +#define GUEST_MEASURE_EVENT(_msr, _value, clflush) \ +do { \ + __asm__ __volatile__("wrmsr\n\t" \ + clflush "\n\t" \ + "mfence\n\t" \ + "1: mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t" \ + "loop .\n\t" \ + "mov %%edi, %%ecx\n\t" \ + "xor %%eax, %%eax\n\t" \ + "xor %%edx, %%edx\n\t" \ + "wrmsr\n\t" \ + :: "a"((uint32_t)_value), "d"(_value >> 32), \ + "c"(_msr), "D"(_msr) \ + ); \ +} while (0) + static void __guest_test_arch_event(uint8_t idx, struct kvm_x86_pmu_feature event, uint32_t pmc, uint32_t pmc_msr, uint32_t ctrl_msr, uint64_t ctrl_msr_value) { wrmsr(pmc_msr, 0); - /* - * Enable and disable the PMC in a monolithic asm blob to ensure that - * the compiler can't insert _any_ code into the measured sequence. - * Note, ECX doesn't need to be clobbered as the input value, @pmc_msr, - * is restored before the end of the sequence. - */ - __asm__ __volatile__("wrmsr\n\t" - "mov $" __stringify(NUM_BRANCHES) ", %%ecx\n\t" - "loop .\n\t" - "mov %%edi, %%ecx\n\t" - "xor %%eax, %%eax\n\t" - "xor %%edx, %%edx\n\t" - "wrmsr\n\t" - :: "a"((uint32_t)ctrl_msr_value), - "d"(ctrl_msr_value >> 32), - "c"(ctrl_msr), "D"(ctrl_msr) - ); + if (this_cpu_has(X86_FEATURE_CLFLUSHOPT)) + GUEST_MEASURE_EVENT(ctrl_msr, ctrl_msr_value, "clflushopt 1f"); + else if (this_cpu_has(X86_FEATURE_CLFLUSH)) + GUEST_MEASURE_EVENT(ctrl_msr, ctrl_msr_value, "clflush 1f"); + else + GUEST_MEASURE_EVENT(ctrl_msr, ctrl_msr_value, "nop"); guest_assert_event_count(idx, event, pmc, pmc_msr); } -- 2.43.0.rc2.451.g8631bc7472-goog