On Wed, Jan 03, 2024, Dapeng Mi wrote:
> When running the pmu test on SPR, the following failure is sometimes
> reported:
>
> 1 <= 0 <= 1000000
> FAIL: Intel: llc misses-4
>
> Currently, whether an LLC miss occurs depends purely on probability.
> It's possible that no LLC miss happens at all during the whole loop(),
> especially as processors ship with larger and larger caches, which is
> exactly what we observed on SPR.
>
> Thus, add a clflush instruction into the loop() asm blob to ensure that
> at least one LLC miss is triggered.
>
> Suggested-by: Jim Mattson <jmattson@xxxxxxxxxx>
> Signed-off-by: Dapeng Mi <dapeng1.mi@xxxxxxxxxxxxxxx>

I wonder if we can skip all the LLC tests when the CPU doesn't support
clflush/clflushopt?

> ---
>  x86/pmu.c | 43 ++++++++++++++++++++++++++++++-------------
>  1 file changed, 30 insertions(+), 13 deletions(-)
>
> diff --git a/x86/pmu.c b/x86/pmu.c
> index b764827c1c3d..8fd3db0fbf81 100644
> --- a/x86/pmu.c
> +++ b/x86/pmu.c
> @@ -20,19 +20,21 @@
>  
>  // Instrustion number of LOOP_ASM code
>  #define LOOP_INSTRNS 10
> -#define LOOP_ASM \
> +#define LOOP_ASM(_clflush) \
> +	_clflush "\n\t" \
> +	"mfence;\n\t" \
>  	"1: mov (%1), %2; add $64, %1;\n\t" \
>  	"nop; nop; nop; nop; nop; nop; nop;\n\t" \
>  	"loop 1b;\n\t"
>  
> -/*Enable GLOBAL_CTRL + disable GLOBAL_CTRL instructions */
> -#define PRECISE_EXTRA_INSTRNS (2 + 4)
> +/*Enable GLOBAL_CTRL + disable GLOBAL_CTRL + clflush/mfence instructions */
> +#define PRECISE_EXTRA_INSTRNS (2 + 4 + 2)
>  #define PRECISE_LOOP_INSTRNS (N * LOOP_INSTRNS + PRECISE_EXTRA_INSTRNS)
>  #define PRECISE_LOOP_BRANCHES (N)
> -#define PRECISE_LOOP_ASM \
> +#define PRECISE_LOOP_ASM(_clflush) \
>  	"wrmsr;\n\t" \
>  	"mov %%ecx, %%edi; mov %%ebx, %%ecx;\n\t" \
> -	LOOP_ASM \
> +	LOOP_ASM(_clflush) \
>  	"mov %%edi, %%ecx; xor %%eax, %%eax; xor %%edx, %%edx;\n\t" \
>  	"wrmsr;\n\t"
>  
> @@ -72,14 +74,30 @@ char *buf;
>  static struct pmu_event *gp_events;
>  static unsigned int gp_events_size;
>  
> +#define _loop_asm(_clflush) \
> +do { \
> +	asm volatile(LOOP_ASM(_clflush) \
> +		     : "=c"(tmp), "=r"(tmp2), "=r"(tmp3) \
> +		     : "0"(N), "1"(buf)); \
> +} while (0)
> +
> +#define _precise_loop_asm(_clflush) \
> +do { \
> +	asm volatile(PRECISE_LOOP_ASM(_clflush) \
> +		     : "=b"(tmp), "=r"(tmp2), "=r"(tmp3) \
> +		     : "a"(eax), "d"(edx), "c"(global_ctl), \
> +		       "0"(N), "1"(buf) \
> +		     : "edi"); \
> +} while (0)
>  
>  static inline void __loop(void)
>  {
>  	unsigned long tmp, tmp2, tmp3;
>  
> -	asm volatile(LOOP_ASM
> -		     : "=c"(tmp), "=r"(tmp2), "=r"(tmp3)
> -		     : "0"(N), "1"(buf));
> +	if (this_cpu_has(X86_FEATURE_CLFLUSH))
> +		_loop_asm("clflush (%1)");
> +	else
> +		_loop_asm("nop");
>  }
>  
>  /*
> @@ -96,11 +114,10 @@ static inline void __precise_count_loop(u64 cntrs)
>  	u32 eax = cntrs & (BIT_ULL(32) - 1);
>  	u32 edx = cntrs >> 32;
>  
> -	asm volatile(PRECISE_LOOP_ASM
> -		     : "=b"(tmp), "=r"(tmp2), "=r"(tmp3)
> -		     : "a"(eax), "d"(edx), "c"(global_ctl),
> -		       "0"(N), "1"(buf)
> -		     : "edi");
> +	if (this_cpu_has(X86_FEATURE_CLFLUSH))
> +		_precise_loop_asm("clflush (%1)");
> +	else
> +		_precise_loop_asm("nop");
>  }
>  
>  static inline void loop(u64 cntrs)
> --
> 2.34.1
>
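
To flesh out the skip idea above, a rough sketch of what I had in mind
(untested; the helper name and where it would be called from in x86/pmu.c
are just placeholders, and gating on CLFLUSH alone mirrors what the patch
already checks):

	/* Hypothetical helper, not part of the patch under review. */
	static bool llc_miss_forcible(void)
	{
		/*
		 * Without clflush (or clflushopt) there is no reliable way
		 * to force an eviction, so hitting the LLC-miss lower bound
		 * is pure luck on large-cache parts like SPR.
		 */
		return this_cpu_has(X86_FEATURE_CLFLUSH);
	}

	...
	if (!llc_miss_forcible()) {
		report_skip("clflush not supported, skipping LLC events");
		return;
	}

report_skip() is the existing kvm-unit-tests reporting helper; the
alternative would be to drop the LLC entries from the gp_events table
when the feature bit is absent, rather than asserting a lower bound of 1.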