Re: [kvm-unit-tests RFC PATCH] arm/tlbflush.c: TLB flushing torture test [DEV]

Andrew Jones <drjones@xxxxxxxxxx> · Mon, 27 Jul 2015 12:32:38 +0200

On Mon, Jul 27, 2015 at 10:07:57AM +0100, Alex Bennée wrote:
> 
> Andrew Jones <drjones@xxxxxxxxxx> writes:
> 
> > On Fri, Jul 24, 2015 at 02:25:06PM +0100, Alex Bennée wrote:
> >> This adds a fairly brain dead torture test for TLB flushes intended for
> >> stressing the MTTCG QEMU build. It takes the usual -smp option for
> >> multiple CPUs.
> >> 
> >> By default it will do a TLBIALL flush after each cycle. If you pass
> >> -append "page" to the kernel it will take it in turns to flush each of
> >> the computation functions. At the moment it doesn't do any re-mapping of
> >> pages but maybe that is something that could be done in the future.
> >> 
> >> [DEV VERSION FOR COMMENT]
> >> Signed-off-by: Alex Bennée <alex.bennee@xxxxxxxxxx>
> >> ---
> >>  arm/tlbflush.c               | 163 +++++++++++++++++++++++++++++++++++++++++++
> >>  config/config-arm-common.mak |   4 +-
> >>  lib/arm/asm/mmu.h            |  11 +++
> >>  3 files changed, 177 insertions(+), 1 deletion(-)
> >>  create mode 100644 arm/tlbflush.c
> >> 
> >> diff --git a/arm/tlbflush.c b/arm/tlbflush.c
> >> new file mode 100644
> >> index 0000000..6eeff18
> >> --- /dev/null
> >> +++ b/arm/tlbflush.c
> >> @@ -0,0 +1,163 @@
> >> +#include <libcflat.h>
> >> +#include <asm/smp.h>
> >> +#include <asm/cpumask.h>
> >> +#include <asm/barrier.h>
> >> +#include <asm/mmu.h>
> >> +
> >> +#define SEQ_LENGTH 10
> >> +
> >> +static cpumask_t smp_test_complete;
> >> +static int flush_count = 100000;
> >> +static int flush_self = 1;
> >> +static int flush_page = 0;
> >> +
> >> +__attribute__((aligned(0x1000))) unsigned int hash_array(int length, unsigned int *array)
> >
> > You should use PAGE_SIZE instead of 0x1000 in these attributes, allowing
> > the test to also work for aarch64, as we're using 64k pages on
> > aarch64.
> 
> Good point.
> 
> >
> >> +{
> >> +	int i;
> >> +	unsigned int sum=0;
> >> +	for (i=0; i<length; i++)
> >> +	{
> >> +		unsigned int val = *array++;
> >> +		sum ^= val;
> >> +		sum ^= (val >> (val % 16));
> >> +		sum ^= (val << (val % 32));
> >> +	}
> >> +
> >> +	return sum;
> >> +}
> >> +
> >> +__attribute__((aligned(0x1000))) void create_fib_sequence(int length, unsigned int *array)
> >> +{
> >> +	int i;
> >> +
> >> +	/* first two values */
> >> +	array[0] = 0;
> >> +	array[1] = 1;
> >> +	for (i=2; i<length; i++)
> >> +	{
> >> +		array[i] = array[i-2] + array[i-1];
> >> +	}
> >> +}
> >> +
> >> +__attribute__((aligned(0x1000))) unsigned long long factorial(unsigned int n)
> >> +{
> >> +	unsigned int i;
> >> +	unsigned long long fac = 1;
> >> +	for (i=1; i<=n; i++)
> >> +	{
> >> +		fac = fac * i;
> >> +	}
> >> +	return fac;
> >> +}
> >> +
> >> +/* do some computationally expensive stuff, return a checksum of the
> >> + * results */
> >> +__attribute__((aligned(0x1000))) unsigned int do_computation(void)
> >> +{
> >> +	unsigned int fib_array[SEQ_LENGTH];
> >> +	unsigned long long facfib_array[SEQ_LENGTH];
> >> +	unsigned int fib_hash, facfib_hash;
> >> +	int cpu = smp_processor_id();
> >> +	int i, j;
> >> +	
> >> +	create_fib_sequence(SEQ_LENGTH, &fib_array[0]);
> >> +	fib_hash = hash_array(SEQ_LENGTH, &fib_array[0]);
> >> +	for (i=0; i<SEQ_LENGTH; i++) {
> >> +		for (j=0; j<fib_array[i]; j++) {
> >> +			facfib_array[i] = factorial(fib_array[i]+j);
> >> +		}
> >> +	}
> >> +	facfib_hash = 0;
> >> +	for (i=0; i<SEQ_LENGTH; i++) {
> >> +		for (j=0; j<fib_array[i]; j++) {
> >> +			facfib_hash ^= hash_array(sizeof(facfib_array)/sizeof(unsigned int), (unsigned int *)&facfib_array[0]);
> >> +		}
> >> +	}
> >> +
> >> +#if 0
> >> +	printf("CPU:%d FIBSEQ ", cpu);
> >> +	for (i=0; i<SEQ_LENGTH; i++)
> >> +		printf("%u,", fib_array[i]);
> >> +	printf("\n");
> >> +
> >> +	printf("CPU:%d FACFIB ", cpu);
> >> +	for (i=0; i<SEQ_LENGTH; i++)
> >> +		printf("%llu,", facfib_array[i]);
> >> +	printf("\n");
> >> +#endif
> >> +	
> >> +	return (fib_hash ^ facfib_hash);
> >> +}
> >> +
> >> +static void * pages[] = {&hash_array, &create_fib_sequence, &factorial, &do_computation};
> >
> > I can't comment on whether or not the complexity of do_computation is
> > necessary for your test, but it seems like overkill. Comments explaining
> > why it's necessary would be good.
> 
> OK. From QEMUs TCG point of view I just want to ensure I have more than two
> basic blocks per-page region so I can check the block-chaining in-page
> and jump caching intra-page which are both affected on flushes. A
> computationally complex routine with a known answer would be nicer
> though I guess.
> 
> >
> >> +
> >> +static void test_flush(void)
> >> +{
> >> +	int i, errors = 0;
> >> +	int cpu = smp_processor_id();
> >> +
> >> +	unsigned int ref;
> >> +
> >> +	printf("CPU%d online\n", cpu);
> >> +
> >> +	ref = do_computation();
> >
> > What makes you sure that the first time you do the computation
> > per cpu is correct? I think computing it externally, and saving
> > the result, i.e. 
> >
> > #define EXPECTED_RESULT 0x12345678
> >
> > would be more reliable.
> 
> OK.
> 
> >
> >> +
> >> +	for (i=0; i < flush_count; i++) {
> >> +		unsigned int this_ref = do_computation();
> >> +
> >> +		if (this_ref != ref) {
> >> +			errors++;
> >> +			printf("CPU%d: seq%d 0x%x!=0x%x\n",
> >> +				cpu, i, ref, this_ref);
> >> +		}
> >> +
> >> +		if ((i % 1000) == 0) {
> >> +			printf("CPU%d: seq%d\n", cpu, i);
> >> +		}
> >> +		
> >> +		if (flush_self) {
> >> +			if (flush_page) {
> >> +				int j = (i % (sizeof(pages)/sizeof(void *)));
> > libcflat.h has the ARRAY_SIZE macro
> 
> OK
> 
> >> +				flush_tlb_page((unsigned long)pages[j]);
> >> +			} else {
> >> +				flush_tlb_all();
> >> +			}
> >> +		}
> >> +	}
> >> +
> >> +	report("CPU%d: Done - Errors: %d\n", errors == 0, cpu, errors);
> >> +
> >> +	cpumask_set_cpu(cpu, &smp_test_complete);
> >> +	if (cpu != 0)
> >> +		halt();
> >> +}
> >> +
> >> +int main(int argc, char **argv)
> >> +{
> >> +	int cpu, i;
> >> +	
> >> +	report_prefix_push("tlbflush");
> >> +
> >> +	for (i=0; i<argc; i++) {
> >> +		char *arg = argv[i];
> >> +/* 		printf("arg:%d:%s\n", i, arg); */
> >> +
> >> +		if (strcmp(arg, "page") == 0) {
> >> +			report_prefix_push("page");
> >> +			flush_page = 1;
> >> +		}
> >> +	}
> >> +
> >> +	for_each_present_cpu(cpu) {
> >> +		if (cpu == 0)
> >> +			continue;
> >> +		smp_boot_secondary(cpu, test_flush);
> >> +	}
> >> +
> >> +	test_flush();
> >> +
> >> +	while (!cpumask_full(&smp_test_complete))
> >> +		cpu_relax();
> >> +
> >> +	return report_summary();
> >
> > As we use the kernel coding style you should run
> >
> > $KERNEL_SRC/scripts/checkpatch.pl -f arm/tlbflush.c
> >
> > Also, please rename to tlbflush-test.c to differentiate it
> > from an implementation of tlbflush support, and to make
> > the standalone test name (if we commit those patches) more
> > descriptive.
> 
> I'll have another poke at my editor config. It should have been setting
> the coding style automatically, although of course explicit local
> variables are better ;-)
> 
> >
> >> +}
> >> diff --git a/config/config-arm-common.mak b/config/config-arm-common.mak
> >> index 0674daa..5b14db4 100644
> >> --- a/config/config-arm-common.mak
> >> +++ b/config/config-arm-common.mak
> >> @@ -11,7 +11,8 @@ endif
> >>  
> >>  tests-common = \
> >>  	$(TEST_DIR)/selftest.flat \
> >> -	$(TEST_DIR)/spinlock-test.flat
> >> +	$(TEST_DIR)/spinlock-test.flat \
> >> +        $(TEST_DIR)/tlbflush.flat
> >
> > As we're adding tests faster now it's becoming clear that the '\' list
> > isn't so great. To add a new test at the bottom we always have to modify
> > the last line too. We should either add the new one at the top (right
> > below the 'test-common =' line), or change this to a '+=' sequence like
> > some other lists are done.
> >
> >>  
> >>  all: test_cases
> >>  
> >> @@ -72,3 +73,4 @@ test_cases: $(generated_files) $(tests-common) $(tests)
> >>  
> >>  $(TEST_DIR)/selftest.elf: $(cstart.o) $(TEST_DIR)/selftest.o
> >>  $(TEST_DIR)/spinlock-test.elf: $(cstart.o) $(TEST_DIR)/spinlock-test.o
> >> +$(TEST_DIR)/tlbflush.elf: $(cstart.o) $(TEST_DIR)/tlbflush.o
> >> diff --git a/lib/arm/asm/mmu.h b/lib/arm/asm/mmu.h
> >> index c1bd01c..2bb0cde 100644
> >> --- a/lib/arm/asm/mmu.h
> >> +++ b/lib/arm/asm/mmu.h
> >> @@ -14,8 +14,11 @@
> >>  #define PTE_AF			PTE_EXT_AF
> >>  #define PTE_WBWA		L_PTE_MT_WRITEALLOC
> >>  
> >> +/* See B3.18.7 TLB maintenance operations */
> >> +
> >>  static inline void local_flush_tlb_all(void)
> >>  {
> >> +	/* TLBIALL */
> >>  	asm volatile("mcr p15, 0, %0, c8, c7, 0" :: "r" (0));
> >>  	dsb();
> >>  	isb();
> >> @@ -27,6 +30,14 @@ static inline void flush_tlb_all(void)
> >>  	local_flush_tlb_all();
> >>  }
> >>  
> >> +static inline void flush_tlb_page(unsigned long vaddr)
> >> +{
> >> +	/* TLBIMVAA */
> >> +	asm volatile("mcr p15, 0, %0, c8, c7, 3" :: "r" (vaddr));
> >> +	dsb();
> >> +	isb();
> >> +}
> >> +
> >>  #include <asm/mmu-api.h>
> >>  
> >>  #endif /* __ASMARM_MMU_H_ */
> >
> > This mmu.h change looks good, but please add the arm64
> > flush_tlb_page at the same time. And anyway, I guess you'll
> > want your test to work for both arm and aarch64?
> 
> Yes I will. Currently the MTTCG is arm32 only but this will be expanded.

Actually, I'd also like the arm*/asm/mmu.h file changes to be in
a separate patch, they can be together in one patch, but that patch
should be separate from the unit test.

Thanks,
drew
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html