alvise rigo <a.rigo@xxxxxxxxxxxxxxxxxxxxxx> writes: > On Mon, Aug 3, 2015 at 12:30 PM, Alex Bennée <alex.bennee@xxxxxxxxxx> wrote: >> >> alvise rigo <a.rigo@xxxxxxxxxxxxxxxxxxxxxx> writes: >> >>> Hi Alex, >>> >>> Nice set of tests, they are proving to be helpful. >>> One question below. >>> <snip> >>> >>> Why are we calling these last two instructions with the 'eq' suffix? >>> Shouldn't we just strex r1, r0, [sptr] and then cmp r1, #0? >> >> Possibly, my armv7 is a little rusty. I'm just looking at tweaking this >> test now so I'll try and clean that up. Please find the updated test attached. I've also included some new test modes. In theory the barrier test by itself should still fail but it passes on real ARMv7 as well as TCG. I'm trying to run the test on a heavier core-ed ARMv7 to check. I suspect we get away with it on ARMv7-on-x86_64 due to the strong ordering of the x86. The "excl" and "acqrel" tests now run without issue (although again plain acqrel semantics shouldn't stop a race corrupting shared_value). I'll tweak the v8 versions of the test tomorrow. -- Alex Bennée
>From 0953549985134268bf9079a7a01b2631d8a4fdee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex@xxxxxxxxxx> Date: Thu, 30 Jul 2015 15:13:33 +0000 Subject: [kvm-unit-tests PATCH] arm/barrier-test: add memory barrier tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This test has been written mainly to stress multi-threaded TCG behaviour but will demonstrate failure by default on real hardware. The test takes the following parameters: - "lock" use GCC's locking semantics - "atomic" use GCC's __atomic primitives - "barrier" use plain dmb() barriers - "wfelock" use WaitForEvent sleep - "excl" use load/store exclusive semantics - "acqrel" use acquire/release semantics Also two more options allow the test to be tweaked - "noshuffle" disables the memory shuffling - "count=%ld" set your own per-CPU increment count Signed-off-by: Alex Bennée <alex.bennee@xxxxxxxxxx> --- v2 - Don't use thumb style strexeq stuff - Add atomic, barrier and wfelock tests - Add count/noshuffle test controls --- arm/barrier-test.c | 284 +++++++++++++++++++++++++++++++++++++++++++ config/config-arm-common.mak | 2 + 2 files changed, 286 insertions(+) create mode 100644 arm/barrier-test.c diff --git a/arm/barrier-test.c b/arm/barrier-test.c new file mode 100644 index 0000000..765f8f6 --- /dev/null +++ b/arm/barrier-test.c @@ -0,0 +1,284 @@ +#include <libcflat.h> +#include <asm/smp.h> +#include <asm/cpumask.h> +#include <asm/barrier.h> +#include <asm/mmu.h> + +#include <prng.h> + +#define MAX_CPUS 8 + +/* How many increments to do */ +static int increment_count = 10000000; +static int do_shuffle = 1; + + +/* shared value all the tests attempt to safely increment */ +static unsigned int shared_value; + +/* PAGE_SIZE * uint32_t means we span several pages */ +static uint32_t memory_array[PAGE_SIZE]; + +/* We use the alignment of the following to ensure accesses to locking + * and synchronisation primatives don't interfere with the page of the + * shared value + */ +__attribute__((aligned(PAGE_SIZE))) static unsigned int per_cpu_value[MAX_CPUS]; +__attribute__((aligned(PAGE_SIZE))) static cpumask_t smp_test_complete; +__attribute__((aligned(PAGE_SIZE))) static int global_lock; + +struct isaac_ctx prng_context[MAX_CPUS]; + +void (*inc_fn)(void); + + +/* In any SMP setting this *should* fail due to cores stepping on + * each other updating the shared variable + */ +static void increment_shared(void) +{ + shared_value++; +} + +/* GCC __sync primitives are deprecated in favour of __atomic */ +static void increment_shared_with_lock(void) +{ + while (__sync_lock_test_and_set(&global_lock, 1)); + shared_value++; + __sync_lock_release(&global_lock); +} + +/* In practice even __ATOMIC_RELAXED uses ARM's ldxr/stex exclusive + * semantics */ +static void increment_shared_with_atomic(void) +{ + __atomic_add_fetch(&shared_value, 1, __ATOMIC_SEQ_CST); +} + + +/* By themselves barriers do not gaurentee atomicity */ +static void increment_shared_with_barrier(void) +{ +#if defined (__LP64__) || defined (_LP64) +#else + asm volatile( + " ldr r0, [%[sptr]]\n" + " dmb\n" + " add r0, r0, #0x1\n" + " str r1, r0, [%[sptr]]\n" + " dmb\n" + : /* out */ + : [sptr] "r" (&shared_value) /* in */ + : "r0", "r1", "cc"); +#endif +} + +/* + * Load/store exclusive with WFE (wait-for-event) + */ + +static void increment_shared_with_wfelock(void) +{ +#if defined (__LP64__) || defined (_LP64) +#else + asm volatile( + " mov r1, #1\n" + "1: ldrex r0, [%[lock]]\n" + " cmp r0, #0\n" + " wfene\n" + " strexeq r0, r1, [%[lock]]\n" + " cmpeq r0, #0\n" + " bne 1b\n" + " dmb\n" + /* lock held */ + " ldr r0, [%[sptr]]\n" + " add r0, r0, #0x1\n" + " str r0, [%[sptr]]\n" + /* now release */ + " mov r0, #0\n" + " dmb\n" + " str r0, [%[lock]]\n" + " dsb\n" + " sev\n" + : /* out */ + : [lock] "r" (&global_lock), [sptr] "r" (&shared_value) /* in */ + : "r0", "r1", "cc"); +#endif +} + + +/* + * Hand-written version of the load/store exclusive + */ +static void increment_shared_with_excl(void) +{ +#if defined (__LP64__) || defined (_LP64) + asm volatile( + "1: ldxr w0, [%[sptr]]\n" + " add w0, w0, #0x1\n" + " stxr w1, w0, [%[sptr]]\n" + " cbnz w1, 1b\n" + : /* out */ + : [sptr] "r" (&shared_value) /* in */ + : "w0", "w1", "cc"); +#else + asm volatile( + "1: ldrex r0, [%[sptr]]\n" + " add r0, r0, #0x1\n" + " strex r1, r0, [%[sptr]]\n" + " cmp r1, #0\n" + " bne 1b\n" + : /* out */ + : [sptr] "r" (&shared_value) /* in */ + : "r0", "r1", "cc"); +#endif +} + +static void increment_shared_with_acqrel(void) +{ +#if defined (__LP64__) || defined (_LP64) + asm volatile( + " ldar w0, [%[sptr]]\n" + " add w0, w0, #0x1\n" + " str w0, [%[sptr]]\n" + : /* out */ + : [sptr] "r" (&shared_value) /* in */ + : "w0"); +#else + /* ARMv7 has no acquire/release semantics but we + * can ensure the results of the write are propagated + * with the use of barriers. + */ + asm volatile( + "1: ldrex r0, [%[sptr]]\n" + " add r0, r0, #0x1\n" + " strex r1, r0, [%[sptr]]\n" + " cmp r1, #0\n" + " bne 1b\n" + " dmb\n" + : /* out */ + : [sptr] "r" (&shared_value) /* in */ + : "r0", "r1", "cc"); +#endif + +} + +/* The idea of this is just to generate some random load/store + * activity which may or may not race with an un-barried incremented + * of the shared counter + */ +static void shuffle_memory(int cpu) +{ + int i; + uint32_t lspat = isaac_next_uint32(&prng_context[cpu]); + uint32_t seq = isaac_next_uint32(&prng_context[cpu]); + int count = seq & 0x1f; + uint32_t val=0; + + seq >>= 5; + + for (i=0; i<count; i++) { + int index = seq & ~PAGE_MASK; + if (lspat & 1) { + val ^= memory_array[index]; + } else { + memory_array[index] = val; + } + seq >>= PAGE_SHIFT; + seq ^= lspat; + lspat >>= 1; + } + +} + +static void do_increment(void) +{ + int i; + int cpu = smp_processor_id(); + + printf("CPU%d online\n", cpu); + + for (i=0; i < increment_count; i++) { + per_cpu_value[cpu]++; + inc_fn(); + + if (do_shuffle) + shuffle_memory(cpu); + } + + printf("CPU%d: Done, %d incs\n", cpu, per_cpu_value[cpu]); + + cpumask_set_cpu(cpu, &smp_test_complete); + if (cpu != 0) + halt(); +} + +int main(int argc, char **argv) +{ + int cpu; + unsigned int i, sum = 0; + static const unsigned char seed[] = "myseed"; + + inc_fn = &increment_shared; + + isaac_init(&prng_context[0], &seed[0], sizeof(seed)); + + for (i=0; i<argc; i++) { + char *arg = argv[i]; + + if (strcmp(arg, "lock") == 0) { + inc_fn = &increment_shared_with_lock; + report_prefix_push("lock"); + } else if (strcmp(arg, "atomic") == 0) { + inc_fn = &increment_shared_with_atomic; + report_prefix_push("atomic"); + } else if (strcmp(arg, "barrier") == 0) { + inc_fn = &increment_shared_with_atomic; + report_prefix_push("barrier"); + } else if (strcmp(arg, "wfelock") == 0) { + inc_fn = &increment_shared_with_wfelock; + report_prefix_push("wfelock"); + } else if (strcmp(arg, "excl") == 0) { + inc_fn = &increment_shared_with_excl; + report_prefix_push("excl"); + } else if (strcmp(arg, "acqrel") == 0) { + inc_fn = &increment_shared_with_acqrel; + report_prefix_push("acqrel"); + } else if (strcmp(arg, "noshuffle") == 0) { + do_shuffle = 0; + report_prefix_push("noshuffle"); + } else if (strstr(arg, "count=") != NULL) { + char *p = strstr(arg, "="); + increment_count = atol(p+1); + } else { + isaac_reseed(&prng_context[0], (unsigned char *) arg, strlen(arg)); + } + } + + /* fill our random page */ + for (i=0; i<PAGE_SIZE; i++) { + memory_array[i] = isaac_next_uint32(&prng_context[0]); + } + + for_each_present_cpu(cpu) { + uint32_t seed2 = isaac_next_uint32(&prng_context[0]); + if (cpu == 0) + continue; + + isaac_init(&prng_context[cpu], (unsigned char *) &seed2, sizeof(seed2)); + smp_boot_secondary(cpu, do_increment); + } + + do_increment(); + + while (!cpumask_full(&smp_test_complete)) + cpu_relax(); + + /* All CPUs done, do we add up */ + for_each_present_cpu(cpu) { + sum += per_cpu_value[cpu]; + } + report("total incs %d", sum == shared_value, shared_value); + + return report_summary(); +} diff --git a/config/config-arm-common.mak b/config/config-arm-common.mak index de2c4a3..4c5abb6 100644 --- a/config/config-arm-common.mak +++ b/config/config-arm-common.mak @@ -12,6 +12,7 @@ endif tests-common = $(TEST_DIR)/selftest.flat tests-common += $(TEST_DIR)/spinlock-test.flat tests-common += $(TEST_DIR)/tlbflush-test.flat +tests-common += $(TEST_DIR)/barrier-test.flat ifneq ($(TEST),) tests = $(TEST_DIR)/$(TEST).flat @@ -78,3 +79,4 @@ $(TEST_DIR)/$(TEST).elf: $(cstart.o) $(TEST_DIR)/$(TEST).o $(TEST_DIR)/selftest.elf: $(cstart.o) $(TEST_DIR)/selftest.o $(TEST_DIR)/spinlock-test.elf: $(cstart.o) $(TEST_DIR)/spinlock-test.o $(TEST_DIR)/tlbflush-test.elf: $(cstart.o) $(TEST_DIR)/tlbflush-test.o +$(TEST_DIR)/barrier-test.elf: $(cstart.o) $(TEST_DIR)/barrier-test.o -- 2.5.0