Re: [kvm-unit-tests PATCH v5 11/11] new: arm/barrier-test for memory barriers

Alex Bennée <alex.bennee@xxxxxxxxxx> · Mon, 03 Aug 2015 17:06:26 +0100

alvise rigo <a.rigo@xxxxxxxxxxxxxxxxxxxxxx> writes:

> On Mon, Aug 3, 2015 at 12:30 PM, Alex Bennée <alex.bennee@xxxxxxxxxx> wrote:
>>
>> alvise rigo <a.rigo@xxxxxxxxxxxxxxxxxxxxxx> writes:
>>
>>> Hi Alex,
>>>
>>> Nice set of tests, they are proving to be helpful.
>>> One question below.
>>>
<snip>
>>>
>>> Why are we calling these last two instructions with the 'eq' suffix?
>>> Shouldn't we just strex r1, r0, [sptr] and then cmp r1, #0?
>>
>> Possibly, my armv7 is a little rusty. I'm just looking at tweaking this
>> test now so I'll try and clean that up.

Please find the updated test attached. I've also included some new test
modes. In theory the barrier test by itself should still fail but it
passes on real ARMv7 as well as TCG. I'm trying to run the test on a
heavier core-ed ARMv7 to check. I suspect we get away with it on
ARMv7-on-x86_64 due to the strong ordering of the x86.

The "excl" and "acqrel" tests now run without issue (although again
plain acqrel semantics shouldn't stop a race corrupting shared_value).

I'll tweak the v8 versions of the test tomorrow.

-- 
Alex Bennée

>From 0953549985134268bf9079a7a01b2631d8a4fdee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex@xxxxxxxxxx>
Date: Thu, 30 Jul 2015 15:13:33 +0000
Subject: [kvm-unit-tests PATCH] arm/barrier-test: add memory barrier tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This test has been written mainly to stress multi-threaded TCG behaviour
but will demonstrate failure by default on real hardware. The test takes
the following parameters:

  - "lock" use GCC's locking semantics
  - "atomic" use GCC's __atomic primitives
  - "barrier" use plain dmb() barriers
  - "wfelock" use WaitForEvent sleep
  - "excl" use load/store exclusive semantics
  - "acqrel" use acquire/release semantics

Also two more options allow the test to be tweaked

  - "noshuffle" disables the memory shuffling
  - "count=%ld" set your own per-CPU increment count

Signed-off-by: Alex Bennée <alex.bennee@xxxxxxxxxx>

---
v2
  - Don't use thumb style strexeq stuff
  - Add atomic, barrier and wfelock tests
  - Add count/noshuffle test controls
---
 arm/barrier-test.c           | 284 +++++++++++++++++++++++++++++++++++++++++++
 config/config-arm-common.mak |   2 +
 2 files changed, 286 insertions(+)
 create mode 100644 arm/barrier-test.c

diff --git a/arm/barrier-test.c b/arm/barrier-test.c
new file mode 100644
index 0000000..765f8f6
--- /dev/null
+++ b/arm/barrier-test.c
@@ -0,0 +1,284 @@
+#include <libcflat.h>
+#include <asm/smp.h>
+#include <asm/cpumask.h>
+#include <asm/barrier.h>
+#include <asm/mmu.h>
+
+#include <prng.h>
+
+#define MAX_CPUS 8
+
+/* How many increments to do */
+static int increment_count = 10000000;
+static int do_shuffle = 1;
+
+
+/* shared value all the tests attempt to safely increment */
+static unsigned int shared_value;
+
+/* PAGE_SIZE * uint32_t means we span several pages */
+static uint32_t memory_array[PAGE_SIZE];
+
+/* We use the alignment of the following to ensure accesses to locking
+ * and synchronisation primatives don't interfere with the page of the
+ * shared value
+ */
+__attribute__((aligned(PAGE_SIZE))) static unsigned int per_cpu_value[MAX_CPUS];
+__attribute__((aligned(PAGE_SIZE))) static cpumask_t smp_test_complete;
+__attribute__((aligned(PAGE_SIZE))) static int global_lock;
+
+struct isaac_ctx prng_context[MAX_CPUS];
+
+void (*inc_fn)(void);
+
+
+/* In any SMP setting this *should* fail due to cores stepping on
+ * each other updating the shared variable
+ */
+static void increment_shared(void)
+{
+	shared_value++;
+}
+
+/* GCC __sync primitives are deprecated in favour of __atomic */
+static void increment_shared_with_lock(void)
+{
+	while (__sync_lock_test_and_set(&global_lock, 1));
+	shared_value++;
+	__sync_lock_release(&global_lock);
+}
+
+/* In practice even __ATOMIC_RELAXED uses ARM's ldxr/stex exclusive
+ * semantics */
+static void increment_shared_with_atomic(void)
+{
+	__atomic_add_fetch(&shared_value, 1, __ATOMIC_SEQ_CST);
+}
+
+
+/* By themselves barriers do not gaurentee atomicity */
+static void increment_shared_with_barrier(void)
+{
+#if defined (__LP64__) || defined (_LP64)
+#else
+	asm volatile(
+	"	ldr	r0, [%[sptr]]\n"
+	"	dmb\n"
+	"	add     r0, r0, #0x1\n"
+	"	str	r1, r0, [%[sptr]]\n"
+	"	dmb\n"
+	: /* out */
+	: [sptr] "r" (&shared_value) /* in */
+	: "r0", "r1", "cc");
+#endif
+}
+
+/*
+ * Load/store exclusive with WFE (wait-for-event)
+ */
+
+static void increment_shared_with_wfelock(void)
+{
+#if defined (__LP64__) || defined (_LP64)
+#else
+	asm volatile(
+	"	mov     r1, #1\n"
+	"1:	ldrex	r0, [%[lock]]\n"
+	"	cmp     r0, #0\n"
+	"	wfene\n"
+	"	strexeq r0, r1, [%[lock]]\n"
+	"	cmpeq	r0, #0\n"
+	"	bne	1b\n"
+	"	dmb\n"
+	/* lock held */
+	"	ldr	r0, [%[sptr]]\n"
+	"	add	r0, r0, #0x1\n"
+	"	str	r0, [%[sptr]]\n"
+	/* now release */
+	"	mov	r0, #0\n"
+	"	dmb\n"
+	"	str	r0, [%[lock]]\n"
+	"	dsb\n"
+	"	sev\n"
+	: /* out */
+	: [lock] "r" (&global_lock), [sptr] "r" (&shared_value) /* in */
+	: "r0", "r1", "cc");
+#endif
+}
+
+
+/*
+ * Hand-written version of the load/store exclusive
+ */
+static void increment_shared_with_excl(void)
+{
+#if defined (__LP64__) || defined (_LP64)
+	asm volatile(
+	"1:	ldxr	w0, [%[sptr]]\n"
+	"	add     w0, w0, #0x1\n"
+	"	stxr	w1, w0, [%[sptr]]\n"
+	"	cbnz	w1, 1b\n"
+	: /* out */
+	: [sptr] "r" (&shared_value) /* in */
+	: "w0", "w1", "cc");
+#else
+	asm volatile(
+	"1:	ldrex	r0, [%[sptr]]\n"
+	"	add     r0, r0, #0x1\n"
+	"	strex	r1, r0, [%[sptr]]\n"
+	"	cmp	r1, #0\n"
+	"	bne	1b\n"
+	: /* out */
+	: [sptr] "r" (&shared_value) /* in */
+	: "r0", "r1", "cc");
+#endif
+}
+
+static void increment_shared_with_acqrel(void)
+{
+#if defined (__LP64__) || defined (_LP64)
+	asm volatile(
+	"	ldar	w0, [%[sptr]]\n"
+	"	add     w0, w0, #0x1\n"
+	"	str	w0, [%[sptr]]\n"
+	: /* out */
+	: [sptr] "r" (&shared_value) /* in */
+	: "w0");
+#else
+	/* ARMv7 has no acquire/release semantics but we
+	 * can ensure the results of the write are propagated
+	 * with the use of barriers.
+	 */
+	asm volatile(
+	"1:	ldrex	r0, [%[sptr]]\n"
+	"	add     r0, r0, #0x1\n"
+	"	strex	r1, r0, [%[sptr]]\n"
+	"	cmp	r1, #0\n"
+	"	bne	1b\n"
+	"	dmb\n"
+	: /* out */
+	: [sptr] "r" (&shared_value) /* in */
+	: "r0", "r1", "cc");
+#endif
+
+}
+
+/* The idea of this is just to generate some random load/store
+ * activity which may or may not race with an un-barried incremented
+ * of the shared counter
+ */
+static void shuffle_memory(int cpu)
+{
+	int i;
+	uint32_t lspat = isaac_next_uint32(&prng_context[cpu]);
+	uint32_t seq = isaac_next_uint32(&prng_context[cpu]);
+	int count = seq & 0x1f;
+	uint32_t val=0;
+
+	seq >>= 5;
+
+	for (i=0; i<count; i++) {
+		int index = seq & ~PAGE_MASK;
+		if (lspat & 1) {
+			val ^= memory_array[index];
+		} else {
+			memory_array[index] = val;
+		}
+		seq >>= PAGE_SHIFT;
+		seq ^= lspat;
+		lspat >>= 1;
+	}
+
+}
+
+static void do_increment(void)
+{
+	int i;
+	int cpu = smp_processor_id();
+
+	printf("CPU%d online\n", cpu);
+
+	for (i=0; i < increment_count; i++) {
+		per_cpu_value[cpu]++;
+		inc_fn();
+
+		if (do_shuffle)
+			shuffle_memory(cpu);
+	}
+
+	printf("CPU%d: Done, %d incs\n", cpu, per_cpu_value[cpu]);
+
+	cpumask_set_cpu(cpu, &smp_test_complete);
+	if (cpu != 0)
+		halt();
+}
+
+int main(int argc, char **argv)
+{
+	int cpu;
+	unsigned int i, sum = 0;
+	static const unsigned char seed[] = "myseed";
+
+	inc_fn = &increment_shared;
+
+	isaac_init(&prng_context[0], &seed[0], sizeof(seed));
+
+	for (i=0; i<argc; i++) {
+		char *arg = argv[i];
+
+		if (strcmp(arg, "lock") == 0) {
+			inc_fn = &increment_shared_with_lock;
+			report_prefix_push("lock");
+		} else if (strcmp(arg, "atomic") == 0) {
+			inc_fn = &increment_shared_with_atomic;
+			report_prefix_push("atomic");
+		} else if (strcmp(arg, "barrier") == 0) {
+			inc_fn = &increment_shared_with_atomic;
+			report_prefix_push("barrier");
+		} else if (strcmp(arg, "wfelock") == 0) {
+			inc_fn = &increment_shared_with_wfelock;
+			report_prefix_push("wfelock");
+		} else if (strcmp(arg, "excl") == 0) {
+			inc_fn = &increment_shared_with_excl;
+			report_prefix_push("excl");
+		} else if (strcmp(arg, "acqrel") == 0) {
+			inc_fn = &increment_shared_with_acqrel;
+			report_prefix_push("acqrel");
+		} else if (strcmp(arg, "noshuffle") == 0) {
+			do_shuffle = 0;
+			report_prefix_push("noshuffle");
+		} else if (strstr(arg, "count=") != NULL) {
+			char *p = strstr(arg, "=");
+			increment_count = atol(p+1);
+		} else {
+			isaac_reseed(&prng_context[0], (unsigned char *) arg, strlen(arg));
+		}
+	}
+
+	/* fill our random page */
+	for (i=0; i<PAGE_SIZE; i++) {
+		memory_array[i] = isaac_next_uint32(&prng_context[0]);
+	}
+
+	for_each_present_cpu(cpu) {
+		uint32_t seed2 = isaac_next_uint32(&prng_context[0]);
+		if (cpu == 0)
+			continue;
+
+		isaac_init(&prng_context[cpu], (unsigned char *) &seed2, sizeof(seed2));
+		smp_boot_secondary(cpu, do_increment);
+	}
+
+	do_increment();
+
+	while (!cpumask_full(&smp_test_complete))
+		cpu_relax();
+
+	/* All CPUs done, do we add up */
+	for_each_present_cpu(cpu) {
+		sum += per_cpu_value[cpu];
+	}
+	report("total incs %d", sum == shared_value, shared_value);
+
+	return report_summary();
+}
diff --git a/config/config-arm-common.mak b/config/config-arm-common.mak
index de2c4a3..4c5abb6 100644
--- a/config/config-arm-common.mak
+++ b/config/config-arm-common.mak
@@ -12,6 +12,7 @@ endif
 tests-common = $(TEST_DIR)/selftest.flat
 tests-common += $(TEST_DIR)/spinlock-test.flat
 tests-common += $(TEST_DIR)/tlbflush-test.flat
+tests-common += $(TEST_DIR)/barrier-test.flat
 
 ifneq ($(TEST),)
 	tests = $(TEST_DIR)/$(TEST).flat
@@ -78,3 +79,4 @@ $(TEST_DIR)/$(TEST).elf: $(cstart.o) $(TEST_DIR)/$(TEST).o
 $(TEST_DIR)/selftest.elf: $(cstart.o) $(TEST_DIR)/selftest.o
 $(TEST_DIR)/spinlock-test.elf: $(cstart.o) $(TEST_DIR)/spinlock-test.o
 $(TEST_DIR)/tlbflush-test.elf: $(cstart.o) $(TEST_DIR)/tlbflush-test.o
+$(TEST_DIR)/barrier-test.elf: $(cstart.o) $(TEST_DIR)/barrier-test.o
-- 
2.5.0