Quoting Chris Wilson (2017-08-23 13:55:55) > At the moment, the verify tests use an extremely brutal write-read of > every dword, degrading performance to UC. If we break those up into > cachelines, we can do a wcb write/read at a time instead, roughly 8x > faster. We lose the accuracy of the forced wcb flushes around every dword, > but we are retaining the overall behaviour of checking reads following > writes instead. To compensate, we do check that a single dword write/read works > before using wcb aligned accesses. > > Signed-off-by: Chris Wilson <chris@xxxxxxxxxxxxxxxxxx> -> Tumbleweed -> > --- > tests/gem_fence_thrash.c | 116 +++++++++++++++++++++++++++++++++++++++++------ > 1 file changed, 101 insertions(+), 15 deletions(-) > > diff --git a/tests/gem_fence_thrash.c b/tests/gem_fence_thrash.c > index 52095f26..3e1edb73 100644 > --- a/tests/gem_fence_thrash.c > +++ b/tests/gem_fence_thrash.c > @@ -30,7 +30,6 @@ > #include "config.h" > #endif > > -#include "igt.h" > #include <unistd.h> > #include <stdlib.h> > #include <stdio.h> > @@ -43,6 +42,12 @@ > #include <pthread.h> > #include "drm.h" > > +#include "igt.h" > +#include "igt_x86.h" > + > +#define PAGE_SIZE 4096 > +#define CACHELINE 64 > + > #define OBJECT_SIZE (128*1024) /* restricted to 1MiB alignment on i915 fences */ > > /* Before introduction of the LRU list for fences, allocation of a fence for a page > @@ -104,15 +109,78 @@ bo_copy (void *_arg) > return NULL; > } > > +#if defined(__x86_64__) && !defined(__clang__) > +#define MOVNT 512 > + > +#pragma GCC push_options > +#pragma GCC target("sse4.1") > + > +#include <smmintrin.h> > +__attribute__((noinline)) > +static void copy_wc_page(void *dst, void *src) > +{ > + if (igt_x86_features() & SSE4_1) { > + __m128i *S = (__m128i *)src; > + __m128i *D = (__m128i *)dst; > + > + for (int i = 0; i < PAGE_SIZE/CACHELINE; i++) { > + __m128i tmp[4]; > + > + tmp[0] = _mm_stream_load_si128(S++); > + tmp[1] = _mm_stream_load_si128(S++); > + tmp[2] = 
_mm_stream_load_si128(S++); > + tmp[3] = _mm_stream_load_si128(S++); > + > + _mm_store_si128(D++, tmp[0]); > + _mm_store_si128(D++, tmp[1]); > + _mm_store_si128(D++, tmp[2]); > + _mm_store_si128(D++, tmp[3]); > + } > + } else > + memcpy(dst, src, PAGE_SIZE); > +} > +static void copy_wc_cacheline(void *dst, void *src) > +{ > + if (igt_x86_features() & SSE4_1) { > + __m128i *S = (__m128i *)src; > + __m128i *D = (__m128i *)dst; > + __m128i tmp[4]; > + > + tmp[0] = _mm_stream_load_si128(S++); > + tmp[1] = _mm_stream_load_si128(S++); > + tmp[2] = _mm_stream_load_si128(S++); > + tmp[3] = _mm_stream_load_si128(S++); > + > + _mm_store_si128(D++, tmp[0]); > + _mm_store_si128(D++, tmp[1]); > + _mm_store_si128(D++, tmp[2]); > + _mm_store_si128(D++, tmp[3]); > + } else > + memcpy(dst, src, CACHELINE); > +} > + > +#pragma GCC pop_options > + > +#else > +static void copy_wc_page(void *dst, const void *src) > +{ > + memcpy(dst, src, PAGE_SIZE); > +} > +static void copy_wc_cacheline(void *dst, const void *src) > +{ > + memcpy(dst, src, CACHELINE); > +} > +#endif > + > static void > _bo_write_verify(struct test *t) > { > int fd = t->fd; > int i, k; > uint32_t **s; > - uint32_t v; > unsigned int dwords = OBJECT_SIZE >> 2; > const char *tile_str[] = { "none", "x", "y" }; > + uint32_t tmp[PAGE_SIZE/sizeof(uint32_t)]; > > igt_assert(t->tiling >= 0 && t->tiling <= I915_TILING_Y); > igt_assert_lt(0, t->num_surfaces); > @@ -124,21 +192,39 @@ _bo_write_verify(struct test *t) > s[k] = bo_create(fd, t->tiling); > > for (k = 0; k < t->num_surfaces; k++) { > - volatile uint32_t *a = s[k]; > - > - for (i = 0; i < dwords; i++) { > - a[i] = i; > - v = a[i]; > - igt_assert_f(v == i, > - "tiling %s: write failed at %d (%x)\n", > - tile_str[t->tiling], i, v); > + uint32_t *a = s[k]; > + > + a[0] = 0xdeadbeef; > + igt_assert_f(a[0] == 0xdeadbeef, > + "tiling %s: write failed at start (%x)\n", > + tile_str[t->tiling], a[0]); > + > + a[dwords - 1] = 0xc0ffee; > + igt_assert_f(a[dwords - 1] == 0xc0ffee, 
> + "tiling %s: write failed at end (%x)\n", > + tile_str[t->tiling], a[dwords - 1]); > + > + for (i = 0; i < dwords; i += CACHELINE/sizeof(uint32_t)) { > + for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++) > + a[i + j] = ~(i + j); > + > + copy_wc_cacheline(tmp, a + i); > + for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++) > + igt_assert_f(tmp[j] == ~(i+ j), > + "tiling %s: write failed at %d (%x)\n", > + tile_str[t->tiling], i + j, tmp[j]); > + > + for (int j = 0; j < CACHELINE/sizeof(uint32_t); j++) > + a[i + j] = i + j; > } > > - for (i = 0; i < dwords; i++) { > - v = a[i]; > - igt_assert_f(v == i, > - "tiling %s: verify failed at %d (%x)\n", > - tile_str[t->tiling], i, v); > + for (i = 0; i < dwords; i += PAGE_SIZE/sizeof(uint32_t)) { > + copy_wc_page(tmp, a + i); > + for (int j = 0; j < PAGE_SIZE/sizeof(uint32_t); j++) { > + igt_assert_f(tmp[j] == i + j, > + "tiling %s: verify failed at %d (%x)\n", > + tile_str[t->tiling], i + j, tmp[j]); > + } > } > } > > -- > 2.14.1 > _______________________________________________ Intel-gfx mailing list Intel-gfx@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/intel-gfx