fma4, sse4a, xop Signed-off-by: Jiří Župka <jzupka@xxxxxxxxxx> --- client/virt/deps/test_cpu_flags/Makefile | 83 +++++++++++------------ client/virt/deps/test_cpu_flags/aes.c | 13 +++- client/virt/deps/test_cpu_flags/avx.c | 30 +++++---- client/virt/deps/test_cpu_flags/cpuflags-test.c | 57 +++++++++++----- client/virt/deps/test_cpu_flags/fma4.c | 31 +++++++++ client/virt/deps/test_cpu_flags/pclmul.c | 10 ++- client/virt/deps/test_cpu_flags/rdrand.c | 6 +- client/virt/deps/test_cpu_flags/sse3.c | 12 +++- client/virt/deps/test_cpu_flags/sse4.c | 25 ++++++-- client/virt/deps/test_cpu_flags/sse4a.c | 37 ++++++++++ client/virt/deps/test_cpu_flags/ssse3.c | 14 +++- client/virt/deps/test_cpu_flags/stress.c | 6 ++ client/virt/deps/test_cpu_flags/tests.h | 38 ++++++++-- client/virt/deps/test_cpu_flags/xop.c | 48 +++++++++++++ 14 files changed, 309 insertions(+), 101 deletions(-) create mode 100644 client/virt/deps/test_cpu_flags/fma4.c create mode 100644 client/virt/deps/test_cpu_flags/sse4a.c create mode 100644 client/virt/deps/test_cpu_flags/xop.c diff --git a/client/virt/deps/test_cpu_flags/Makefile b/client/virt/deps/test_cpu_flags/Makefile index 5b77740..b95c36e 100644 --- a/client/virt/deps/test_cpu_flags/Makefile +++ b/client/virt/deps/test_cpu_flags/Makefile @@ -3,57 +3,42 @@ MKDIR = mkdir -p OPTFLAGS=-O3 CFLAGS= -m64 ${OPTFLAGS} -std=c99 -pipe \ - -ftree-vectorize -ftree-vectorizer-verbose=0 \ + -ftree-vectorize \ -ffast-math \ -fopenmp \ -CFLAGSAVX= -m64 ${OPTFLAGS} -std=c99 -pipe \ - -ftree-vectorize -ftree-vectorizer-verbose=0 \ - -ffast-math \ - -mavx \ - -fopenmp \ +CFLAGSAVX= ${CFLAGS} \ + -mavx \ -CFLAGSSSE4= -m64 ${OPTFLAGS} -std=c99 -pipe \ - -ftree-vectorize -ftree-vectorizer-verbose=0 \ - -ffast-math \ +CFLAGSFMA4= ${CFLAGS} \ + -mfma4 \ + +CFLAGSSSE4A= ${CFLAGS} \ + -msse4a \ + +CFLAGSSSE4= ${CFLAGS} \ -msse4 -msse4.1 -msse4.2 \ - -fopenmp \ -CFLAGSSSSE3= -m64 ${OPTFLAGS} -std=c99 -pipe \ - -ftree-vectorize -ftree-vectorizer-verbose=0 \ - -ffast-math \ +CFLAGSSSSE3= ${CFLAGS} \ -mssse3 \ - -fopenmp \ -CFLAGSSSE3= -m64 ${OPTFLAGS} -std=c99 -pipe \ - -ftree-vectorize -ftree-vectorizer-verbose=0 \ - -ffast-math \ +CFLAGSSSE3= ${CFLAGS} \ -msse3 \ - -fopenmp \ -CFLAGSAES= -m64 ${OPTFLAGS} -std=c99 -pipe \ - -ftree-vectorize -ftree-vectorizer-verbose=0 \ - -ffast-math \ +CFLAGSAES= ${CFLAGS} \ -maes \ - -fopenmp \ -CFLAGSPCLMUL= -m64 ${OPTFLAGS} -std=c99 -pipe \ - -ftree-vectorize -ftree-vectorizer-verbose=0 \ - -ffast-math \ +CFLAGSPCLMUL= ${CFLAGS} \ -mpclmul \ - -fopenmp \ -CFLAGSRDRAND= -m64 ${OPTFLAGS} -std=c99 -pipe \ - -ftree-vectorize -ftree-vectorizer-verbose=0 \ - -ffast-math \ +CFLAGSRDRAND= ${CFLAGS} \ -mrdrnd \ - -fopenmp \ -CFLAGSSTRESS= -m64 ${OPTFLAGS} -std=c99 -pipe \ - -ftree-vectorize -ftree-vectorizer-verbose=0 \ - -ffast-math \ +CFLAGSXOP= ${CFLAGS} \ + -mxop \ + +CFLAGSSTRESS= ${CFLAGS} \ $(EXTRA_FLAGS) \ - -fopenmp \ CXX=g++ CC=gcc @@ -66,39 +51,51 @@ default:cpuflags-test all:cpuflags-test -cpuflags-test: avx.o sse4.o ssse3.o sse3.o aes.o pclmul.o rdrand.o stress.o +cpuflags-test: avx.o fma4.o xop.o sse4a.o sse4.o ssse3.o sse3.o aes.o pclmul.o rdrand.o stress.o $(CC) $(CFLAGS) $(LIBS) cpuflags-test.c -o cpuflags-test \ aes.o \ pclmul.o \ rdrand.o \ avx.o \ + fma4.o \ + xop.o \ + sse4a.o \ sse4.o \ ssse3.o \ sse3.o \ stress.o \ -aes.o: aes.c +aes.o: aes.c tests.h $(CC) $(CFLAGSAES) $(LIBS) -c aes.c -pclmul.o: pclmul.c +pclmul.o: pclmul.c tests.h $(CC) $(CFLAGSPCLMUL) $(LIBS) -c pclmul.c -rdrand.o: rdrand.c +rdrand.o: rdrand.c tests.h $(CC) $(CFLAGSRDRAND) $(LIBS) -c rdrand.c -avx.o: avx.c +fma4.o: fma4.c tests.h + $(CC) $(CFLAGSFMA4) $(LIBS) -c fma4.c + +xop.o: xop.c tests.h + $(CC) $(CFLAGSXOP) $(LIBS) -c xop.c + +avx.o: avx.c tests.h $(CC) $(CFLAGSAVX) $(LIBS) -c avx.c -sse4.o: sse4.c +sse4a.o: sse4a.c tests.h + $(CC) $(CFLAGSSSE4A) $(LIBS) -c sse4a.c + +sse4.o: sse4.c tests.h $(CC) $(CFLAGSSSE4) $(LIBS) -c sse4.c -ssse3.o: ssse3.c +ssse3.o: ssse3.c tests.h $(CC) $(CFLAGSSSSE3) $(LIBS) -c ssse3.c -sse3.o: sse3.c +sse3.o: sse3.c tests.h $(CC) $(CFLAGSSSE3) $(LIBS) -c sse3.c -stress.o: stress.c +stress.o: stress.c tests.h $(CC) $(CFLAGSSTRESS) $(LIBS) -c stress.c ARCHIVE= cpuflags-test diff --git a/client/virt/deps/test_cpu_flags/aes.c b/client/virt/deps/test_cpu_flags/aes.c index b8dc5cc..7132ec7 100644 --- a/client/virt/deps/test_cpu_flags/aes.c +++ b/client/virt/deps/test_cpu_flags/aes.c @@ -7,8 +7,10 @@ #include "tests.h" +#define result (5931894172722287318L) + #ifdef __AES__ -void aes(){ +int aes(){ __ma128i v1; __ma128i v2; for (int i = 1;i >= 0; i--){ @@ -17,10 +19,15 @@ void aes(){ } __ma128i v3; v3.i = _mm_aesdeclast_si128(v1.i, v2.i); - printf("[%d %d %d]\n",v1.ui64[0],v2.ui64[0],v3.ui64[0]); + if (v3.ui64[0] != result){ + printf("Correct: %ld result: %ld\n", result, v3.ui64[0]); + return -1; + } + return 0; } #else -void aes(){ +int aes(){ printf("AES is not supported."); + return 0; } #endif diff --git a/client/virt/deps/test_cpu_flags/avx.c b/client/virt/deps/test_cpu_flags/avx.c index bf06929..179c51b 100644 --- a/client/virt/deps/test_cpu_flags/avx.c +++ b/client/virt/deps/test_cpu_flags/avx.c @@ -7,15 +7,8 @@ #include "tests.h" #ifdef __AVX__ - -typedef union __attribute__ ((aligned(32))){ - __m256 v; - float f32[8]; -} __mar256; - - -void avx(){ - __mar256 a,b; +int avx(){ + __ma256 a,b,c; __m256 ymm0; __m256 ymm1; @@ -27,17 +20,26 @@ void avx(){ ymm0 = _mm256_load_ps(a.f32); ymm1 = _mm256_load_ps(b.f32); - __mar256 ymm3; - ymm3.v = _mm256_sub_ps(ymm0,ymm1); - _mm256_store_ps(b.f32, ymm3.v ); + __ma256 ymm3; + ymm3.f = _mm256_sub_ps(ymm0,ymm1); + _mm256_store_ps(c.f32, ymm3.f); for (int i = 0;i < 8; i++){ - printf("[%f]\n", b.f32[i]); + if (((a.f32[i] - b.f32[i]) - c.f32[i]) > FLT_EPSILON){ + printf("Wrong result:\n"); + for (int i = 0;i < 8; i++){ + printf("Correct: %f result: %f\n", a.f32[i] - b.f32[i], + c.f32[i]); + } + return -1; + } } + return 0; } #endif #ifndef __AVX__ -void avx(){ +int avx(){ printf("AVX is not supported."); + return 0; } #endif diff --git a/client/virt/deps/test_cpu_flags/cpuflags-test.c b/client/virt/deps/test_cpu_flags/cpuflags-test.c index 483561c..0d7200e 100644 --- a/client/virt/deps/test_cpu_flags/cpuflags-test.c +++ b/client/virt/deps/test_cpu_flags/cpuflags-test.c @@ -5,12 +5,16 @@ void print_help(){ printf( - " --sse4 test sse4 instruction.\n" + " --sse3 test sse3 instruction.\n" " --ssse3 test ssse3 instruction.\n" + " --sse4 test sse4 instruction.\n" + " --sse4a test sse4a instruction.\n" " --avx test avx instruction.\n" " --aes test aes instruction.\n" " --pclmul test carry less multiplication.\n" " --rdrand test rdrand instruction.\n" + " --fma4 test fma4 instruction.\n" + " --xop test fma4 instruction.\n" " --stress n_cpus,avx,aes start stress on n_cpus.and cpuflags\n"); } @@ -35,6 +39,9 @@ inst parse_Inst(char * optarg){ else if(strcmp(pch,"sse4") == 0){ i.sse4 = 1; } + else if(strcmp(pch,"sse4a") == 0){ + i.sse4a = 1; + } else if(strcmp(pch,"avx") == 0){ i.avx = 1; } @@ -47,6 +54,12 @@ inst parse_Inst(char * optarg){ else if(strcmp(pch,"rdrand") == 0){ i.rdrand = 1; } + else if(strcmp(pch,"fma4") == 0){ + i.fma4 = 1; + } + else if(strcmp(pch,"xop") == 0){ + i.xop = 1; + } pch = strtok (NULL, ","); } return i; @@ -57,18 +70,22 @@ int main(int argc, char **argv) { int digit_optind = 0; int opt_count = 0; + int ret = 0; while (1) { int this_option_optind = optind ? optind : 1; int option_index = 0; static struct option long_options[] = - {{ "sse3", no_argument, 0, 0 }, + {{ "stress",required_argument, 0, 0 }, + { "sse3", no_argument, 0, 0 }, { "ssse3", no_argument, 0, 0 }, { "sse4", no_argument, 0, 0 }, + { "sse4a", no_argument, 0, 0 }, { "avx", no_argument, 0, 0 }, { "aes", no_argument, 0, 0 }, { "pclmul", no_argument, 0, 0 }, { "rdrand", no_argument, 0, 0 }, - { "stress", required_argument, 0, 0 }, + { "fma4", no_argument, 0, 0 }, + { "xop", no_argument, 0, 0 }, { 0, 0, 0, 0}}; c = getopt_long(argc, argv, "", long_options, &option_index); @@ -80,37 +97,41 @@ int main(int argc, char **argv) { switch (c) { case 0: - printf("option %s", long_options[option_index].name); - if (optarg) - printf(" with arg %s", optarg); - printf("\n"); switch (option_index) { case 0: - sse3(); + stress(parse_Inst(optarg)); break; case 1: - ssse3(); + ret += sse3(); break; case 2: - sse4(); + ret += ssse3(); break; case 3: - avx(); + ret += sse4(); break; case 4: - aes(); + ret += sse4a(); break; case 5: - pclmul(); + ret += avx(); break; case 6: - rdrand(); + ret += aes(); break; case 7: - stress(parse_Inst(optarg)); + ret += pclmul(); + break; + case 8: + ret += rdrand(); + break; + case 9: + ret += fma4(); + break; + case 10: + ret += xop(); break; } - printf("\n"); break; case '?': @@ -123,5 +144,9 @@ int main(int argc, char **argv) { } opt_count += 1; } + if (ret > 0) { + printf("%d test fail.\n", ret); + exit(-1); + } exit(0); } diff --git a/client/virt/deps/test_cpu_flags/fma4.c b/client/virt/deps/test_cpu_flags/fma4.c new file mode 100644 index 0000000..48739e1 --- /dev/null +++ b/client/virt/deps/test_cpu_flags/fma4.c @@ -0,0 +1,31 @@ +/* + * fma4.c + * + * Created on: Nov 29, 2011 + * Author: jzupka + */ +#include "tests.h" + +#ifdef __FMA4__ + +int fma4(){ + __ma256 a, b, c, d; + int i; + for (i = 0; i < 4; i++) { + a.d64[i] = i; + b.d64[i] = 2.; + c.d64[i] = 3.; + } + d.d = _mm256_macc_pd(a.d, b.d, c.d); + for (i = 0; i < 4; i++) printf(" %.3lf", d.d64[i]); + printf("\n"); + return 0; +} + +#endif +#ifndef __FMA4__ +int fma4(){ + printf("FMA4 is not supported."); + return 0; +} +#endif diff --git a/client/virt/deps/test_cpu_flags/pclmul.c b/client/virt/deps/test_cpu_flags/pclmul.c index 3387a17..1877e8b 100644 --- a/client/virt/deps/test_cpu_flags/pclmul.c +++ b/client/virt/deps/test_cpu_flags/pclmul.c @@ -8,7 +8,7 @@ #include "tests.h" #ifdef __PCLMUL__ -void pclmul(){ +int pclmul(){ __ma128i v1; __ma128i v2; for (int i = 1;i >= 0; i--){ @@ -17,10 +17,14 @@ void pclmul(){ } __ma128i v3; v3.i = _mm_clmulepi64_si128(v1.i, v2.i, 0); - printf("[%d %d %d]\n",v1.ui64[0],v2.ui64[0],v3.ui64[0]); + if (v3.ui64[0] != 5) + printf("Correct: %d result: %d\n", 5, v3.ui64[0]); + return -1; + return 0; } #else -void pclmul(){ +int pclmul(){ printf("PCMUL is not supported."); + return 0; } #endif diff --git a/client/virt/deps/test_cpu_flags/rdrand.c b/client/virt/deps/test_cpu_flags/rdrand.c index f9d1b76..8a6cb58 100644 --- a/client/virt/deps/test_cpu_flags/rdrand.c +++ b/client/virt/deps/test_cpu_flags/rdrand.c @@ -8,7 +8,7 @@ #include "tests.h" #ifdef __RDRND__ -void rdrand() +int rdrand() { int val, num=1; while (num--) { @@ -19,9 +19,11 @@ void rdrand() __asm volatile("movl %%eax,%0" : "=m"(val)); printf("Random is %d\n",val); } + return 0; } #else -void rdrand(){ +int rdrand(){ printf("RDRAND is not supported."); + return 0; } #endif diff --git a/client/virt/deps/test_cpu_flags/sse3.c b/client/virt/deps/test_cpu_flags/sse3.c index 18d2643..fd38821 100644 --- a/client/virt/deps/test_cpu_flags/sse3.c +++ b/client/virt/deps/test_cpu_flags/sse3.c @@ -9,7 +9,7 @@ #include "tests.h" #ifdef __SSE3__ -void sse3(){ +int sse3(){ __ma128f v1; __ma128f v2; for (int i = 4;i >= 0; i--){ @@ -18,11 +18,17 @@ void sse3(){ } __ma128f vo; vo.f = _mm_addsub_ps(v1.f,v2.f); - printf("[%f]\n", vo.f32[3]); + if (abs(vo.f32[3] - (v1.f32[3]+v2.f32[3])) < FLT_EPSILON){ + return 0; + }else{ + printf("Correct: %f result: %f\n",v1.f32[3]+v2.f32[3], vo.f32[3]); + return -1; + } } #else -void sse3(){ +int sse3(){ printf("SSE3 is not supported."); + return 0; } #endif diff --git a/client/virt/deps/test_cpu_flags/sse4.c b/client/virt/deps/test_cpu_flags/sse4.c index f9b60fb..0f0a5fb 100644 --- a/client/virt/deps/test_cpu_flags/sse4.c +++ b/client/virt/deps/test_cpu_flags/sse4.c @@ -8,21 +8,36 @@ #include "tests.h" #if (defined __SSE4_1__ || defined __SSE4_2__) -void sse4(){ +int sse4(){ __ma128i v1; __ma128i v2; - for (int i = 16;i >= 0; i--){ + for (int i = 15;i >= 0; i--){ v1.ui8[i] = i; v2.ui8[i] = 16-i; } __ma128i v3; v3.i = _mm_max_epi8(v1.i,v2.i); - for (int i = 15;i >= 0; i--){ - printf("max[%d]\n",v3.ui8[i]); + int ret = 0; + for (int i = 0;i < 16; i++){ + if (v1.ui8[i] < v2.ui8[i]){ + if (v3.ui8[i] != v2.ui8[i]) + ret = 1; + }else{ + if (v3.ui8[i] != v1.ui8[i]) + ret = 1; + } + } + if (ret){ + printf("Wrong result:\n"); + for (int i = 15;i >= 0; i--){ + printf("max[%d]\n",v3.ui8[i]); + } } + return ret; } #else -void sse4(){ +int sse4(){ printf("SSE4 is not supported."); + return 0; } #endif diff --git a/client/virt/deps/test_cpu_flags/sse4a.c b/client/virt/deps/test_cpu_flags/sse4a.c new file mode 100644 index 0000000..a5fbcd9 --- /dev/null +++ b/client/virt/deps/test_cpu_flags/sse4a.c @@ -0,0 +1,37 @@ +/* + * sse4a.c + * + * Created on: Nov 29, 2011 + * Author: jzupka + */ +#include "tests.h" + +#ifdef __SSE4A__ + +int sse4a(){ + __ma128f v; + double d[2]; + d[0] = -1.; + d[1] = -2.; + v.d64[0] = 0.; + v.d64[1] = 1.; + _mm_stream_sd(&d[0], v.d); + for (int i = 0;i < 2; i++){ + if (v.d64[i] != d[i]){ + printf("Wrong result:\n"); + for (int i = 0;i < 2; i++){ + printf("Correct: %f result: %f\n", d[i], v.d64[i]); + } + return -1; + } + } + return 0; +} + +#endif +#ifndef __SSE4A__ +int sse4a(){ + printf("SSE4A is not supported."); + return 0; +} +#endif diff --git a/client/virt/deps/test_cpu_flags/ssse3.c b/client/virt/deps/test_cpu_flags/ssse3.c index 8372f43..6604764 100644 --- a/client/virt/deps/test_cpu_flags/ssse3.c +++ b/client/virt/deps/test_cpu_flags/ssse3.c @@ -8,17 +8,23 @@ #include "tests.h" #ifdef __SSSE3__ -void ssse3(){ +int ssse3(){ __ma128i v1; for (int i = 16;i >= 0; i--){ - v1.ui8[i] = -i; + v1.i8[i] = -i; } __ma128i vo; vo.i = _mm_abs_epi8(v1.i); - printf("[%d]\n", vo.ui8[4]); + if (abs(v1.i8[4]) == vo.i8[4]){ + return 0; + }else{ + printf("Correct: %d result: %d\n", abs(v1.i8[4]), vo.i8[4]); + return -1; + } } #else -void ssse3(){ +int ssse3(){ printf("SSSE3 is not supported."); + return 0; } #endif diff --git a/client/virt/deps/test_cpu_flags/stress.c b/client/virt/deps/test_cpu_flags/stress.c index cad505b..5b0a9ed 100644 --- a/client/virt/deps/test_cpu_flags/stress.c +++ b/client/virt/deps/test_cpu_flags/stress.c @@ -63,6 +63,12 @@ void stress(inst in) { pclmul(); if (in.rdrand) rdrand(); + if (in.fma4) + fma4(); + if (in.xop) + xop(); + if (in.sse4a) + sse4a(); } int r = rand()%size; diff --git a/client/virt/deps/test_cpu_flags/tests.h b/client/virt/deps/test_cpu_flags/tests.h index a009923..b581864 100644 --- a/client/virt/deps/test_cpu_flags/tests.h +++ b/client/virt/deps/test_cpu_flags/tests.h @@ -10,19 +10,26 @@ #include <stdio.h> #include <stdlib.h> -#include <immintrin.h> +//#include <immintrin.h> +#include <x86intrin.h> #include <stdint.h> #include <omp.h> +#include <float.h> +#include <math.h> + typedef struct{ int num_threads; char sse3; char ssse3; char sse4; + char sse4a; char avx; char aes; char pclmul; char rdrand; + char fma4; + char xop; } inst; typedef uint16_t auint16_t __attribute__ ((aligned(16))); @@ -30,7 +37,10 @@ typedef uint16_t auint16_t __attribute__ ((aligned(16))); typedef union __attribute__ ((aligned(16))){ __m128i i; uint64_t ui64[2]; + uint32_t ui32[4]; + uint16_t ui16[8]; uint8_t ui8[16]; + int8_t i8[16]; } __ma128i; typedef union __attribute__ ((aligned(32))){ @@ -40,14 +50,26 @@ typedef union __attribute__ ((aligned(32))){ double d64[2]; } __ma128f; -void aes(); -void pclmul(); -void rdrand(); +#ifdef __AVX__ +typedef union __attribute__ ((aligned(32))){ + __m256 f; + __m256d d; + float f32[8]; + double d64[4]; +} __ma256; +#endif + -void avx(); -void sse4(); -void sse3(); -void ssse3(); +int aes(); +int pclmul(); +int rdrand(); +int avx(); +int sse4(); +int sse4a(); +int sse3(); +int ssse3(); +int fma4(); +int xop(); void stress(inst in); diff --git a/client/virt/deps/test_cpu_flags/xop.c b/client/virt/deps/test_cpu_flags/xop.c new file mode 100644 index 0000000..ef01dde --- /dev/null +++ b/client/virt/deps/test_cpu_flags/xop.c @@ -0,0 +1,48 @@ +/* + * xop.c + * + * Created on: Nov 29, 2011 + * Author: jzupka + */ +#include "tests.h" + +#ifdef __XOP__ + +int xop(){ + __ma128i a, b, selector, d; + int i; + a.ui64[1] = 0xccccccccccccccccll; + a.ui64[0] = 0x8888888888888888ll; + b.ui64[1] = 0x3333333333333333ll; + b.ui64[0] = 0x7777777777777777ll; + selector.ui64[1] = 0xfedcba9876543210ll; + selector.ui64[0] = 0x0123456789abcdefll; + d.i = _mm_cmov_si128(a.i, b.i, selector.i); + printf("a: %016I64x %016I64x\n", + a.ui64[1], a.ui64[0]); + printf("b: %016I64x %016I64x\n", + b.ui64[1], b.ui64[0]); + printf("selector %016I64x %016I64x\n", + selector.ui64[1], selector.ui64[0]); + printf("result: %016I64x %016I64x\n", + d.ui64[1], d.ui64[0]); + + for (int i = 0; i < 4; i++) { + a.ui8[i] = -128; + a.ui8[i+4] = i-128; + a.ui8[i+8] = 10*i; + a.ui8[i+12] = 127; + } + d.i = _mm_haddd_epi8(a.i); + for (int i = 0; i < 4; i++) printf(" %d", d.ui32[i]); + printf("\n"); + return 0; +} + +#endif +#ifndef __XOP__ +int xop(){ + printf("XOP is not supported."); + return 0; +} +#endif -- 1.7.7.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html