On 05/02/2014 02:21 PM, Marc-André Lureau wrote: > > > ----- Original Message ----- >> Two benchmarks: >> 1. standalone, actually uses LIBCMT's memcpy to compare. >> 2. Part of qxldd.dll (in display/res.c called from display/driver.c) >> so using the same implementation. >> >> Note: next commit removes benchmark code as well as fast_memcpy_* code >> and related SSE check and FPU save/restore. > > Why not keep it in a "benchmark" branch, instead of adding and removing? Works for me. > >> Results for 32 bit windows 7 give ~2 times better performance for >> *system* memcpy. >> 64 bit windows 7 is ~2 times better for size < 128 KByte copies, and >> approximately the same for 128 KB <= size <= 1 MB >> >> More complete results: >> Note: for 64 bit with 262144 <= size <= 1048576 I got mixed results, >> depending on the order of performing the comparison - if memcpy was run >> first I got better results for fast_memcpy, by about 20%, for 64 bit >> only (for 32 bit memcpy/RtlCopyMemory was still faster). 
>> >> Windows 7 32 bit: >> size [bytes] our time/system time [percent] >> 1024 232 >> 2048 352 >> 4096 681 >> 8192 303 >> 16384 455 >> 32768 403 >> 65536 352 >> 131072 232 >> 262144 232 >> 524288 152 >> 1048576 177 >> >> Windows 7 64 bit: >> size [bytes] our time/system time [percent] >> 1024 140 >> 2048 204 >> 4096 200 >> 8192 198 >> 16384 232 >> 32768 173 >> 65536 272 >> 131072 177 >> 262144 134 >> 524288 115 >> 1048576 115 >> --- >> xddm/display/benchmark_memcpy.c | 152 >> +++++++++++++++++++++++++++++++++ >> xddm/display/driver.c | 4 + >> xddm/display/res.c | 57 +++++++++++++ >> xddm/tests/benchmark_format_results.py | 38 +++++++++ >> xddm/tests/build_benchmark.bat | 7 ++ >> 5 files changed, 258 insertions(+) >> create mode 100644 xddm/display/benchmark_memcpy.c >> create mode 100644 xddm/tests/benchmark_format_results.py >> create mode 100644 xddm/tests/build_benchmark.bat >> >> diff --git a/xddm/display/benchmark_memcpy.c >> b/xddm/display/benchmark_memcpy.c >> new file mode 100644 >> index 0000000..fa44577 >> --- /dev/null >> +++ b/xddm/display/benchmark_memcpy.c >> @@ -0,0 +1,152 @@ >> +#include <windows.h> >> +#include <stdio.h> >> + >> +#ifdef _WIN64 >> +int have_sse2 = 0; >> +void fast_memcpy_aligned(void *dest, const void *src, size_t len); >> +#else >> +static _inline void fast_memcpy_aligned(void *dest, const void *src, size_t >> len) >> +{ >> + _asm >> + { >> + mov ecx, len >> + mov esi, src >> + mov edi, dest >> + >> + cmp ecx, 128 >> + jb try_to_copy64 >> + >> + prefetchnta [esi] >> + copy_128: >> + prefetchnta [esi + 64] >> + >> + movdqa xmm0, [esi] >> + movdqa xmm1, [esi + 16] >> + movdqa xmm2, [esi + 32] >> + movdqa xmm3, [esi + 48] >> + >> + prefetchnta [esi + 128] >> + >> + movntdq [edi], xmm0 >> + movntdq [edi + 16], xmm1 >> + movntdq [edi + 32], xmm2 >> + movntdq [edi + 48], xmm3 >> + >> + movdqa xmm0, [esi + 64] >> + movdqa xmm1, [esi + 80] >> + movdqa xmm2, [esi + 96] >> + movdqa xmm3, [esi + 112] >> + >> + movntdq [edi + 64], xmm0 >> + 
movntdq [edi + 80], xmm1 >> + movntdq [edi + 96], xmm2 >> + movntdq [edi + 112], xmm3 >> + >> + add edi, 128 >> + add esi, 128 >> + sub ecx, 128 >> + cmp ecx, 128 >> + jae copy_128 >> + >> + try_to_copy64: >> + cmp ecx, 64 >> + jb try_to_copy32 >> + >> + movdqa xmm0, [esi] >> + movdqa xmm1, [esi + 16] >> + movdqa xmm2, [esi + 32] >> + movdqa xmm3, [esi + 48] >> + >> + movntdq [edi], xmm0 >> + movntdq [edi + 16], xmm1 >> + movntdq [edi + 32], xmm2 >> + movntdq [edi + 48], xmm3 >> + >> + add edi, 64 >> + add esi, 64 >> + sub ecx, 64 >> + prefetchnta [esi] >> + >> + try_to_copy32: >> + cmp ecx, 32 >> + jb try_to_copy16 >> + >> + movdqa xmm0, [esi] >> + movdqa xmm1, [esi + 16] >> + movntdq [edi], xmm0 >> + movntdq [edi + 16], xmm1 >> + >> + add edi, 32 >> + add esi, 32 >> + sub ecx, 32 >> + >> + try_to_copy16: >> + cmp ecx, 16 >> + jb try_to_copy4 >> + >> + movdqa xmm0, [esi] >> + movntdq [edi], xmm0 >> + >> + add edi, 16 >> + add esi, 16 >> + sub ecx, 16 >> + >> + >> + try_to_copy4: >> + cmp ecx, 4 >> + jb try_to_copy_1 >> + movsd >> + sub ecx, 4 >> + jmp try_to_copy4 >> + >> + try_to_copy_1: >> + rep movsb >> + >> + sfence >> + } >> +} >> +#endif >> + >> +typedef unsigned long long uint64_t; >> + >> +uint64_t time_usecs(void) >> +{ >> + SYSTEMTIME systime; >> + GetSystemTime(&systime); >> + return systime.wMilliseconds * 1000 + systime.wSecond * 1e6 + >> systime.wMinute * 60e6 + systime.wHour * 3600e6; >> +} >> + >> +int main(void) >> +{ >> + int i; >> + unsigned char *src_unaligned; >> + unsigned char *dest_unaligned; >> + uint64_t start, total1, total2; >> + unsigned char *src = NULL; >> + unsigned char *dest = NULL; >> + size_t size = 1024; >> + size_t iter = 1024 * 1024; >> + >> + printf("fast_memcpy compared to memcpy (< 1.0 means memcpy is better)\n"); >> + for (size = 1024; size < 1024*1024*2; size *= 2, iter /= 2) { >> + src_unaligned = malloc(size + 15); >> + dest_unaligned = malloc(size + 15); >> + src = (unsigned char *)((size_t)(src_unaligned + 15) & 
~0xf); >> + dest = (unsigned char *)((size_t)(dest_unaligned + 15) & ~0xf); >> + start = time_usecs(); >> + for (i = 0 ; i < iter ; ++i) >> + memcpy(dest, src, size); >> + total1 = time_usecs() - start; >> + >> + start = time_usecs(); >> + for (i = 0 ; i < iter ; ++i) >> + fast_memcpy_aligned(dest, src, size); >> + total2 = time_usecs() - start; >> + >> + printf("%d: %f (%d, ", size, ((float)total1) / total2, total1); >> + printf("%d)\n", total2); >> + free(src_unaligned); >> + free(dest_unaligned); >> + } >> + return 0; >> +} >> diff --git a/xddm/display/driver.c b/xddm/display/driver.c >> index 5a3dbfa..bed1d58 100644 >> --- a/xddm/display/driver.c >> +++ b/xddm/display/driver.c >> @@ -903,6 +903,8 @@ VOID EnableQXLPrimarySurface(PDev *pdev) >> pdev->surf_enable = TRUE; >> } >> >> +void benchmark_memcpy(PDev *pdev); >> + >> HSURF DrvEnableSurface(DHPDEV in_pdev) >> { >> PDev *pdev; >> @@ -941,6 +943,8 @@ HSURF DrvEnableSurface(DHPDEV in_pdev) >> >> EnableQXLPrimarySurface(pdev); >> >> + benchmark_memcpy(pdev); >> + >> DEBUG_PRINT((pdev, 1, "%s: 0x%lx exit\n", __FUNCTION__, pdev)); >> return surf; >> >> diff --git a/xddm/display/res.c b/xddm/display/res.c >> index 60e9bcb..589218b 100644 >> --- a/xddm/display/res.c >> +++ b/xddm/display/res.c >> @@ -1283,6 +1283,63 @@ static _inline void fast_memcpy_unaligment(void *dest, >> const void *src, size_t l >> >> #endif >> >> +uint64_t time_usecs(void) >> +{ >> + ENG_TIME_FIELDS systime; >> + EngQueryLocalTime(&systime); >> + return (uint64_t)(systime.usMilliseconds * 1000 + systime.usSecond * 1e6 + >> + systime.usMinute * 60e6 + systime.usHour * 3600e6); >> +} >> + >> +void benchmark_memcpy(PDev *pdev) >> +{ >> + size_t i; >> + unsigned char *src_unaligned; >> + unsigned char *dest_unaligned; >> + uint64_t start, total1, total2; >> + unsigned char *src = NULL; >> + unsigned char *dest = NULL; >> + size_t size = 1024; >> + size_t iter = 1024 * 1024; >> + >> + for (size = 1024; size < 1024*1024*2; size *= 2, iter /= 2) { 
>> + src_unaligned = EngAllocMem(0, size + 31, ALLOC_TAG); >> + dest_unaligned = EngAllocMem(0, size + 31, ALLOC_TAG); >> + src = (unsigned char *)((size_t)(src_unaligned + 31) & ~0x1f); >> + dest = (unsigned char *)((size_t)(dest_unaligned + 31) & ~0x1f); >> + >> + for (i = 0 ; i < size ; ++i) >> + src[i] = i; >> + >> + start = time_usecs(); >> + for (i = 0 ; i < iter ; ++i) { >> + fast_memcpy_aligned(dest, src, size); >> + } >> + total2 = time_usecs() - start; >> + >> + { >> + int errors = 0; >> + for (i = 0 ; i < size ; ++i) { >> + if (dest[i] != src[i]) { >> + errors++; >> + } >> + } >> + if (errors > 0) { >> + DEBUG_PRINT((pdev, 1, "!!! copy errors %d !!!\n", errors)); >> + } >> + } >> + >> + start = time_usecs(); >> + for (i = 0 ; i < iter ; ++i) >> + memcpy(dest, src, size); >> + total1 = time_usecs() - start; >> + >> + DEBUG_PRINT((pdev, 1, "%d: %lld, %lld\n", size, total1, total2)); >> + EngFreeMem(src_unaligned); >> + EngFreeMem(dest_unaligned); >> + } >> +} >> + >> #ifdef DBG >> #define PutBytesAlign __PutBytesAlign >> #define PutBytes(pdev, chunk, now, end, src, size, page_counter, alloc_size, >> use_sse)\ >> diff --git a/xddm/tests/benchmark_format_results.py >> b/xddm/tests/benchmark_format_results.py >> new file mode 100644 >> index 0000000..96d302b >> --- /dev/null >> +++ b/xddm/tests/benchmark_format_results.py >> @@ -0,0 +1,38 @@ >> +import sys >> + >> +win7_32="""qxl/guest-0: 96463384453: qxldd: 1024: 47000, 109000 >> +qxl/guest-0: 96591785177: qxldd: 2048: 31000, 109000 >> +qxl/guest-0: 96722899152: qxldd: 4096: 16000, 109000 >> +qxl/guest-0: 96851422238: qxldd: 8192: 31000, 94000 >> +qxl/guest-0: 97013842048: qxldd: 16384: 31000, 141000 >> +qxl/guest-0: 97167323122: qxldd: 32768: 31000, 125000 >> +qxl/guest-0: 97316872306: qxldd: 65536: 31000, 109000 >> +qxl/guest-0: 97465747407: qxldd: 131072: 47000, 109000 >> +qxl/guest-0: 97624668249: qxldd: 262144: 47000, 109000 >> +qxl/guest-0: 97785876639: qxldd: 524288: 62000, 94000 >> +qxl/guest-0: 
97953480643: qxldd: 1048576: 62000, 110000 >> +""" >> + >> +win7_64=""" >> +qxl/guest-0: 2278149101498: qxldd: 1024: 78000, 109000 >> +qxl/guest-0: 2278288271327: qxldd: 2048: 46000, 94000 >> +qxl/guest-0: 2278428135167: qxldd: 4096: 47000, 94000 >> +qxl/guest-0: 2278575078269: qxldd: 8192: 47000, 93000 >> +qxl/guest-0: 2278734906600: qxldd: 16384: 47000, 109000 >> +qxl/guest-0: 2278896881683: qxldd: 32768: 63000, 109000 >> +qxl/guest-0: 2279073699223: qxldd: 65536: 46000, 125000 >> +qxl/guest-0: 2279250403663: qxldd: 131072: 62000, 110000 >> +qxl/guest-0: 2279467314681: qxldd: 262144: 93000, 125000 >> +qxl/guest-0: 2279693375414: qxldd: 524288: 109000, 125000 >> +qxl/guest-0: 2279929972847: qxldd: 1048576: 109000, 125000 >> +""" >> + >> +filt = lambda txt: filt2(filt1(txt)) >> +filt2 = lambda data: [(s, system, ours*100.0/system) for t, s, system, ours >> in data] >> +filt1 = lambda txt: map(lambda a: (int(a[1][:-1]), int(a[3][:-1]), >> int(a[4][:-1]), int(a[5])), map(lambda l: l.strip().split(), [l for l in >> txt.split('\n') if l.strip() != ''])) >> +display = lambda txt: sys.stdout.write('\n'.join('%10s %10s' % (a, '%3.0f' % >> c) for a, b, c in filt(txt))+'\n') >> + >> +print('size [bytes]'.ljust(18) + 'our time/system time [percent]') >> +display(win7_32) >> +print('size [bytes]'.ljust(18) + 'our time/system time [percent]') >> +display(win7_64) >> diff --git a/xddm/tests/build_benchmark.bat b/xddm/tests/build_benchmark.bat >> new file mode 100644 >> index 0000000..a184249 >> --- /dev/null >> +++ b/xddm/tests/build_benchmark.bat >> @@ -0,0 +1,7 @@ >> +cl /Zi /nologo /c /I %CRT_INC_PATH% ..\display\benchmark_memcpy.c >> +if defined AMD64 ( >> +ml64 /c /Zd ..\display\amd64\x64.asm >> +link /nologo /debug /libpath:%BASEDIR%\lib\crt\amd64\ >> /libpath:%DDK_LIB_DEST%\amd64 x64.obj benchmark_memcpy.obj >> +) else ( >> +link /nologo /debug /libpath:%BASEDIR%\lib\crt\i386\ >> /libpath:%DDK_LIB_DEST%\i386 benchmark_memcpy.obj >> +) >> -- >> 1.9.0 >> >> 
_______________________________________________ >> Spice-devel mailing list >> Spice-devel@xxxxxxxxxxxxxxxxxxxxx >> http://lists.freedesktop.org/mailman/listinfo/spice-devel >> _______________________________________________ Spice-devel mailing list Spice-devel@xxxxxxxxxxxxxxxxxxxxx http://lists.freedesktop.org/mailman/listinfo/spice-devel