Re: [PATCH 11/12] add benchmarks for memcpy (RtlCopyMemory) vs fast - fast is slower

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 05/02/2014 02:21 PM, Marc-André Lureau wrote:
> 
> 
> ----- Original Message -----
>> Two benchmarks:
>> 1. standalone, actually uses LIBCMT's memcpy to compare.
>> 2. Part of qxldd.dll (in display/res.c called from display/driver.c)
>> so using the same implementation.
>>
>> Note: next commit removes benchmark code as well as fast_memcpy_* code
>> and related SSE check and FPU save/restore.
> 
> Why not keep it in a "benchmark" branch, instead of adding and removing?

works for me.

> 
>> Results for 32 bit Windows 7 give ~2 times better performance for
>> *system* memcpy.
>> On 64 bit Windows 7 the system memcpy is ~2 times faster for copies of
>> size < 128 KByte, and approximately the same for 128 KB <= size <= 1 MB.
>>
>> More complete results:
>> Note: for 64 bit with 262144 <= size <= 1048576 I got mixed results,
>> depending on the order of performing the comparison - if memcpy was run
>> first I got better results for fast_memcpy, by about 20%, for 64 bit
>> only (for 32 bit memcpy/RtlCopyMemory was still faster).
>>
>> Windows 7 32 bit:
>> size [bytes]      our time/system time [percent]
>>       1024        232
>>       2048        352
>>       4096        681
>>       8192        303
>>      16384        455
>>      32768        403
>>      65536        352
>>     131072        232
>>     262144        232
>>     524288        152
>>    1048576        177
>>
>> Windows 7 64 bit:
>> size [bytes]      our time/system time [percent]
>>       1024        140
>>       2048        204
>>       4096        200
>>       8192        198
>>      16384        232
>>      32768        173
>>      65536        272
>>     131072        177
>>     262144        134
>>     524288        115
>>    1048576        115
>> ---
>>  xddm/display/benchmark_memcpy.c        | 152
>>  +++++++++++++++++++++++++++++++++
>>  xddm/display/driver.c                  |   4 +
>>  xddm/display/res.c                     |  57 +++++++++++++
>>  xddm/tests/benchmark_format_results.py |  38 +++++++++
>>  xddm/tests/build_benchmark.bat         |   7 ++
>>  5 files changed, 258 insertions(+)
>>  create mode 100644 xddm/display/benchmark_memcpy.c
>>  create mode 100644 xddm/tests/benchmark_format_results.py
>>  create mode 100644 xddm/tests/build_benchmark.bat
>>
>> diff --git a/xddm/display/benchmark_memcpy.c
>> b/xddm/display/benchmark_memcpy.c
>> new file mode 100644
>> index 0000000..fa44577
>> --- /dev/null
>> +++ b/xddm/display/benchmark_memcpy.c
>> @@ -0,0 +1,152 @@
>> +#include <windows.h>
>> +#include <stdio.h>
>> +
>> +#ifdef _WIN64
>> +int have_sse2 = 0;
>> +void fast_memcpy_aligned(void *dest, const void *src, size_t len);
>> +#else
>> +static _inline void fast_memcpy_aligned(void *dest, const void *src, size_t
>> len)
>> +{
>> +    _asm
>> +    {
>> +        mov ecx, len
>> +        mov esi, src
>> +        mov edi, dest
>> +
>> +        cmp ecx, 128
>> +        jb try_to_copy64
>> +
>> +        prefetchnta [esi]
>> +        copy_128:
>> +            prefetchnta [esi + 64]
>> +
>> +            movdqa xmm0, [esi]
>> +            movdqa xmm1, [esi + 16]
>> +            movdqa xmm2, [esi + 32]
>> +            movdqa xmm3, [esi + 48]
>> +
>> +            prefetchnta [esi + 128]
>> +
>> +            movntdq [edi], xmm0
>> +            movntdq [edi + 16], xmm1
>> +            movntdq [edi + 32], xmm2
>> +            movntdq [edi + 48], xmm3
>> +
>> +            movdqa xmm0, [esi + 64]
>> +            movdqa xmm1, [esi + 80]
>> +            movdqa xmm2, [esi + 96]
>> +            movdqa xmm3, [esi + 112]
>> +
>> +            movntdq [edi + 64], xmm0
>> +            movntdq [edi + 80], xmm1
>> +            movntdq [edi + 96], xmm2
>> +            movntdq [edi + 112], xmm3
>> +
>> +            add edi, 128
>> +            add esi, 128
>> +            sub ecx, 128
>> +            cmp ecx, 128
>> +            jae copy_128
>> +
>> +       try_to_copy64:
>> +            cmp ecx, 64
>> +            jb try_to_copy32
>> +
>> +             movdqa xmm0, [esi]
>> +             movdqa xmm1, [esi + 16]
>> +             movdqa xmm2, [esi + 32]
>> +             movdqa xmm3, [esi + 48]
>> +
>> +             movntdq [edi], xmm0
>> +             movntdq [edi + 16], xmm1
>> +             movntdq [edi + 32], xmm2
>> +             movntdq [edi + 48], xmm3
>> +
>> +             add edi, 64
>> +             add esi, 64
>> +             sub ecx, 64
>> +             prefetchnta [esi]
>> +
>> +        try_to_copy32:
>> +             cmp ecx, 32
>> +             jb try_to_copy16
>> +
>> +             movdqa xmm0, [esi]
>> +             movdqa xmm1, [esi + 16]
>> +             movntdq [edi], xmm0
>> +             movntdq [edi + 16], xmm1
>> +
>> +             add edi, 32
>> +             add esi, 32
>> +             sub ecx, 32
>> +
>> +        try_to_copy16:
>> +             cmp ecx, 16
>> +             jb try_to_copy4
>> +
>> +             movdqa xmm0, [esi]
>> +             movntdq [edi], xmm0
>> +
>> +             add edi, 16
>> +             add esi, 16
>> +             sub ecx, 16
>> +
>> +
>> +        try_to_copy4:
>> +            cmp ecx, 4
>> +            jb try_to_copy_1
>> +            movsd
>> +            sub ecx, 4
>> +            jmp try_to_copy4
>> +
>> +        try_to_copy_1:
>> +            rep movsb
>> +
>> +        sfence
>> +    }
>> +}
>> +#endif
>> +
>> +typedef unsigned long long uint64_t;
>> +
>> +uint64_t time_usecs(void)
>> +{
>> +  SYSTEMTIME systime;
>> +  GetSystemTime(&systime);
>> +  return systime.wMilliseconds * 1000 + systime.wSecond * 1e6 +
>> systime.wMinute * 60e6 + systime.wHour * 3600e6;
>> +}
>> +
>> +int main(void)
>> +{
>> +  int i;
>> +  unsigned char *src_unaligned;
>> +  unsigned char *dest_unaligned;
>> +  uint64_t start, total1, total2;
>> +  unsigned char *src = NULL;
>> +  unsigned char *dest = NULL;
>> +  size_t size = 1024;
>> +  size_t iter = 1024 * 1024;
>> +
>> +  printf("fast_memcpy compared to memcpy (< 1.0 means memcpy is better)\n");
>> +  for (size = 1024; size < 1024*1024*2; size *= 2, iter /= 2) {
>> +    src_unaligned = malloc(size + 15);
>> +    dest_unaligned = malloc(size + 15);
>> +    src = (unsigned char *)((size_t)(src_unaligned + 15) & ~0xf);
>> +    dest = (unsigned char *)((size_t)(dest_unaligned + 15) & ~0xf);
>> +    start = time_usecs();
>> +    for (i = 0 ; i < iter ; ++i)
>> +      memcpy(dest, src, size);
>> +    total1 = time_usecs() - start;
>> +
>> +    start = time_usecs();
>> +    for (i = 0 ; i < iter ; ++i)
>> +      fast_memcpy_aligned(dest, src, size);
>> +    total2 = time_usecs() - start;
>> +
>> +    printf("%d: %f (%d, ", size, ((float)total1) / total2, total1);
>> +    printf("%d)\n", total2);
>> +    free(src_unaligned);
>> +    free(dest_unaligned);
>> +  }
>> +  return 0;
>> +}
>> diff --git a/xddm/display/driver.c b/xddm/display/driver.c
>> index 5a3dbfa..bed1d58 100644
>> --- a/xddm/display/driver.c
>> +++ b/xddm/display/driver.c
>> @@ -903,6 +903,8 @@ VOID EnableQXLPrimarySurface(PDev *pdev)
>>      pdev->surf_enable = TRUE;
>>  }
>>  
>> +void benchmark_memcpy(PDev *pdev);
>> +
>>  HSURF DrvEnableSurface(DHPDEV in_pdev)
>>  {
>>      PDev *pdev;
>> @@ -941,6 +943,8 @@ HSURF DrvEnableSurface(DHPDEV in_pdev)
>>  
>>      EnableQXLPrimarySurface(pdev);
>>  
>> +    benchmark_memcpy(pdev);
>> +
>>      DEBUG_PRINT((pdev, 1, "%s: 0x%lx exit\n", __FUNCTION__, pdev));
>>      return surf;
>>  
>> diff --git a/xddm/display/res.c b/xddm/display/res.c
>> index 60e9bcb..589218b 100644
>> --- a/xddm/display/res.c
>> +++ b/xddm/display/res.c
>> @@ -1283,6 +1283,63 @@ static _inline void fast_memcpy_unaligment(void *dest,
>> const void *src, size_t l
>>  
>>  #endif
>>  
>> +uint64_t time_usecs(void)
>> +{
>> +  ENG_TIME_FIELDS systime;
>> +  EngQueryLocalTime(&systime);
>> +  return (uint64_t)(systime.usMilliseconds * 1000 + systime.usSecond * 1e6 +
>> +                     systime.usMinute * 60e6 + systime.usHour * 3600e6);
>> +}
>> +
>> +void benchmark_memcpy(PDev *pdev)
>> +{
>> +  size_t i;
>> +  unsigned char *src_unaligned;
>> +  unsigned char *dest_unaligned;
>> +  uint64_t start, total1, total2;
>> +  unsigned char *src = NULL;
>> +  unsigned char *dest = NULL;
>> +  size_t size = 1024;
>> +  size_t iter = 1024 * 1024;
>> +
>> +  for (size = 1024; size < 1024*1024*2; size *= 2, iter /= 2) {
>> +    src_unaligned = EngAllocMem(0, size + 31, ALLOC_TAG);
>> +    dest_unaligned = EngAllocMem(0, size + 31, ALLOC_TAG);
>> +    src = (unsigned char *)((size_t)(src_unaligned + 31) & ~0x1f);
>> +    dest = (unsigned char *)((size_t)(dest_unaligned + 31) & ~0x1f);
>> +
>> +    for (i = 0 ; i < size ; ++i)
>> +      src[i] = i;
>> +
>> +    start = time_usecs();
>> +    for (i = 0 ; i < iter ; ++i) {
>> +      fast_memcpy_aligned(dest, src, size);
>> +    }
>> +    total2 = time_usecs() - start;
>> +
>> +    {
>> +      int errors = 0;
>> +      for (i = 0 ; i < size ; ++i) {
>> +        if (dest[i] != src[i]) {
>> +          errors++;
>> +        }
>> +      }
>> +      if (errors > 0) {
>> +        DEBUG_PRINT((pdev, 1, "!!! copy errors %d !!!\n", errors));
>> +      }
>> +    }
>> +
>> +    start = time_usecs();
>> +    for (i = 0 ; i < iter ; ++i)
>> +      memcpy(dest, src, size);
>> +    total1 = time_usecs() - start;
>> +
>> +    DEBUG_PRINT((pdev, 1, "%d: %lld, %lld\n", size, total1, total2));
>> +    EngFreeMem(src_unaligned);
>> +    EngFreeMem(dest_unaligned);
>> +  }
>> +}
>> +
>>  #ifdef DBG
>>      #define PutBytesAlign __PutBytesAlign
>>  #define PutBytes(pdev, chunk, now, end, src, size, page_counter, alloc_size,
>>  use_sse)\
>> diff --git a/xddm/tests/benchmark_format_results.py
>> b/xddm/tests/benchmark_format_results.py
>> new file mode 100644
>> index 0000000..96d302b
>> --- /dev/null
>> +++ b/xddm/tests/benchmark_format_results.py
>> @@ -0,0 +1,38 @@
>> +import sys
>> +
>> +win7_32="""qxl/guest-0: 96463384453: qxldd: 1024: 47000, 109000
>> +qxl/guest-0: 96591785177: qxldd: 2048: 31000, 109000
>> +qxl/guest-0: 96722899152: qxldd: 4096: 16000, 109000
>> +qxl/guest-0: 96851422238: qxldd: 8192: 31000, 94000
>> +qxl/guest-0: 97013842048: qxldd: 16384: 31000, 141000
>> +qxl/guest-0: 97167323122: qxldd: 32768: 31000, 125000
>> +qxl/guest-0: 97316872306: qxldd: 65536: 31000, 109000
>> +qxl/guest-0: 97465747407: qxldd: 131072: 47000, 109000
>> +qxl/guest-0: 97624668249: qxldd: 262144: 47000, 109000
>> +qxl/guest-0: 97785876639: qxldd: 524288: 62000, 94000
>> +qxl/guest-0: 97953480643: qxldd: 1048576: 62000, 110000
>> +"""
>> +
>> +win7_64="""
>> +qxl/guest-0: 2278149101498: qxldd: 1024: 78000, 109000
>> +qxl/guest-0: 2278288271327: qxldd: 2048: 46000, 94000
>> +qxl/guest-0: 2278428135167: qxldd: 4096: 47000, 94000
>> +qxl/guest-0: 2278575078269: qxldd: 8192: 47000, 93000
>> +qxl/guest-0: 2278734906600: qxldd: 16384: 47000, 109000
>> +qxl/guest-0: 2278896881683: qxldd: 32768: 63000, 109000
>> +qxl/guest-0: 2279073699223: qxldd: 65536: 46000, 125000
>> +qxl/guest-0: 2279250403663: qxldd: 131072: 62000, 110000
>> +qxl/guest-0: 2279467314681: qxldd: 262144: 93000, 125000
>> +qxl/guest-0: 2279693375414: qxldd: 524288: 109000, 125000
>> +qxl/guest-0: 2279929972847: qxldd: 1048576: 109000, 125000
>> +"""
>> +
>> +filt = lambda txt: filt2(filt1(txt))
>> +filt2 = lambda data: [(s, system, ours*100.0/system) for t, s, system, ours
>> in data]
>> +filt1 = lambda txt: map(lambda a: (int(a[1][:-1]), int(a[3][:-1]),
>> int(a[4][:-1]), int(a[5])), map(lambda l: l.strip().split(), [l for l in
>> txt.split('\n') if l.strip() != '']))
>> +display = lambda txt: sys.stdout.write('\n'.join('%10s %10s' % (a, '%3.0f' %
>> c) for a, b, c in filt(txt))+'\n')
>> +
>> +print('size [bytes]'.ljust(18) + 'our time/system time [percent]')
>> +display(win7_32)
>> +print('size [bytes]'.ljust(18) + 'our time/system time [percent]')
>> +display(win7_64)
>> diff --git a/xddm/tests/build_benchmark.bat b/xddm/tests/build_benchmark.bat
>> new file mode 100644
>> index 0000000..a184249
>> --- /dev/null
>> +++ b/xddm/tests/build_benchmark.bat
>> @@ -0,0 +1,7 @@
>> +cl /Zi /nologo /c /I %CRT_INC_PATH% ..\display\benchmark_memcpy.c
>> +if defined AMD64 (
>> +ml64 /c /Zd ..\display\amd64\x64.asm
>> +link /nologo /debug /libpath:%BASEDIR%\lib\crt\amd64\
>> /libpath:%DDK_LIB_DEST%\amd64 x64.obj benchmark_memcpy.obj
>> +) else (
>> +link /nologo /debug /libpath:%BASEDIR%\lib\crt\i386\
>> /libpath:%DDK_LIB_DEST%\i386 benchmark_memcpy.obj
>> +)
>> --
>> 1.9.0
>>
>> _______________________________________________
>> Spice-devel mailing list
>> Spice-devel@xxxxxxxxxxxxxxxxxxxxx
>> http://lists.freedesktop.org/mailman/listinfo/spice-devel
>>

_______________________________________________
Spice-devel mailing list
Spice-devel@xxxxxxxxxxxxxxxxxxxxx
http://lists.freedesktop.org/mailman/listinfo/spice-devel





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Announce]     [Security]     [Bugtraq]     [Linux]     [Linux OMAP]     [Linux MIPS]     [ECOS]     [Asterisk Internet PBX]     [Linux API]     [Monitors]