On Thu, Aug 09, 2012 at 04:22:04PM +0100, Jan Beulich wrote: > >>> On 09.08.12 at 17:03, "Kirill A. Shutemov" <kirill.shutemov@xxxxxxxxxxxxxxx> wrote: ... > > --- > > arch/x86/include/asm/page.h | 2 ++ > > arch/x86/include/asm/string_32.h | 5 +++++ > > arch/x86/include/asm/string_64.h | 5 +++++ > > arch/x86/lib/Makefile | 1 + > > arch/x86/lib/clear_page_nocache_32.S | 30 ++++++++++++++++++++++++++++++ > > arch/x86/lib/clear_page_nocache_64.S | 29 +++++++++++++++++++++++++++++ > > Couldn't this more reasonably go into clear_page_{32,64}.S? We don't have clear_page_32.S. > >+ xorl %eax,%eax > >+ movl $4096/64,%ecx > >+ .p2align 4 > >+.Lloop: > >+ decl %ecx > >+#define PUT(x) movnti %eax,x*8(%edi) ; movnti %eax,x*8+4(%edi) > > Is doing twice as much unrolling as on 64-bit really worth it? Moving 64 bytes per loop iteration is faster on Sandy Bridge, but slower on Westmere. Any preference? ;) Westmere: Performance counter stats for './test_unroll32' (20 runs): 31498.420608 task-clock # 0.998 CPUs utilized ( +- 0.25% ) 40 context-switches # 0.001 K/sec ( +- 1.40% ) 0 CPU-migrations # 0.000 K/sec ( +-100.00% ) 89 page-faults # 0.003 K/sec ( +- 0.13% ) 74,728,231,935 cycles # 2.372 GHz ( +- 0.25% ) [83.34%] 53,789,969,009 stalled-cycles-frontend # 71.98% frontend cycles idle ( +- 0.35% ) [83.33%] 41,681,014,054 stalled-cycles-backend # 55.78% backend cycles idle ( +- 0.43% ) [66.67%] 37,992,733,278 instructions # 0.51 insns per cycle # 1.42 stalled cycles per insn ( +- 0.05% ) [83.33%] 3,561,376,245 branches # 113.065 M/sec ( +- 0.05% ) [83.33%] 27,182,795 branch-misses # 0.76% of all branches ( +- 0.06% ) [83.33%] 31.558545812 seconds time elapsed ( +- 0.25% ) Performance counter stats for './test_unroll64' (20 runs): 31564.753623 task-clock # 0.998 CPUs utilized ( +- 0.19% ) 39 context-switches # 0.001 K/sec ( +- 0.40% ) 0 CPU-migrations # 0.000 K/sec 90 page-faults # 0.003 K/sec ( +- 0.12% ) 74,886,045,192 cycles # 2.372 GHz ( +- 0.19% ) [83.33%] 57,477,323,995 
stalled-cycles-frontend # 76.75% frontend cycles idle ( +- 0.26% ) [83.34%] 44,548,142,150 stalled-cycles-backend # 59.49% backend cycles idle ( +- 0.31% ) [66.67%] 32,940,027,099 instructions # 0.44 insns per cycle # 1.74 stalled cycles per insn ( +- 0.05% ) [83.34%] 1,884,944,093 branches # 59.717 M/sec ( +- 0.05% ) [83.32%] 1,027,135 branch-misses # 0.05% of all branches ( +- 0.56% ) [83.34%] 31.621001407 seconds time elapsed ( +- 0.19% ) Sandy Bridge: Performance counter stats for './test_unroll32' (20 runs): 8578.382891 task-clock # 0.997 CPUs utilized ( +- 0.08% ) 15 context-switches # 0.000 M/sec ( +- 2.97% ) 0 CPU-migrations # 0.000 M/sec 84 page-faults # 0.000 M/sec ( +- 0.13% ) 29,154,476,597 cycles # 3.399 GHz ( +- 0.08% ) [83.33%] 11,851,215,147 stalled-cycles-frontend # 40.65% frontend cycles idle ( +- 0.20% ) [83.33%] 1,530,172,593 stalled-cycles-backend # 5.25% backend cycles idle ( +- 1.44% ) [66.67%] 37,915,778,094 instructions # 1.30 insns per cycle # 0.31 stalled cycles per insn ( +- 0.00% ) [83.34%] 3,590,533,447 branches # 418.556 M/sec ( +- 0.01% ) [83.35%] 26,500,765 branch-misses # 0.74% of all branches ( +- 0.01% ) [83.34%] 8.604638449 seconds time elapsed ( +- 0.08% ) Performance counter stats for './test_unroll64' (20 runs): 8463.789963 task-clock # 0.997 CPUs utilized ( +- 0.07% ) 14 context-switches # 0.000 M/sec ( +- 1.70% ) 0 CPU-migrations # 0.000 M/sec ( +-100.00% ) 85 page-faults # 0.000 M/sec ( +- 0.12% ) 28,763,328,688 cycles # 3.398 GHz ( +- 0.07% ) [83.32%] 13,517,462,952 stalled-cycles-frontend # 47.00% frontend cycles idle ( +- 0.14% ) [83.33%] 1,356,208,859 stalled-cycles-backend # 4.72% backend cycles idle ( +- 1.42% ) [66.68%] 32,885,492,141 instructions # 1.14 insns per cycle # 0.41 stalled cycles per insn ( +- 0.00% ) [83.34%] 1,912,094,072 branches # 225.915 M/sec ( +- 0.02% ) [83.34%] 305,896 branch-misses # 0.02% of all branches ( +- 1.05% ) [83.33%] 8.488304839 seconds time elapsed ( +- 0.07% ) $ cat test.c #include 
<stdio.h> #include <sys/mman.h> #define SIZE 1024*1024*1024 void clear_page_nocache_sse2(void *page) __attribute__((regparm(1))); int main(int argc, char** argv) { char *p; unsigned long i, j; p = mmap(NULL, SIZE, PROT_WRITE|PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); for(j = 0; j < 100; j++) { for(i = 0; i < SIZE; i += 4096) { clear_page_nocache_sse2(p + i); } } return 0; } $ cat clear_page_nocache_unroll32.S .globl clear_page_nocache_sse2 .align 4,0x90 clear_page_nocache_sse2: .cfi_startproc mov %eax,%edx xorl %eax,%eax movl $4096/32,%ecx .p2align 4 .Lloop_sse2: decl %ecx #define PUT(x) movnti %eax,x*4(%edx) PUT(0) PUT(1) PUT(2) PUT(3) PUT(4) PUT(5) PUT(6) PUT(7) #undef PUT lea 32(%edx),%edx jnz .Lloop_sse2 nop ret .cfi_endproc .type clear_page_nocache_sse2, @function .size clear_page_nocache_sse2, .-clear_page_nocache_sse2 $ cat clear_page_nocache_unroll64.S .globl clear_page_nocache_sse2 .align 4,0x90 clear_page_nocache_sse2: .cfi_startproc mov %eax,%edx xorl %eax,%eax movl $4096/64,%ecx .p2align 4 .Lloop_sse2: decl %ecx #define PUT(x) movnti %eax,x*8(%edx) ; movnti %eax,x*8+4(%edx) PUT(0) PUT(1) PUT(2) PUT(3) PUT(4) PUT(5) PUT(6) PUT(7) #undef PUT lea 64(%edx),%edx jnz .Lloop_sse2 nop ret .cfi_endproc .type clear_page_nocache_sse2, @function .size clear_page_nocache_sse2, .-clear_page_nocache_sse2 -- Kirill A. Shutemov
Attachment:
signature.asc
Description: Digital signature