Hi,
I'm working on RMCProfile ( http://www.rmcprofile.org ), trying to reduce its
run time, which is on the order of 3 CPU-years for my datasets. Profiling shows
that about 75% of the run time is spent executing an operation of the form
y(p:q) += a * x(r:s) [where q-p == s-r], i.e. DAXPY, but applied to slices of
the original y and x arrays.
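For concreteness, the same update expressed as a call to the reference BLAS
daxpy would look roughly like this (a sketch for illustration, not the actual
RMCProfile code; the subroutine name is mine, and passing array sections to an
external routine like this may trigger copy-in/copy-out):

subroutine slice_daxpy_blas (a, x, y, p, q, r)
  ! Sketch: y(p:q) += a * x(r:r+(q-p)) via BLAS daxpy (y := a*x + y).
  double precision :: a, x(:), y(:)
  integer :: p, q, r
  external daxpy
  call daxpy(q - p + 1, a, x(r:r+(q-p)), 1, y(p:q), 1)
end subroutine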
I've tried implementing this both as an explicit do loop and as a whole-array
slice assignment (the "array intrinsic" version below). The do-loop version is
significantly faster: on a reduced problem set taking ~1 hour, the overall run
time of the program is 7% lower. Looking at the generated assembly, the do loop
uses
incq %rax
cmpq %rcx, %rax
jne .L5
to increment and check the loop counter, whereas the array intrinsic uses
movq %rdx, %rdi
leaq 1(%rdi), %rdx
cmpq %rdi, %r8
jge .L13
The same pattern can be reproduced with much simpler source files; I've attached
the Fortran source and the assembly output.
NB: as far as I can tell, none of the rax, rdx or rdi registers above play any
part in the actual calculation; they function purely as loop counters.
I have two questions about the above behaviour:
1) Why does the array intrinsic method use an extra instruction compared to
the do loop? Is there any way of stopping this?
2) Is there any way of getting these loops to use packed SIMD instructions such
as vfmaddpd? Currently, even with -march=native switched on, the loop only uses
the scalar instruction vfmaddsd. I'd rather not have to hand-code an unrolled
loop, especially as I'm more used to C and Python, so there would probably be
off-by-one errors all over the place on my first ten tries.
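For what it's worth, the kind of workaround I can imagine (a sketch only - I
have not benchmarked it) is to push the slices through an assumed-size kernel so
the stride is known to be 1 inside the loop; whether that is enough to make the
vectoriser emit vfmaddpd is exactly what I'm unsure about:

subroutine slice_daxpy_contig (n, a, x, y)
  ! Sketch only, not benchmarked: assumed-size dummies guarantee unit
  ! stride inside the routine, so the loop is a plain contiguous DAXPY.
  integer :: n, i
  double precision :: a, x(*), y(*)
  do i = 1, n
    y(i) = y(i) + a * x(i)
  end do
end subroutine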
Thanks in advance,
Chris Kerr
Department of Chemistry
University of Cambridge
gfortran -v -save-temps -S -fverbose-asm -O3 -march=native slice_daxpy.f90 -o slice_daxpy_O3_native.s
Using built-in specs.
COLLECT_GCC=gfortran
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.6.1/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.6.1-9ubuntu3' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++,go --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-plugin --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.6.1 (Ubuntu/Linaro 4.6.1-9ubuntu3)
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-S' '-fverbose-asm' '-O3' '-march=native' '-o' 'slice_daxpy_O3_native.s'
/usr/lib/gcc/x86_64-linux-gnu/4.6.1/f951 slice_daxpy.f90 -march=bdver1 -mcx16 -msahf -mno-movbe -maes -mpclmul -mpopcnt -mabm -mlwp -mno-fma -mfma4 -mxop -mno-bmi -mno-tbm -mavx -msse4.2 -msse4.1 --param l1-cache-size=16 --param l1-cache-line-size=64 --param l2-cache-size=2048 -mtune=bdver1 -quiet -dumpbase slice_daxpy.f90 -auxbase-strip slice_daxpy_O3_native.s -O3 -version -fverbose-asm -o slice_daxpy_O3_native.s -fintrinsic-modules-path /usr/lib/gcc/x86_64-linux-gnu/4.6.1/finclude
GNU Fortran (Ubuntu/Linaro 4.6.1-9ubuntu3) version 4.6.1 (x86_64-linux-gnu)
compiled by GNU C version 4.6.1, GMP version 5.0.1, MPFR version 3.0.1-p3, MPC version 0.9
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
GNU Fortran (Ubuntu/Linaro 4.6.1-9ubuntu3) version 4.6.1 (x86_64-linux-gnu)
compiled by GNU C version 4.6.1, GMP version 5.0.1, MPFR version 3.0.1-p3, MPC version 0.9
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/4.6.1/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/:/usr/lib/gcc/x86_64-linux-gnu/
LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/4.6.1/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/../../../:/lib/:/usr/lib/
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-S' '-fverbose-asm' '-O3' '-march=native' '-o' 'slice_daxpy_O3_native.s'
module slice_daxpy
  implicit none
contains

  ! Array-section ("array intrinsic") version: a whole-slice assignment.
  subroutine slice_daxpy_arrayintrinsics (a, x, y, xstart, ystart, yend)
    integer :: xstart, ystart, yend
    double precision :: a, x(:), y(:)
    integer :: xend
    xend = xstart + (yend-ystart)
    y(ystart:yend) = y(ystart:yend) + a * x(xstart:xend)
  end subroutine

  ! Explicit do-loop version of the same update.
  subroutine slice_daxpy_do (a, x, y, xstart, ystart, yend)
    integer :: xstart, ystart, yend
    double precision :: a, x(:), y(:)
    integer :: xpos, ypos
    xpos = xstart
    do ypos = ystart, yend
      y(ypos) = y(ypos) + a * x(xpos)
      xpos = xpos + 1
    end do
  end subroutine

end module
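A minimal driver along these lines (a sketch, not the harness I actually timed
with) exercises both routines and checks that they agree:

program test_slice_daxpy
  ! Sketch of a test driver: apply the same slice update with both
  ! routines and check that the results match.
  use slice_daxpy
  implicit none
  integer, parameter :: n = 1000
  double precision :: x(n), y1(n), y2(n)
  call random_number(x)
  call random_number(y1)
  y2 = y1
  call slice_daxpy_arrayintrinsics(0.5d0, x, y1, 11, 101, 600)
  call slice_daxpy_do             (0.5d0, x, y2, 11, 101, 600)
  print *, 'max difference:', maxval(abs(y1 - y2))
end program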
.file "slice_daxpy.f90"
# GNU Fortran (Ubuntu/Linaro 4.6.1-9ubuntu3) version 4.6.1 (x86_64-linux-gnu)
# compiled by GNU C version 4.6.1, GMP version 5.0.1, MPFR version 3.0.1-p3, MPC version 0.9
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed: slice_daxpy.f90 -march=bdver1 -mcx16 -msahf -mno-movbe
# -maes -mpclmul -mpopcnt -mabm -mlwp -mno-fma -mfma4 -mxop -mno-bmi
# -mno-tbm -mavx -msse4.2 -msse4.1 --param l1-cache-size=16
# --param l1-cache-line-size=64 --param l2-cache-size=2048 -mtune=bdver1
# -auxbase-strip slice_daxpy_O3_native.s -O3 -fverbose-asm
# -fintrinsic-modules-path /usr/lib/gcc/x86_64-linux-gnu/4.6.1/finclude
# options enabled: -fasynchronous-unwind-tables -fauto-inc-dec
# -fbranch-count-reg -fcaller-saves -fcombine-stack-adjustments -fcommon
# -fcompare-elim -fcprop-registers -fcrossjumping -fcse-follow-jumps
# -fdefer-pop -fdelete-null-pointer-checks -fdevirtualize -fdwarf2-cfi-asm
# -fearly-inlining -feliminate-unused-debug-types -fexpensive-optimizations
# -fforward-propagate -ffunction-cse -fgcse -fgcse-after-reload -fgcse-lm
# -fguess-branch-probability -fident -fif-conversion -fif-conversion2
# -findirect-inlining -finline -finline-functions
# -finline-functions-called-once -finline-small-functions -fipa-cp
# -fipa-cp-clone -fipa-profile -fipa-pure-const -fipa-reference -fipa-sra
# -fira-share-save-slots -fira-share-spill-slots -fivopts
# -fkeep-static-consts -fleading-underscore -fmath-errno -fmerge-constants
# -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
# -foptimize-register-move -foptimize-sibling-calls -fpartial-inlining
# -fpeephole -fpeephole2 -fpredictive-commoning -fprefetch-loop-arrays
# -freg-struct-return -fregmove -freorder-blocks -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-insns2
# -fshow-column -fsigned-zeros -fsplit-ivs-in-unroller -fsplit-wide-types
# -fstrict-aliasing -fstrict-overflow -fstrict-volatile-bitfields
# -fthread-jumps -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp
# -ftree-builtin-call-dce -ftree-ccp -ftree-ch -ftree-copy-prop
# -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts
# -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns
# -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon
# -ftree-loop-optimize -ftree-parallelize-loops= -ftree-phiprop -ftree-pre
# -ftree-pta -ftree-reassoc -ftree-scev-cprop -ftree-sink
# -ftree-slp-vectorize -ftree-sra -ftree-switch-conversion -ftree-ter
# -ftree-vect-loop-version -ftree-vectorize -ftree-vrp -funit-at-a-time
# -funswitch-loops -funwind-tables -fvect-cost-model -fverbose-asm -fzee
# -fzero-initialized-in-bss -m128bit-long-double -m64 -m80387 -mabm
# -maccumulate-outgoing-args -maes -malign-stringops -mavx
# -mavx256-split-unaligned-store -mcx16 -mfancy-math-387 -mfma4
# -mfp-ret-in-387 -mglibc -mieee-fp -mlwp -mmmx -mpclmul -mpopcnt
# -mprefer-avx128 -mpush-args -mred-zone -msahf -msse -msse2 -msse3 -msse4
# -msse4.1 -msse4.2 -msse4a -mssse3 -mtls-direct-seg-refs -mvzeroupper
# -mxop
.text
.p2align 5,,31
.globl __slice_daxpy_MOD_slice_daxpy_do
.type __slice_daxpy_MOD_slice_daxpy_do, @function
__slice_daxpy_MOD_slice_daxpy_do:
.LFB0:
.cfi_startproc
pushq %rbx #
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
movq 24(%rdx), %r10 # y_5(D)->dim[0].stride, stride.5
movl $1, %r11d #, tmp155
movq 24(%rsi), %rax # x_22(D)->dim[0].stride, stride.1
movq (%rdx), %rbx # y_5(D)->data, y.0
movq (%rsi), %rdx # x_22(D)->data, x.0
movl (%r8), %esi # *ystart_41(D), ypos
movl (%rcx), %ecx # *xstart_39(D), xpos
testq %r10, %r10 # stride.5
cmove %r11, %r10 # stride.5,, tmp155, stride.5
testq %rax, %rax # stride.1
cmove %r11, %rax # stride.1,, tmp155, stride.1
movl (%r9), %r11d # *yend_43(D), D.1584
cmpl %r11d, %esi # D.1584, ypos
jg .L1 #,
vmovsd (%rdi), %xmm1 # *a_53(D), pretmp.53
movslq %ecx, %rcx # xpos, xpos
movslq %esi, %rdi # ypos, ypos
decq %rdi # tmp146
decq %rcx # tmp150
leaq 0(,%rax,8), %r8 #, D.1753
imulq %rax, %rcx # stride.1, tmp150
subl %esi, %r11d # ypos, tmp153
leaq 0(,%r10,8), %r9 #, D.1747
imulq %r10, %rdi # stride.5, tmp146
xorl %eax, %eax # ivtmp.61
leaq (%rdx,%rcx,8), %rdx #, ivtmp.68
leaq 1(%r11), %rcx #, D.1765
leaq (%rbx,%rdi,8), %rdi #, ivtmp.66
.p2align 5,,24
.p2align 3
.L5:
vmovsd (%rdx), %xmm2 # MEM[base: D.1759_116, offset: 0B],
incq %rax # ivtmp.61
addq %r8, %rdx # D.1753, ivtmp.68
vfmaddsd (%rdi), %xmm1, %xmm2, %xmm0 # MEM[base: D.1758_115, offset: 0B], pretmp.53,, tmp154
vmovsd %xmm0, (%rdi) # tmp154, MEM[base: D.1758_115, offset: 0B]
addq %r9, %rdi # D.1747, ivtmp.66
cmpq %rcx, %rax # D.1765, ivtmp.61
jne .L5 #,
.L1:
popq %rbx #
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size __slice_daxpy_MOD_slice_daxpy_do, .-__slice_daxpy_MOD_slice_daxpy_do
.p2align 5,,31
.globl __slice_daxpy_MOD_slice_daxpy_arrayintrinsics
.type __slice_daxpy_MOD_slice_daxpy_arrayintrinsics, @function
__slice_daxpy_MOD_slice_daxpy_arrayintrinsics:
.LFB1:
.cfi_startproc
pushq %rbx #
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
movq 24(%rdx), %r10 # y_4(D)->dim[0].stride, stride.13
movl $1, %r11d #, tmp134
movq 24(%rsi), %rax # x_21(D)->dim[0].stride, stride.9
movq (%rdx), %rbx # y_4(D)->data, y.0
movq (%rsi), %rdx # x_21(D)->data, x.0
movslq (%r8), %rsi # *ystart_42(D), S.16
movslq (%r9), %r8 # *yend_40(D), D.1611
vmovsd (%rdi), %xmm1 # *a_60(D), D.1618
testq %r10, %r10 # stride.13
cmove %r11, %r10 # stride.13,, tmp134, stride.13
testq %rax, %rax # stride.9
cmove %r11, %rax # stride.9,, tmp134, stride.9
movslq (%rcx), %r11 # *xstart_38(D), D.1803
cmpq %r8, %rsi # D.1611, S.16
jg .L8 #,
leaq 1(%rsi), %rdi #, D.1810
decq %rsi # tmp128
leaq 0(,%r10,8), %r9 #, D.1795
imulq %r10, %rsi # stride.13, tmp128
leaq -1(%r11), %r10 #, tmp131
imulq %rax, %r10 # stride.9, tmp131
leaq (%rbx,%rsi,8), %rcx #, ivtmp.93
leaq 0(,%rax,8), %rsi #, D.1801
leaq (%rdx,%r10,8), %rax #, ivtmp.95
jmp .L12 #
.p2align 5,,7
.p2align 3
.L13:
movq %rdx, %rdi # D.1810, D.1810
.L12:
vmovsd (%rax), %xmm2 # MEM[base: D.1808_127, offset: 0B],
addq %rsi, %rax # D.1801, ivtmp.95
leaq 1(%rdi), %rdx #, D.1810
vfmaddsd (%rcx), %xmm1, %xmm2, %xmm0 # MEM[base: D.1807_126, offset: 0B], D.1618,, tmp133
vmovsd %xmm0, (%rcx) # tmp133, MEM[base: D.1807_126, offset: 0B]
addq %r9, %rcx # D.1795, ivtmp.93
cmpq %rdi, %r8 # D.1810, D.1611
jge .L13 #,
.L8:
popq %rbx #
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE1:
.size __slice_daxpy_MOD_slice_daxpy_arrayintrinsics, .-__slice_daxpy_MOD_slice_daxpy_arrayintrinsics
.ident "GCC: (Ubuntu/Linaro 4.6.1-9ubuntu3) 4.6.1"
.section .note.GNU-stack,"",@progbits