Hi,
I'm working on RMCProfile ( http://www.rmcprofile.org ), trying to reduce its
run time, which is on the order of 3 CPU-years for my datasets. Profiling shows
that about 75% of the run time is spent executing an operation of the form
y(p:q) += a * x(r:s) [where q-p == s-r], i.e. DAXPY, but applied to slices of
the original y and x arrays.
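For concreteness, the same update expressed as a call to the reference BLAS
daxpy would look roughly like this (a sketch for illustration, not the actual
RMCProfile code; the subroutine name is mine, and passing array sections to an
external routine like this may trigger copy-in/copy-out):

subroutine slice_daxpy_blas (a, x, y, p, q, r)
  ! Sketch: y(p:q) += a * x(r:r+(q-p)) via BLAS daxpy (y := a*x + y).
  double precision :: a, x(:), y(:)
  integer :: p, q, r
  external daxpy
  call daxpy(q - p + 1, a, x(r:r+(q-p)), 1, y(p:q), 1)
end subroutine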
I've tried implementing this both as an explicit do loop and as a whole-array
slice assignment (the "array intrinsic" version below). The do-loop version is
significantly faster: on a reduced problem set taking ~1 hour, the overall run
time of the program is 7% lower. Looking at the generated assembly, the do loop
uses
incq %rax
cmpq %rcx, %rax
jne .L5
to increment and check the loop counter, whereas the array intrinsic uses
movq %rdx, %rdi
leaq 1(%rdi), %rdx
cmpq %rdi, %r8
jge .L13
The same pattern can be reproduced with much simpler source files; I've attached
the Fortran source and the assembly output.
NB: as far as I can tell, none of the rax, rdx or rdi registers above play any
part in the actual calculation; they function purely as loop counters.
I have two questions about the above behaviour:
1) Why does the array intrinsic method use an extra instruction compared to
the do loop? Is there any way of stopping this?
2) Is there any way of getting these loops to use packed SIMD instructions such
as vfmaddpd? Currently, even with -march=native switched on, the loop only uses
the scalar instruction vfmaddsd. I'd rather not have to hand-code an unrolled
loop, especially as I'm more used to C and Python, so there would probably be
off-by-one errors all over the place on my first ten tries.
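For what it's worth, the kind of workaround I can imagine (a sketch only - I
have not benchmarked it) is to push the slices through an assumed-size kernel so
the stride is known to be 1 inside the loop; whether that is enough to make the
vectoriser emit vfmaddpd is exactly what I'm unsure about:

subroutine slice_daxpy_contig (n, a, x, y)
  ! Sketch only, not benchmarked: assumed-size dummies guarantee unit
  ! stride inside the routine, so the loop is a plain contiguous DAXPY.
  integer :: n, i
  double precision :: a, x(*), y(*)
  do i = 1, n
    y(i) = y(i) + a * x(i)
  end do
end subroutine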
Thanks in advance,
Chris Kerr
Department of Chemistry
University of Cambridge
gfortran -v -save-temps -S -fverbose-asm -O3 -march=native slice_daxpy.f90 -o slice_daxpy_O3_native.s
Using built-in specs.
COLLECT_GCC=gfortran
COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.6.1/lto-wrapper
Target: x86_64-linux-gnu
Configured with: ../src/configure -v --with-pkgversion='Ubuntu/Linaro 4.6.1-9ubuntu3' --with-bugurl=file:///usr/share/doc/gcc-4.6/README.Bugs --enable-languages=c,c++,fortran,objc,obj-c++,go --prefix=/usr --program-suffix=-4.6 --enable-shared --enable-linker-build-id --with-system-zlib --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.6 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-plugin --enable-objc-gc --disable-werror --with-arch-32=i686 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu
Thread model: posix
gcc version 4.6.1 (Ubuntu/Linaro 4.6.1-9ubuntu3)
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-S' '-fverbose-asm' '-O3' '-march=native' '-o' 'slice_daxpy_O3_native.s'
/usr/lib/gcc/x86_64-linux-gnu/4.6.1/f951 slice_daxpy.f90 -march=bdver1 -mcx16 -msahf -mno-movbe -maes -mpclmul -mpopcnt -mabm -mlwp -mno-fma -mfma4 -mxop -mno-bmi -mno-tbm -mavx -msse4.2 -msse4.1 --param l1-cache-size=16 --param l1-cache-line-size=64 --param l2-cache-size=2048 -mtune=bdver1 -quiet -dumpbase slice_daxpy.f90 -auxbase-strip slice_daxpy_O3_native.s -O3 -version -fverbose-asm -o slice_daxpy_O3_native.s -fintrinsic-modules-path /usr/lib/gcc/x86_64-linux-gnu/4.6.1/finclude
GNU Fortran (Ubuntu/Linaro 4.6.1-9ubuntu3) version 4.6.1 (x86_64-linux-gnu)
compiled by GNU C version 4.6.1, GMP version 5.0.1, MPFR version 3.0.1-p3, MPC version 0.9
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
GNU Fortran (Ubuntu/Linaro 4.6.1-9ubuntu3) version 4.6.1 (x86_64-linux-gnu)
compiled by GNU C version 4.6.1, GMP version 5.0.1, MPFR version 3.0.1-p3, MPC version 0.9
GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
COMPILER_PATH=/usr/lib/gcc/x86_64-linux-gnu/4.6.1/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/:/usr/lib/gcc/x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/:/usr/lib/gcc/x86_64-linux-gnu/
LIBRARY_PATH=/usr/lib/gcc/x86_64-linux-gnu/4.6.1/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/../../../x86_64-linux-gnu/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/../../../../lib/:/lib/x86_64-linux-gnu/:/lib/../lib/:/usr/lib/x86_64-linux-gnu/:/usr/lib/../lib/:/usr/lib/gcc/x86_64-linux-gnu/4.6.1/../../../:/lib/:/usr/lib/
COLLECT_GCC_OPTIONS='-v' '-save-temps' '-S' '-fverbose-asm' '-O3' '-march=native' '-o' 'slice_daxpy_O3_native.s'
module slice_daxpy
  implicit none
contains

  ! Array-section ("array intrinsic") version: a whole-slice assignment.
  subroutine slice_daxpy_arrayintrinsics (a, x, y, xstart, ystart, yend)
    integer :: xstart, ystart, yend
    double precision :: a, x(:), y(:)
    integer :: xend
    xend = xstart + (yend-ystart)
    y(ystart:yend) = y(ystart:yend) + a * x(xstart:xend)
  end subroutine

  ! Explicit do-loop version of the same update.
  subroutine slice_daxpy_do (a, x, y, xstart, ystart, yend)
    integer :: xstart, ystart, yend
    double precision :: a, x(:), y(:)
    integer :: xpos, ypos
    xpos = xstart
    do ypos = ystart, yend
      y(ypos) = y(ypos) + a * x(xpos)
      xpos = xpos + 1
    end do
  end subroutine

end module
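A minimal driver along these lines (a sketch, not the harness I actually timed
with) exercises both routines and checks that they agree:

program test_slice_daxpy
  ! Sketch of a test driver: apply the same slice update with both
  ! routines and check that the results match.
  use slice_daxpy
  implicit none
  integer, parameter :: n = 1000
  double precision :: x(n), y1(n), y2(n)
  call random_number(x)
  call random_number(y1)
  y2 = y1
  call slice_daxpy_arrayintrinsics(0.5d0, x, y1, 11, 101, 600)
  call slice_daxpy_do             (0.5d0, x, y2, 11, 101, 600)
  print *, 'max difference:', maxval(abs(y1 - y2))
end program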
.file "slice_daxpy.f90"
# GNU Fortran (Ubuntu/Linaro 4.6.1-9ubuntu3) version 4.6.1 (x86_64-linux-gnu)
# compiled by GNU C version 4.6.1, GMP version 5.0.1, MPFR version 3.0.1-p3, MPC version 0.9
# GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072
# options passed: slice_daxpy.f90 -march=bdver1 -mcx16 -msahf -mno-movbe
# -maes -mpclmul -mpopcnt -mabm -mlwp -mno-fma -mfma4 -mxop -mno-bmi
# -mno-tbm -mavx -msse4.2 -msse4.1 --param l1-cache-size=16
# --param l1-cache-line-size=64 --param l2-cache-size=2048 -mtune=bdver1
# -auxbase-strip slice_daxpy_O3_native.s -O3 -fverbose-asm
# -fintrinsic-modules-path /usr/lib/gcc/x86_64-linux-gnu/4.6.1/finclude
# options enabled: -fasynchronous-unwind-tables -fauto-inc-dec
# -fbranch-count-reg -fcaller-saves -fcombine-stack-adjustments -fcommon
# -fcompare-elim -fcprop-registers -fcrossjumping -fcse-follow-jumps
# -fdefer-pop -fdelete-null-pointer-checks -fdevirtualize -fdwarf2-cfi-asm
# -fearly-inlining -feliminate-unused-debug-types -fexpensive-optimizations
# -fforward-propagate -ffunction-cse -fgcse -fgcse-after-reload -fgcse-lm
# -fguess-branch-probability -fident -fif-conversion -fif-conversion2
# -findirect-inlining -finline -finline-functions
# -finline-functions-called-once -finline-small-functions -fipa-cp
# -fipa-cp-clone -fipa-profile -fipa-pure-const -fipa-reference -fipa-sra
# -fira-share-save-slots -fira-share-spill-slots -fivopts
# -fkeep-static-consts -fleading-underscore -fmath-errno -fmerge-constants
# -fmerge-debug-strings -fmove-loop-invariants -fomit-frame-pointer
# -foptimize-register-move -foptimize-sibling-calls -fpartial-inlining
# -fpeephole -fpeephole2 -fpredictive-commoning -fprefetch-loop-arrays
# -freg-struct-return -fregmove -freorder-blocks -freorder-functions
# -frerun-cse-after-loop -fsched-critical-path-heuristic
# -fsched-dep-count-heuristic -fsched-group-heuristic -fsched-interblock
# -fsched-last-insn-heuristic -fsched-rank-heuristic -fsched-spec
# -fsched-spec-insn-heuristic -fsched-stalled-insns-dep -fschedule-insns2
# -fshow-column -fsigned-zeros -fsplit-ivs-in-unroller -fsplit-wide-types
# -fstrict-aliasing -fstrict-overflow -fstrict-volatile-bitfields
# -fthread-jumps -ftoplevel-reorder -ftrapping-math -ftree-bit-ccp
# -ftree-builtin-call-dce -ftree-ccp -ftree-ch -ftree-copy-prop
# -ftree-copyrename -ftree-cselim -ftree-dce -ftree-dominator-opts
# -ftree-dse -ftree-forwprop -ftree-fre -ftree-loop-distribute-patterns
# -ftree-loop-if-convert -ftree-loop-im -ftree-loop-ivcanon
# -ftree-loop-optimize -ftree-parallelize-loops= -ftree-phiprop -ftree-pre
# -ftree-pta -ftree-reassoc -ftree-scev-cprop -ftree-sink
# -ftree-slp-vectorize -ftree-sra -ftree-switch-conversion -ftree-ter
# -ftree-vect-loop-version -ftree-vectorize -ftree-vrp -funit-at-a-time
# -funswitch-loops -funwind-tables -fvect-cost-model -fverbose-asm -fzee
# -fzero-initialized-in-bss -m128bit-long-double -m64 -m80387 -mabm
# -maccumulate-outgoing-args -maes -malign-stringops -mavx
# -mavx256-split-unaligned-store -mcx16 -mfancy-math-387 -mfma4
# -mfp-ret-in-387 -mglibc -mieee-fp -mlwp -mmmx -mpclmul -mpopcnt
# -mprefer-avx128 -mpush-args -mred-zone -msahf -msse -msse2 -msse3 -msse4
# -msse4.1 -msse4.2 -msse4a -mssse3 -mtls-direct-seg-refs -mvzeroupper
# -mxop
.text
.p2align 5,,31
.globl __slice_daxpy_MOD_slice_daxpy_do
.type __slice_daxpy_MOD_slice_daxpy_do, @function
__slice_daxpy_MOD_slice_daxpy_do:
.LFB0:
.cfi_startproc
pushq %rbx #
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
movq 24(%rdx), %r10 # y_5(D)->dim[0].stride, stride.5
movl $1, %r11d #, tmp155
movq 24(%rsi), %rax # x_22(D)->dim[0].stride, stride.1
movq (%rdx), %rbx # y_5(D)->data, y.0
movq (%rsi), %rdx # x_22(D)->data, x.0
movl (%r8), %esi # *ystart_41(D), ypos
movl (%rcx), %ecx # *xstart_39(D), xpos
testq %r10, %r10 # stride.5
cmove %r11, %r10 # stride.5,, tmp155, stride.5
testq %rax, %rax # stride.1
cmove %r11, %rax # stride.1,, tmp155, stride.1
movl (%r9), %r11d # *yend_43(D), D.1584
cmpl %r11d, %esi # D.1584, ypos
jg .L1 #,
vmovsd (%rdi), %xmm1 # *a_53(D), pretmp.53
movslq %ecx, %rcx # xpos, xpos
movslq %esi, %rdi # ypos, ypos
decq %rdi # tmp146
decq %rcx # tmp150
leaq 0(,%rax,8), %r8 #, D.1753
imulq %rax, %rcx # stride.1, tmp150
subl %esi, %r11d # ypos, tmp153
leaq 0(,%r10,8), %r9 #, D.1747
imulq %r10, %rdi # stride.5, tmp146
xorl %eax, %eax # ivtmp.61
leaq (%rdx,%rcx,8), %rdx #, ivtmp.68
leaq 1(%r11), %rcx #, D.1765
leaq (%rbx,%rdi,8), %rdi #, ivtmp.66
.p2align 5,,24
.p2align 3
.L5:
vmovsd (%rdx), %xmm2 # MEM[base: D.1759_116, offset: 0B],
incq %rax # ivtmp.61
addq %r8, %rdx # D.1753, ivtmp.68
vfmaddsd (%rdi), %xmm1, %xmm2, %xmm0 # MEM[base: D.1758_115, offset: 0B], pretmp.53,, tmp154
vmovsd %xmm0, (%rdi) # tmp154, MEM[base: D.1758_115, offset: 0B]
addq %r9, %rdi # D.1747, ivtmp.66
cmpq %rcx, %rax # D.1765, ivtmp.61
jne .L5 #,
.L1:
popq %rbx #
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size __slice_daxpy_MOD_slice_daxpy_do, .-__slice_daxpy_MOD_slice_daxpy_do
.p2align 5,,31
.globl __slice_daxpy_MOD_slice_daxpy_arrayintrinsics
.type __slice_daxpy_MOD_slice_daxpy_arrayintrinsics, @function
__slice_daxpy_MOD_slice_daxpy_arrayintrinsics:
.LFB1:
.cfi_startproc
pushq %rbx #
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
movq 24(%rdx), %r10 # y_4(D)->dim[0].stride, stride.13
movl $1, %r11d #, tmp134
movq 24(%rsi), %rax # x_21(D)->dim[0].stride, stride.9
movq (%rdx), %rbx # y_4(D)->data, y.0
movq (%rsi), %rdx # x_21(D)->data, x.0
movslq (%r8), %rsi # *ystart_42(D), S.16
movslq (%r9), %r8 # *yend_40(D), D.1611
vmovsd (%rdi), %xmm1 # *a_60(D), D.1618
testq %r10, %r10 # stride.13
cmove %r11, %r10 # stride.13,, tmp134, stride.13
testq %rax, %rax # stride.9
cmove %r11, %rax # stride.9,, tmp134, stride.9
movslq (%rcx), %r11 # *xstart_38(D), D.1803
cmpq %r8, %rsi # D.1611, S.16
jg .L8 #,
leaq 1(%rsi), %rdi #, D.1810
decq %rsi # tmp128
leaq 0(,%r10,8), %r9 #, D.1795
imulq %r10, %rsi # stride.13, tmp128
leaq -1(%r11), %r10 #, tmp131
imulq %rax, %r10 # stride.9, tmp131
leaq (%rbx,%rsi,8), %rcx #, ivtmp.93
leaq 0(,%rax,8), %rsi #, D.1801
leaq (%rdx,%r10,8), %rax #, ivtmp.95
jmp .L12 #
.p2align 5,,7
.p2align 3
.L13:
movq %rdx, %rdi # D.1810, D.1810
.L12:
vmovsd (%rax), %xmm2 # MEM[base: D.1808_127, offset: 0B],
addq %rsi, %rax # D.1801, ivtmp.95
leaq 1(%rdi), %rdx #, D.1810
vfmaddsd (%rcx), %xmm1, %xmm2, %xmm0 # MEM[base: D.1807_126, offset: 0B], D.1618,, tmp133
vmovsd %xmm0, (%rcx) # tmp133, MEM[base: D.1807_126, offset: 0B]
addq %r9, %rcx # D.1795, ivtmp.93
cmpq %rdi, %r8 # D.1810, D.1611
jge .L13 #,
.L8:
popq %rbx #
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE1:
.size __slice_daxpy_MOD_slice_daxpy_arrayintrinsics, .-__slice_daxpy_MOD_slice_daxpy_arrayintrinsics
.ident "GCC: (Ubuntu/Linaro 4.6.1-9ubuntu3) 4.6.1"
.section .note.GNU-stack,"",@progbits