hi, i'm trying to read 128-bits of unaligned data in one atomic move using 'movups' opcode. the protoype with _mm_loadu_ps was optimized out by the compiler. $ gcc46 hw_reg.c -Wall -c -O2 -m64 --save-temps hw_reg.c: In function 'read_mapped_register_128': hw_reg.c:19:2: warning: passing argument 1 of '_mm_loadu_ps' from incompatible pointer type [enabled by default] /opt/gcc46/lib/gcc/x86_64-unknown-linux-gnu/4.6.0/include/xmmintrin.h:904:1: note: expected 'const float *' but argument is of type 'const volatile __vector(4) float *' i've tested another idea with intermediate volatile value: static __m128 read_mapped_register_128( __m128 volatile const* address, ptrdiff_t index ) { __m128 volatile const* p = address + index; __m128 volatile const v = _mm_loadu_ps( p ); return v; } but it generates 3 moves while one is enough: unused_read_128_with_side_effects: salq $4, %rsi movups (%rdi,%rsi), %xmm0 movaps %xmm0, -24(%rsp) movaps -24(%rsp), %xmm0 ret is there a nice solution in C language for such (one-move) access? BR, Pawel.
#include <stddef.h> #include <xmmintrin.h> static unsigned read_mapped_register_32( unsigned volatile const* address, ptrdiff_t index ) { unsigned volatile const* p = address + index; return *p; } static __m64 read_mapped_register_64( __m64 volatile const* address, ptrdiff_t index ) { __m64 volatile const* p = address + index; return *p; } static __m128 read_mapped_register_128( __m128 volatile const* address, ptrdiff_t index ) { __m128 volatile const* p = address + index; return _mm_loadu_ps( p ); } void unused_read_32_with_side_effects( unsigned volatile const* address, ptrdiff_t index ) { read_mapped_register_32( address, index ); } void unused_read_64_with_side_effects( __m64 volatile const* address, ptrdiff_t index ) { read_mapped_register_64( address, index );} void unused_read_128_with_side_effects( __m128 volatile const* address, ptrdiff_t index ) { read_mapped_register_128( address, index ); }
.file "hw_reg.c" .text .p2align 4,,15 .globl unused_read_32_with_side_effects .type unused_read_32_with_side_effects, @function unused_read_32_with_side_effects: .LFB519: .cfi_startproc leaq (%rdi,%rsi,4), %rax movl (%rax), %eax ret .cfi_endproc .LFE519: .size unused_read_32_with_side_effects, .-unused_read_32_with_side_effects .p2align 4,,15 .globl unused_read_64_with_side_effects .type unused_read_64_with_side_effects, @function unused_read_64_with_side_effects: .LFB520: .cfi_startproc leaq (%rdi,%rsi,8), %rax movq (%rax), %rax ret .cfi_endproc .LFE520: .size unused_read_64_with_side_effects, .-unused_read_64_with_side_effects .p2align 4,,15 .globl unused_read_128_with_side_effects .type unused_read_128_with_side_effects, @function unused_read_128_with_side_effects: .LFB521: .cfi_startproc rep ret .cfi_endproc .LFE521: .size unused_read_128_with_side_effects, .-unused_read_128_with_side_effects .ident "GCC: (GNU) 4.6.0 20110122 (experimental)" .section .note.GNU-stack,"",@progbits