Thanks for your reply. Here is an example: #include<arm_neon.h> void neon_convert (uint8_t * __restrict dest, uint8_t * __restrict src, int n) { int i; uint8x8_t rfac = vdup_n_u8 (77); uint8x8_t gfac = vdup_n_u8 (151); uint8x8_t bfac = vdup_n_u8 (28); n/=8; for (i=0; i<n; i++) { uint16x8_t temp; uint8x8x3_t rgb = vld3_u8 (src); uint8x8x3_t rgb1 = vld3_u8 (src+2); uint8x8_t result; temp = vmull_u8 (rgb1.val[0], rfac); temp = vmlal_u8 (temp,rgb.val[1], gfac); temp = vmlal_u8 (temp,rgb.val[2], bfac); result = vshrn_n_u16 (temp, 8); vst1_u8 (dest, result); src += 8*3; dest += 8; } } The above code on compilation produces the following output on objdump: Disassembly of section .text.neon_convert: 00000000 <neon_convert>: 0: e92d 0ff0 stmdb sp!, {r4, r5, r6, r7, r8, r9, sl, fp} 4: f102 0a07 add.w sl, r2, #7 ; 0x7 8: ea32 0222 bics.w r2, r2, r2, asr #32 c: bf28 it cs e: 4652 movcs r2, sl 10: f04f 034d mov.w r3, #77 ; 0x4d 14: eec5 3b90 vdup.8 d21, r3 18: f06f 0368 mvn.w r3, #104 ; 0x68 1c: ea4f 0ae2 mov.w sl, r2, asr #3 20: eec4 3b90 vdup.8 d20, r3 24: f1ba 0f00 cmp.w sl, #0 ; 0x0 28: f04f 031c mov.w r3, #28 ; 0x1c 2c: b09a sub sp, #104 2e: 4604 mov r4, r0 30: 468c mov ip, r1 32: eec3 3b90 vdup.8 d19, r3 36: dd3b ble.n b0 <neon_convert+0xb0> 38: 2500 movs r5, #0 3a: af08 add r7, sp, #32 3c: f10d 0950 add.w r9, sp, #80 ; 0x50 40: f10d 0808 add.w r8, sp, #8 ; 0x8 44: f10d 0b38 add.w fp, sp, #56 ; 0x38 48: f96c 040f vld3.8 {d16-d18}, [ip] 4c: 463e mov r6, r7 4e: ecc7 0b06 vstmia r7, {d16-d18} 52: 3501 adds r5, #1 54: ce0f ldmia r6!, {r0, r1, r2, r3} 56: 9601 str r6, [sp, #4] 58: 464e mov r6, r9 5a: c60f stmia r6!, {r0, r1, r2, r3} 5c: 9a01 ldr r2, [sp, #4] 5e: f10c 0302 add.w r3, ip, #2 ; 0x2 62: f10c 0c18 add.w ip, ip, #24 ; 0x18 66: e892 0003 ldmia.w r2, {r0, r1} 6a: e886 0003 stmia.w r6, {r0, r1} 6e: f963 040f vld3.8 {d16-d18}, [r3] 72: ecc8 0b06 vstmia r8, {d16-d18} 76: 4646 mov r6, r8 78: eddd 2b16 vldr d18, [sp, #88] 7c: ce0f ldmia r6!, {r0, r1, r2, r3} 7e: 9601 str 
r6, [sp, #4] 80: 465e mov r6, fp 82: c60f stmia r6!, {r0, r1, r2, r3} 84: eddd 0b0e vldr d16, [sp, #56] 88: 9a01 ldr r2, [sp, #4] 8a: ffc0 0ca5 vmull.u8 q8, d16, d21 8e: ffc2 08a4 vmlal.u8 q8, d18, d20 92: eddd 2b18 vldr d18, [sp, #96] 96: e892 0003 ldmia.w r2, {r0, r1} 9a: ffc2 08a3 vmlal.u8 q8, d18, d19 9e: efc8 0830 vshrn.i16 d16, q8, #8 a2: e886 0003 stmia.w r6, {r0, r1} a6: f944 070f vst1.8 {d16}, [r4] aa: 3408 adds r4, #8 ac: 45aa cmp sl, r5 ae: dccb bgt.n 48 <neon_convert+0x48> b0: b01a add sp, #104 b2: e8bd 0ff0 ldmia.w sp!, {r4, r5, r6, r7, r8, r9, sl, fp} b6: 4770 bx lr On Fri, Jul 16, 2010 at 8:18 PM, Richard Earnshaw <rearnsha@xxxxxxx> wrote: > > > On Fri, 2010-07-09 at 16:59 +0530, Ajmal Ahammed wrote: > > Sir, > > > > My name is Ajmal A. I was compiling a program with neon > > intrinsics for arm cortex A-8. When I examined the output code, I > > observed that, among the available 32 d-registers of neon, the program > > uses only those from d16 (either if it's used as double word registers > > or quad word registers). And another problem is with the vld > > instructions. For vld instruction, it always uses the registers d16, > > d17, .... Even if there are free registers like d26,d27, etc it pushes > > the contents of the registers d16, d17,... to the stack. > > > > > > Without a testcase it is impossible to tell for sure what is happening. > Please post an example. > > > > I am using gcc 4.4.0, compiled using android-ndk-r4. 
> > > > For compiling I used the following command: > > /home/master/ajmal/ndk/build/ > > prebuilt/linux-x86/arm-eabi-4.4.0/bin/arm-eabi-gcc > > -I/home/master/ajmal/ndk/build/platforms/android-3/arch-arm/usr/include > > -fpic -mthumb-interwork -ffunction-sections -funwind-tables > > -fstack-protector -fno-short-enums -D__ARM_ARCH_5__ -D__ARM_ARCH_5T__ > > -D__ARM_ARCH_5E__ -D__ARM_ARCH_5TE__ -Wno-psabi -march=armv7-a > > -mfloat-abi=softfp -mthumb -Os -fomit-frame-pointer > > -fno-strict-aliasing -finline-limit=64 -mfpu=neon > > -I/home/master/ajmal/ndk/sources/cpufeatures > > -I/home/master/ajmal/ndk/samples/hello-neon/jni -DANDROID > > -DHAVE_NEON=1 -Wa,--noexecstack -O3 -DNDEBUG -g -c -MMD -MP -MF > > /home/master/ajmal/ndk/samples/hello-neon/bin/ndk/local/armeabi-v7a/objs/helloneon/$1.o.d > > /home/master/ajmal/ndk/samples/hello-neon/jni/$1.c -o > > /home/master/ajmal/ndk/samples/hello-neon/bin/ndk/local/armeabi-v7a/objs/helloneon/$1.o > > > > where $1 is the filename without the extension. > > > > I have tried the latest version of gcc (4.5.0) also, with no better results. > > > > Can you please tell me if there is a special reason for not using the > > other d registers, or for the stack operations? I have gone through > > the gcc manuals and some files in the gcc sources, but I couldn't resolve my > > doubt. > > > > (My language proficiency is not so good, and this is the first time I > > am writing an e-mail to such an authority. I am sorry for any > > mistakes due to this). > > > > Thank you in advance. > > Ajmal. A >