Hello all, I recently learned that GCC has vector extensions, so I tried to use them to speed up the following code: for (; v2 < (x_dim - startp1) / STRIDE_LEN ; v2++) { double v; v = *(((double *) v25.valp) + (v1 + v25.startp[0] + y_dim) % y_dim * x_dim + v2); v += *(((double *) v34.valp) + (v1 + v34.startp[0] + y_dim) % y_dim * x_dim + v2); v += valp1[offset + v2 + startp1]; v += valp2[offset + v2 + startp2 - x_dim]; v *= (((int *)v19.valp)[(offset + v2)/32] >> (31 - (offset + v2) % 32)) & 1; v *= 0.25; v += ((double *) v21.valp)[offset + v2]; ((double *) v18.valp)[offset + v2] = v; ((double *) v23.valp)[offset + v2] = fabs(((double *) v20.valp)[offset + v2] - v); } I transformed it to: for (; v2 < (x_dim - startp1) / STRIDE_LEN ; v2 += 2) { register v2df_u v; register v2df_u t; v.v = __builtin_ia32_loadupd(((double *) v25.valp) + (v1 + v25.startp[0] + y_dim) % y_dim * x_dim + v2); t.v = __builtin_ia32_loadupd(((double *) v34.valp) + (v1 + v34.startp[0] + y_dim) % y_dim * x_dim + v2); v.v += t.v; t.v = __builtin_ia32_loadupd(&valp1[offset + startp1 + v2]); v.v += t.v; t.v = __builtin_ia32_loadupd(&valp2[offset + startp2 - x_dim + v2]); v.v += t.v; v.d[0] *= (((int *)v19.valp)[(offset + v2)/32] >> (31 - (offset + v2) % 32)) & 1; v.d[1] *= (((int *)v19.valp)[(offset + v2 + 1)/32] >> (31 - (offset + v2 + 1) % 32)) & 1; v.d[0] *= 0.25; v.d[1] *= 0.25; t.v = __builtin_ia32_loadupd(((double *) v21.valp) + offset + v2); v.v += t.v; __builtin_ia32_storeupd(((double *) v18.valp) + offset + v2, v.v); t.v = __builtin_ia32_loadupd(((double *) v20.valp) + offset + v2); t.v -= v.v; *(((double *) v23.valp) + offset + v2) = fabs(t.d[0]); *(((double *) v23.valp) + offset + v2 + 1) = fabs(t.d[1]); } I measured the performance but, surprisingly, the transformed code is even slower. Both versions were compiled with -O3 and run on a 2.4 GHz Intel Core 2. I don't understand how performance can get worse after using the vector extensions. 
I'm pretty sure the original code cannot be vectorized by GCC automatically. Is it because the code uses SSE and the FPU together, so that data has to be moved from the SSE registers to the FPU? Best regards, Zheng Da