using vector extension in gcc slows down my code

Da Zheng <zhengda1936@xxxxxxxxx> · Wed, 10 Feb 2010 11:54:21 +0800

Hello all,

I recently learned that GCC has vector extensions, so I tried to use them
to speed up the following code:
/* Scalar version: computes one double per iteration, indexed by v2.
   v2 is not initialised here; the loop continues from its current value. */
for (; v2 < (x_dim - startp1) / STRIDE_LEN ; v2++)
{
        double v;

        /* Start from the v25 element at index
           ((v1 + v25.startp[0] + y_dim) % y_dim) * x_dim + v2.
           % and * have equal precedence and associate left to right, so the
           modulo is applied before the multiplication by x_dim. */
        v = *(((double *) v25.valp) + (v1 + v25.startp[0] + y_dim) % y_dim *
x_dim + v2);
        /* Add the v34 element, addressed the same way with v34's own startp[0]. */
        v += *(((double *) v34.valp) + (v1 + v34.startp[0] + y_dim) % y_dim *
x_dim + v2);
        /* Add one element from each of valp1 and valp2; the valp2 index is
           shifted back by x_dim elements. */
        v += valp1[offset + v2 + startp1];
        v += valp2[offset + v2 + startp2 - x_dim];
        /* Multiply by a single bit (0 or 1) read from v19.valp viewed as an int
           array: word (offset + v2) / 32, with bit 0 of the index mapping to
           the highest of the 32 bits (shift by 31 - index % 32).
           Assumes int is 32 bits wide -- TODO confirm. */
        v *= (((int *)v19.valp)[(offset + v2)/32] >> (31 - (offset + v2) % 32)) & 1;
        v *= 0.25;
        /* Add the matching v21 element, then store the result in v18. */
        v += ((double *) v21.valp)[offset + v2];
        ((double *) v18.valp)[offset + v2] = v;
        /* Store the absolute difference between the v20 element and the new
           value in v23. */
        ((double *) v23.valp)[offset + v2] = fabs(((double *) v20.valp)[offset +
v2] - v);
}

It is transformed to:
/* Hand-vectorized version: handles elements v2 and v2 + 1 per iteration.
   NOTE(review): the loop bound is unchanged while the step is 2, so if the
   number of remaining elements is odd the last iteration also touches the
   element at v2 + 1 == bound -- confirm the trip count is always even. */
for (; v2 < (x_dim - startp1) / STRIDE_LEN ; v2 += 2)
{
        /* v2df_u is declared outside this excerpt; from its use below it
           presumably overlays a two-double vector (.v) with a double[2]
           array (.d) -- TODO confirm. */
        register v2df_u v;
        register v2df_u t;

        /* Load two doubles from v25 and v34 and add them.
           __builtin_ia32_loadupd is GCC's x86 builtin for an unaligned load
           of two packed doubles. */
        v.v = __builtin_ia32_loadupd(((double *) v25.valp)
                        + (v1 + v25.startp[0] + y_dim) % y_dim * x_dim + v2);
        t.v = __builtin_ia32_loadupd(((double *) v34.valp)
                        + (v1 + v34.startp[0] + y_dim) % y_dim * x_dim + v2);
        v.v += t.v;
        /* Add two elements from each of valp1 and valp2. */
        t.v = __builtin_ia32_loadupd(&valp1[offset + startp1 + v2]);
        v.v += t.v;
        t.v = __builtin_ia32_loadupd(&valp2[offset + startp2 - x_dim + v2]);
        v.v += t.v;
        /* Apply the v19 bit mask one lane at a time through the .d[] view:
           lane 0 uses bit index offset + v2, lane 1 uses offset + v2 + 1.
           NOTE(review): these per-lane scalar accesses sit between vector
           operations on the same value; check the generated assembly to see
           what moves they cost. */
        v.d[0] *= (((int *)v19.valp)[(offset + v2)/32] >> (31 - (offset + v2) %
32)) & 1;
        v.d[1] *= (((int *)v19.valp)[(offset + v2 + 1)/32] >> (31 - (offset + v2
+ 1) % 32)) & 1;
        /* The scaling by 0.25 is also done per lane rather than on v.v. */
        v.d[0] *= 0.25;
        v.d[1] *= 0.25;
        /* Add two v21 elements and store both results to v18 with an
           unaligned packed store. */
        t.v = __builtin_ia32_loadupd(((double *) v21.valp) + offset + v2);
        v.v += t.v;
        __builtin_ia32_storeupd(((double *) v18.valp) + offset + v2, v.v);
        /* Compute v20 - result as a vector, then take fabs() of each lane
           separately and store the two values to v23. */
        t.v = __builtin_ia32_loadupd(((double *) v20.valp) + offset + v2);
        t.v -= v.v;
        *(((double *) v23.valp) + offset + v2) = fabs(t.d[0]);
        *(((double *) v23.valp) + offset + v2 + 1) = fabs(t.d[1]);
}

I measured the performance and, surprisingly, the transformed code is even
slower. Both pieces of code were compiled with -O3 and run on a 2.4 GHz Intel
Core 2.

I don't understand how performance can get worse after I use the vector
extension. I'm pretty sure GCC cannot vectorize the original code
automatically. Is it because the code uses SSE and the FPU together, so that
data has to be moved from the SSE registers to the FPU?

Best regards,
Zheng Da