How to make parallelizing loops and vectorization work at the same time?

Hanke Zhang via Gcc-help <gcc-help@xxxxxxxxxxx> · Fri, 15 Sep 2023 17:29:36 +0800

Hi, I'm trying to speed up my program with -ftree-vectorize and
-ftree-parallelize-loops.

Here are my test results with the different options (using
GCC 10.3.0 on an i9-12900KF):
gcc-10 test.c -O3 -flto
time: 29000ms
gcc-10 test.c -O3 -flto -mavx2 -ftree-vectorize
time: 17000ms
gcc-10 test.c -O3 -flto -ftree-parallelize-loops=24
time: 5000ms
gcc-10 test.c -O3 -flto -ftree-parallelize-loops=24 -mavx2 -ftree-vectorize
time: 5000ms

I found that these two options do not work together. If I use
`-ftree-vectorize` alone, it brings a big efficiency gain compared to
doing nothing. Likewise, if I use `-ftree-parallelize-loops` alone, it
also brings a big efficiency gain. But if I use both options,
vectorization fails: I don't get the benefit of vectorization, only
the benefit of parallelizing the loops.

I suspect the reason is that, once the loop has been parallelized, it
can no longer be vectorized. Is there any way I can get the benefits
of both optimizations?

Here is my example program, adapted from 462.libquantum in SPEC CPU2006:

```
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Widest unsigned integer type used for basis-state bit patterns. */
#define MAX_UNSIGNED unsigned long long

/*
 * Storage for a register's contents: one pointer to the complex
 * amplitudes and one pointer to the matching basis-state bit patterns.
 */
typedef struct quantum_reg_node_struct {
    float _Complex *amplitude; /* alpha_j: amplitude values */
    MAX_UNSIGNED *state;       /* j: basis-state values */
} quantum_reg_node;

/* Quantum register: bookkeeping fields plus a pointer to its node storage. */
typedef struct quantum_reg_struct {
    int width;              /* qubit count of the register */
    int size;               /* count of non-zero vectors */
    int hashw;              /* hash array width */
    quantum_reg_node *node; /* amplitude/state storage */
    int *hash;              /* hash array */
} quantum_reg;

/*
 * Apply a Toffoli (controlled-controlled-NOT) gate to the register.
 *
 * For each of the reg->size entries of reg->node->state, bit `target` is
 * flipped when both bit `control1` and bit `control2` are set; every other
 * entry is left unmodified.
 *
 * control1, control2 and target are bit positions. They must be
 * non-negative and smaller than the bit width of MAX_UNSIGNED, because
 * shifting by any other amount is undefined behaviour.
 */
void quantum_toffoli(int control1, int control2, int target, quantum_reg *reg) {
    const int count = reg->size;

    /* Nothing to do; also avoids touching reg->node for an empty register. */
    if (count <= 0) {
        return;
    }

    /*
     * Everything below is loop-invariant, so read/compute it once instead
     * of going through reg and re-shifting on every iteration.
     */
    MAX_UNSIGNED *const state = reg->node->state;
    const MAX_UNSIGNED control_mask =
        ((MAX_UNSIGNED)1 << control1) | ((MAX_UNSIGNED)1 << control2);
    const MAX_UNSIGNED target_mask = (MAX_UNSIGNED)1 << target;

    for (int i = 0; i < count; i++) {
        /* Both control bits set <=> masking leaves the mask unchanged. */
        if ((state[i] & control_mask) == control_mask) {
            state[i] ^= target_mask;
        }
    }
}

/*
 * Return a pseudo-random value in the range [0, 63], drawn from rand().
 * The caller uses it as a bit position within a MAX_UNSIGNED value.
 *
 * Declared with (void) so the function has a real prototype; an empty
 * parameter list leaves the arguments unchecked before C23.
 */
int get_random(void) {
    return rand() % 64;
}

/*
 * Initialise `reg` for the benchmark.
 *
 * Sets reg->size to 2097152 (2^21), allocates one node together with its
 * state and amplitude arrays of reg->size elements each, and zeroes every
 * element. The remaining fields of `reg` (width, hashw, hash) are not
 * touched.
 *
 * The node and both arrays are heap-allocated exactly once; the caller owns
 * them and releases them with free(). If any allocation fails, a message is
 * written to stderr and the process exits with EXIT_FAILURE.
 */
void init(quantum_reg *reg) {
    reg->size = 2097152; /* 2^21 entries */
    const size_t count = (size_t)reg->size;

    quantum_reg_node *node = malloc(sizeof *node);
    if (node == NULL) {
        fprintf(stderr, "init: out of memory\n");
        exit(EXIT_FAILURE);
    }

    node->state = malloc(count * sizeof *node->state);
    node->amplitude = malloc(count * sizeof *node->amplitude);
    if (node->state == NULL || node->amplitude == NULL) {
        fprintf(stderr, "init: out of memory\n");
        exit(EXIT_FAILURE);
    }

    /* Explicitly write every element so both arrays start out as zero. */
    for (size_t i = 0; i < count; i++) {
        node->amplitude[i] = 0;
        node->state[i] = 0;
    }

    reg->node = node;
}

int main() {
    quantum_reg reg;
    init(&reg);
    for (int i = 0; i < 65000; i++) {
        quantum_toffoli(get_random(), get_random(), get_random(), &reg);
    }
}
```

Thanks so much.