Hi, I'm trying to accelerate my program with `-ftree-vectorize` and `-ftree-parallelize-loops`. Here are my test results with the different options (GCC 10.3.0 on an i9-12900KF):

- `gcc-10 test.c -O3 -flto` — time: 29000 ms
- `gcc-10 test.c -O3 -flto -mavx2 -ftree-vectorize` — time: 17000 ms
- `gcc-10 test.c -O3 -flto -ftree-parallelize-loops=24` — time: 5000 ms
- `gcc-10 test.c -O3 -flto -ftree-parallelize-loops=24 -mavx2 -ftree-vectorize` — time: 5000 ms

I found that these two options do not work together. Using `-ftree-vectorize` alone gives a large speedup compared to the baseline, and using `-ftree-parallelize-loops` alone also gives a large speedup. But when I use both options, vectorization appears to fail: I only get the benefit of loop parallelization, not the benefit of vectorization. I suspect the reason is that the loop can no longer be vectorized once it has been parallelized, but is there any way I can reap the benefits of both optimizations?
/*
 * Example program, adapted from 462.libquantum in SPEC CPU2006.
 *
 * Builds a quantum register of basis states and repeatedly applies a
 * Toffoli gate with randomly chosen control and target bits to it.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define MAX_UNSIGNED unsigned long long

struct quantum_reg_node_struct {
    float _Complex *amplitude; /* alpha_j */
    MAX_UNSIGNED *state;       /* j */
};

typedef struct quantum_reg_node_struct quantum_reg_node;

struct quantum_reg_struct {
    int width; /* number of qubits in the qureg */
    int size;  /* number of non-zero vectors */
    int hashw; /* width of the hash array */
    quantum_reg_node *node;
    int *hash;
};

typedef struct quantum_reg_struct quantum_reg;

/*
 * Applies a Toffoli gate to every basis state in reg: for each state that has
 * both bit control1 and bit control2 set, bit target is flipped.
 *
 * control1, control2 and target are bit indices and must be smaller than the
 * bit width of MAX_UNSIGNED; shifting by more is undefined behavior.
 */
void quantum_toffoli(int control1, int control2, int target, quantum_reg *reg)
{
    /*
     * Each control mask has exactly one bit set, so "both control bits are
     * set" is the same as "all bits of the combined mask are set".
     */
    const MAX_UNSIGNED controls =
        ((MAX_UNSIGNED)1 << control1) | ((MAX_UNSIGNED)1 << control2);
    const MAX_UNSIGNED flip = (MAX_UNSIGNED)1 << target;

    /* Loop-invariant values are read once instead of on every iteration. */
    MAX_UNSIGNED *state = reg->node->state;
    const int size = reg->size;

    for (int i = 0; i < size; i++) {
        if ((state[i] & controls) == controls) {
            state[i] ^= flip;
        }
    }
}

/* Returns a pseudo-random value in [0, 63] derived from rand(). */
int get_random(void)
{
    return rand() % 64;
}

/* malloc() wrapper that terminates the program when allocation fails. */
static void *xmalloc(size_t bytes)
{
    void *p = malloc(bytes);

    if (p == NULL) {
        fprintf(stderr, "out of memory (requested %zu bytes)\n", bytes);
        exit(EXIT_FAILURE);
    }
    return p;
}

/*
 * Initializes reg with 2097152 basis states, all with state 0 and
 * amplitude 0.
 *
 * Only reg->size and reg->node are written; the remaining fields of reg are
 * left untouched. The node and its two arrays are heap-allocated and owned by
 * the caller afterwards. The program exits if an allocation fails.
 */
void init(quantum_reg *reg)
{
    reg->size = 2097152;

    const size_t count = (size_t)reg->size;
    quantum_reg_node *node = xmalloc(sizeof *node);

    /* Allocate exactly once; nothing is overwritten and leaked. */
    node->state = xmalloc(count * sizeof *node->state);
    node->amplitude = xmalloc(count * sizeof *node->amplitude);

    for (size_t i = 0; i < count; i++) {
        node->amplitude[i] = 0;
        node->state[i] = 0;
    }

    reg->node = node;
}

int main(void)
{
    quantum_reg reg = {0};

    init(&reg);

    for (int i = 0; i < 65000; i++) {
        /*
         * Draw the random bit indices in separate statements: the evaluation
         * order of function arguments is unspecified, so calling get_random()
         * directly in the argument list would make the control/target
         * assignment compiler-dependent.
         */
        int control1 = get_random();
        int control2 = get_random();
        int target = get_random();

        quantum_toffoli(control1, control2, target, &reg);
    }

    free(reg.node->amplitude);
    free(reg.node->state);
    free(reg.node);
    return 0;
}