Hi, I see that my program works fine in debug mode, but not in release mode. With GDB I was able to find the function in which the error occurs. The code looks like this: std::vector<std::vector<inst_trace_t> *> threadblock_traces; ... printf("hello %d\n",threadblock_traces.size()); trace_kernel.get_next_threadblock_traces(threadblock_traces); At the printf(), I see that the size is 4, so the vector is not empty. According to GDB, the backtrace is: Program received signal SIGSEGV, Segmentation fault. __memmove_avx_unaligned_erms () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:384 384 ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: No such file or directory. (gdb) bt #0 __memmove_avx_unaligned_erms () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:384 #1 0x0000555555569849 in std::__copy_move<false, true, std::random_access_iterator_tag>::__copy_m<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*> (__result=<optimized out>, __last=<optimized out>, __first=<optimized out>) at /usr/include/c++/9/bits/stl_algobase.h:465 #2 std::__copy_move_a<false, std::vector<inst_trace_t, std::allocator<inst_trace_t> >* const*, std::vector<inst_trace_t, std::allocator<inst_trace_t> >**> (__result=<optimized out>, __last=<optimized out>, __first=<optimized out>) at /usr/include/c++/9/bits/stl_algobase.h:404 #3 std::__copy_move_a2<false, __gnu_cxx::__normal_iterator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >* const*, std::vector<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*, std::allocator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*> > >, std::vector<inst_trace_t, std::allocator<inst_trace_t> >**> (__result=<optimized out>, __last=..., __first=...) 
at /usr/include/c++/9/bits/stl_algobase.h:440 #4 std::copy<__gnu_cxx::__normal_iterator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >* const*, std::vector<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*, std::allocator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*> > >, std::vector<inst_trace_t, std::allocator<inst_trace_t> >**> (__result=<optimized out>, __last=..., __first=...) at /usr/include/c++/9/bits/stl_algobase.h:474 #5 std::__uninitialized_copy<true>::__uninit_copy<__gnu_cxx::__normal_iterator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >* const*, std::vector<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*, std::allocator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*> > >, std::vector<inst_trace_t, std::allocator<inst_trace_t> >**> (__result=<optimized out>, __last=..., __first=...) at /usr/include/c++/9/bits/stl_uninitialized.h:101 #6 std::uninitialized_copy<__gnu_cxx::__normal_iterator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >* const*, std::vector<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*, std::allocator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*> > >, std::vector<inst_trace_t, std::allocator<inst_trace_t> >**> (__result=<optimized out>, __last=..., __first=...) at /usr/include/c++/9/bits/stl_uninitialized.h:140 #7 std::__uninitialized_copy_a<__gnu_cxx::__normal_iterator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >* const*, std::vector<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*, std::allocator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*> > >, std::vector<inst_trace_t, std::allocator<inst_trace_t> >**, std::vector<inst_trace_t, std::allocator<inst_trace_t> >*> (__result=<optimized out>, __last=..., __first=...) 
at /usr/include/c++/9/bits/stl_uninitialized.h:307 #8 std::vector<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*, std::allocator<std::vector<inst_trace_t, std::allocator<inst_trace_t> >*> >::vector ( __x=std::vector of length 4, capacity 4 = {...}, this=0x7fffffffc4f0) --Type <RET> for more, q to quit, c to continue without paging-- at /usr/include/c++/9/bits/stl_vector.h:555 #9 trace_shader_core_ctx::init_traces (this=0x55555696ec30, start_warp=0, end_warp=4, kernel=...) at trace_driven.cc:486 With the readelf command, you can see the compile options for both the debug and release builds: RELEASE: $ readelf -p .GCC.command.line gpu-simulator/bin/release/accel-sim.out String dump of section '.GCC.command.line': [ 0] -I ./build/release [ 13] -I ./trace-driven [ 25] -I ./trace-parser [ 37] -I /home/mahmood/accel-sim-framework/gpu-simulator/gpgpu-sim/libcuda [ 7d] -I /home/mahmood/accel-sim-framework/gpu-simulator/gpgpu-sim/src [ bf] -I /usr/local/cuda-11.2/include [ df] -imultiarch x86_64-linux-gnu [ fc] -D_GNU_SOURCE [ 10a] main.cc [ 112] -mtune=generic [ 121] -march=x86-64 [ 12f] -auxbase-strip ./build/release/main.o [ 155] -g3 [ 159] -O3 [ 15d] -Wall [ 163] -std=c++11 [ 16e] -fPIC [ 174] -frecord-gcc-switches [ 18a] -fasynchronous-unwind-tables [ 1a7] -fstack-protector-strong [ 1c0] -Wformat-security [ 1d2] -fstack-clash-protection [ 1eb] -fcf-protection DEBUG: $ readelf -p .GCC.command.line gpu-simulator/bin/debug/accel-sim.out String dump of section '.GCC.command.line': [ 0] -I ./build/debug [ 11] -I ./trace-driven [ 23] -I ./trace-parser [ 35] -I /home/mahmood/accel-sim-framework/gpu-simulator/gpgpu-sim/libcuda [ 7b] -I /home/mahmood/accel-sim-framework/gpu-simulator/gpgpu-sim/src [ bd] -I /usr/local/cuda-11.2/include [ dd] -imultiarch x86_64-linux-gnu [ fa] -D_GNU_SOURCE [ 108] main.cc [ 110] -mtune=generic [ 11f] -march=x86-64 [ 12d] -auxbase-strip ./build/debug/main.o [ 151] -g3 [ 155] -O0 [ 159] -Wall [ 15f] -std=c++11 [ 16a] -fPIC [ 170] 
-frecord-gcc-switches [ 186] -fasynchronous-unwind-tables [ 1a3] -fstack-protector-strong [ 1bc] -Wformat-security [ 1ce] -fstack-clash-protection [ 1e7] -fcf-protection I would like to know whether a similar problem has been reported before. Does anyone have an idea what might cause it? Regards, Mahmood