On 2008-07-02 20:08:52 -0400, Michael Meissner wrote: > Without having the code in a bug report, there is no way to say what > the problem is. It is best if you can take some time to reduce the > code to an example that shows clearly where the slowdown occurs. You > can use normal -pg profiling, oprofile, or tools like Code > Analyst/Vtune to identify where the hot spots are if you don't > already know where the hot function is. The simpler you make the > example, the more likely somebody will fix it (unless you pay > somebody to fix it, and then presumably as part of the > investigation, they will reduce it). -pg/gprof didn't help since there's more or less a single function (in fact there are several, but they are inlined). In fact, -march solved the problem, and after all, it's not abnormal for a later gcc version to generate less efficient code if the architecture is incorrect. Now, I thought that gcc would have used sensible default values (I couldn't find anything in the manual about the default architecture). Also, is there a way to *automatically* get the best -march value for the local machine? In my case, jobs run on various machines (they're submitted by SGE): they generate C code, compile it locally and run it. I don't even have direct ssh access to some of these machines. I could instruct my jobs to try different options, do timings and cache the best option for the next jobs that will run on the machine, but if I can directly get the best one, that would be easier. In case someone is interested, I've attached an example of generated C source (the one I used for the timings). On one of the machines, it was the "naive_method" part that was slower (with no -march). But on another machine, it was another part (I didn't determine which). Note: this is an implementation of the algorithms described in my PhD thesis 8 years ago and improved over the last few years.
-- Vincent Lefèvre <vincent@xxxxxxxxxx> - Web: <http://www.vinc17.org/> 100% accessible validated (X)HTML - Blog: <http://www.vinc17.org/blog/> Work: CR INRIA - computer arithmetic / Arenaire project (LIP, ENS-Lyon)
/* tmp/tst-p345-54-0.c, generated by test32f 21738 */ #include <stdio.h> #include <stdlib.h> #include <stdint.h> #include <limits.h> #include <assert.h> #ifndef NOTIMING #include <unistd.h> #include <sys/times.h> #endif #if CHAR_BIT != 8 #error CHAR_BIT must be equal to 8 #endif #define BUFFSIZE (8<<12) #include <gmp.h> #if GMP_LIMB_BITS != 32 && GMP_LIMB_BITS != 64 #error GMP_LIMB_BITS must be equal to 32 or 64 #endif #if defined(GMP_NUMB_BITS) && GMP_NUMB_BITS != GMP_LIMB_BITS #error GMP_NUMB_BITS must be equal to GMP_LIMB_BITS (when defined) #endif #define NPTS (0x10000000000) #define K 32768 #define NSSI 16 #define LSSI 2048 #define LOGL 11 #define LOGSS 4 #define LOGMS 3 #define MIN(a, b) (((a) < (b)) ? (a) : (b)) #define TESTEND0(COND) \ do \ if (COND) \ return b >> 32 >= d; \ while (0) #define R0 (0) #define TESTEND1(COND) \ do \ if (COND) \ return -1; /* >= N */ \ while (0) #define R1 (r) #if GMP_LIMB_BITS == 32 uint32_t a0_0[] = { 0x42954574, 0x86A0263D, 0x527F20F5, 0xDFF00C26 }; uint32_t a0_1[] = { 0x0DB3846A, 0x65610788, 0x6F49AF5C, 0x0C0808FA, 0xB0A18795 }; uint32_t a0_2[] = { 0x39904786, 0x5AFA1155, 0x026030B1, 0xD06ABF0A, 0x168FC8FE, 0x04E42748 }; uint32_t a0_3[] = { 0x94F5B78B, 0x68466878, 0xB9401795, 0x19D21418, 0xCA620C9B, 0x16C83961, 0x00000000 }; uint32_t a0_4[] = { 0x9FF5506F, 0x36E66055, 0x2B77A122, 0xE59AFB9A, 0x497B0499, 0x94C67171, 0x69CDDFA3, 0x00000000, 0x00000000 }; uint32_t a0_5[] = { 0xCFD34284, 0x6F76EE42, 0x18F7684A, 0x7BD772A4, 0x34956E4F, 0xB2E429D8, 0xE9EFBE77, 0x00000001, 0x00000000, 0x00000000 }; uint32_t a0_6[] = { 0xE9DAC7E1, 0x5AF36977, 0x0F40CD56, 0x0D292B1C, 0x225F252E, 0xBF34CF06, 0xD60B875E, 0x00000008, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a0_7[] = { 0x756193CC, 0x144D7FAC, 0x6BBFC224, 0xCCEDF9C9, 0x1338BBAE, 0x98F2D180, 0xAD2EB7E0, 0x00000028, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a0_8[] = { 0x496229CB, 0x00FBC966, 0xFCFB2183, 0x4ED17D0A, 0x7AEA009E, 0x7FDD995D, 0xB306FF2B, 0x000000BA, 
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a0_9[] = { 0x20D2B101, 0x4735D743, 0xEC9CB2E1, 0x3451D3B0, 0x35A887A1, 0x9DE774D6, 0x6359ADEB, 0x00000356, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a0_10[] = { 0x88462A01, 0x62A08AB6, 0xEEABD553, 0xFC793408, 0xAE24E1A0, 0xE61B663A, 0x50CB56A0, 0x00000F3A, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a0_11[] = { 0x25FDE8D6, 0xDEDDD071, 0x103E7414, 0xA6E8304F, 0xEB0BB407, 0xB0C4A447, 0x140F635C, 0xE74300F6, 0x00004545, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a0_12[] = { 0x25DF8300, 0x1C452D59, 0x98EDF933, 0xE65D85FE, 0x6F9D7A2E, 0x115FE606, 0xD3D91040, 0x00013A30, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a1_0[] = { 0x7D7C36F2, 0x6EC64873, 0xF1AA5F40, 0x19D75C5E }; uint32_t a1_1[] = { 0x702CF201, 0x7263402A, 0x862C8FBF, 0x4E901657, 0x000009C8 }; uint32_t a1_2[] = { 0x8BA80E38, 0xE0BB7DDA, 0x0D2FA4C0, 0x72C32AF7, 0x00002D90, 0x00000000 }; uint32_t a1_3[] = { 0x93B1DC07, 0x4898CCCE, 0xF85040F7, 0xBF453FA0, 0x0000D39B, 0x00000000, 0x00000000 }; uint32_t a1_4[] = { 0x09627E73, 0x1C4B0EF7, 0x56BE4DA3, 0x7883D681, 0x7CE68FCE, 0x0003D3DF, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a1_5[] = { 0x4373E578, 0xAFA64D8A, 0x4373C426, 0x41059171, 0x0E94D18C, 0x0011AC17, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a1_6[] = { 0x740AFCDF, 0xC8D21C77, 0x327B2C6C, 0x0C1C35CF, 0x6F068054, 0x00515A5D, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a1_7[] = { 0xD2F2CCE1, 0xB92BF1F8, 0xA75FDF8F, 0x55A975D6, 0xFB00A30E, 0x0175660D, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a1_8[] = { 0xFF60C84D, 0xC55428BE, 0x114E4780, 0x62D04218, 0x4C9D0978, 0x06ACC6B3, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 
0x00000000, 0x00000000, 0x00000000 }; uint32_t a1_9[] = { 0x71A36B1B, 0xB197D062, 0x38BDBF1D, 0x6B78B857, 0x67FC6F80, 0x1E74A196, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a1_10[] = { 0xA75A938C, 0xCD1B5BD0, 0x85493286, 0x5E2B70A9, 0xC7BDC8A7, 0x8A8BCE84, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a1_11[] = { 0x5AB24BBF, 0xF266388A, 0x0BFD31DB, 0xF45DCCBB, 0xCC0CDF3A, 0x208022BF, 0x7461A7B2, 0x00000002, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a2_0[] = { 0x671F05D0, 0xFF1EF4B7, 0x13909D1F, 0x00000000 }; uint32_t a2_1[] = { 0x15A34940, 0x82540254, 0x5B20E585, 0x00000000, 0x00000000 }; uint32_t a2_2[] = { 0xB3D93F19, 0xAB6A1B84, 0xA7377E86, 0x00000001, 0x00000000, 0x00000000 }; uint32_t a2_3[] = { 0x343BA48B, 0x73A93AD7, 0xA7BEF9BB, 0x00000007, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a2_4[] = { 0x125F4994, 0x6C64AA9E, 0x495DC8B8, 0x582E1CD8, 0x00000023, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a2_5[] = { 0x961059C9, 0xE059F4C2, 0x9D84EDC6, 0xB4BADC97, 0x000000A2, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a2_6[] = { 0xAB35AAD8, 0xFF948F07, 0x8CC30069, 0xCC1BEF54, 0x000002EA, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a2_7[] = { 0x636B34D4, 0x03E8FCD2, 0xAE43FD2D, 0x8D667AC5, 0x00000D59, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a2_8[] = { 0x2A55D7C3, 0x13E40612, 0x25954F24, 0x432C456E, 0x00003CE9, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a2_9[] = { 0x1DEC644F, 0xFF938405, 0xD25FEB3A, 0x9D071B1E, 0x00011517, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; uint32_t a2_10[] = { 
0x997617FA, 0xBE75E8BB, 0x457F9819, 0x4F644100, 0x0004E8C3, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000 }; #else uint64_t a0_0[] = { 0x86A0263D42954574, 0xDFF00C26527F20F5 }; uint64_t a0_1[] = { 0x0DB3846A00000000, 0x6F49AF5C65610788, 0xB0A187950C0808FA }; uint64_t a0_2[] = { 0x5AFA115539904786, 0xD06ABF0A026030B1, 0x04E42748168FC8FE }; uint64_t a0_3[] = { 0x94F5B78B00000000, 0xB940179568466878, 0xCA620C9B19D21418, 0x0000000016C83961 }; uint64_t a0_4[] = { 0x9FF5506F00000000, 0x2B77A12236E66055, 0x497B0499E59AFB9A, 0x69CDDFA394C67171, 0x0000000000000000 }; uint64_t a0_5[] = { 0x6F76EE42CFD34284, 0x7BD772A418F7684A, 0xB2E429D834956E4F, 0x00000001E9EFBE77, 0x0000000000000000 }; uint64_t a0_6[] = { 0xE9DAC7E100000000, 0x0F40CD565AF36977, 0x225F252E0D292B1C, 0xD60B875EBF34CF06, 0x0000000000000008, 0x0000000000000000 }; uint64_t a0_7[] = { 0x144D7FAC756193CC, 0xCCEDF9C96BBFC224, 0x98F2D1801338BBAE, 0x00000028AD2EB7E0, 0x0000000000000000, 0x0000000000000000 }; uint64_t a0_8[] = { 0x496229CB00000000, 0xFCFB218300FBC966, 0x7AEA009E4ED17D0A, 0xB306FF2B7FDD995D, 0x00000000000000BA, 0x0000000000000000, 0x0000000000000000 }; uint64_t a0_9[] = { 0x4735D74320D2B101, 0x3451D3B0EC9CB2E1, 0x9DE774D635A887A1, 0x000003566359ADEB, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a0_10[] = { 0x88462A0100000000, 0xEEABD55362A08AB6, 0xAE24E1A0FC793408, 0x50CB56A0E61B663A, 0x0000000000000F3A, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a0_11[] = { 0x25FDE8D600000000, 0x103E7414DEDDD071, 0xEB0BB407A6E8304F, 0x140F635CB0C4A447, 0x00004545E74300F6, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a0_12[] = { 0x25DF830000000000, 0x98EDF9331C452D59, 0x6F9D7A2EE65D85FE, 0xD3D91040115FE606, 0x0000000000013A30, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a1_0[] = { 0x6EC648737D7C36F2, 
0x19D75C5EF1AA5F40 }; uint64_t a1_1[] = { 0x702CF20100000000, 0x862C8FBF7263402A, 0x000009C84E901657 }; uint64_t a1_2[] = { 0xE0BB7DDA8BA80E38, 0x72C32AF70D2FA4C0, 0x0000000000002D90 }; uint64_t a1_3[] = { 0x93B1DC0700000000, 0xF85040F74898CCCE, 0x0000D39BBF453FA0, 0x0000000000000000 }; uint64_t a1_4[] = { 0x09627E7300000000, 0x56BE4DA31C4B0EF7, 0x7CE68FCE7883D681, 0x000000000003D3DF, 0x0000000000000000 }; uint64_t a1_5[] = { 0xAFA64D8A4373E578, 0x410591714373C426, 0x0011AC170E94D18C, 0x0000000000000000, 0x0000000000000000 }; uint64_t a1_6[] = { 0x740AFCDF00000000, 0x327B2C6CC8D21C77, 0x6F0680540C1C35CF, 0x0000000000515A5D, 0x0000000000000000, 0x0000000000000000 }; uint64_t a1_7[] = { 0xB92BF1F8D2F2CCE1, 0x55A975D6A75FDF8F, 0x0175660DFB00A30E, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a1_8[] = { 0xFF60C84D00000000, 0x114E4780C55428BE, 0x4C9D097862D04218, 0x0000000006ACC6B3, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a1_9[] = { 0xB197D06271A36B1B, 0x6B78B85738BDBF1D, 0x1E74A19667FC6F80, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a1_10[] = { 0xA75A938C00000000, 0x85493286CD1B5BD0, 0xC7BDC8A75E2B70A9, 0x000000008A8BCE84, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a1_11[] = { 0x5AB24BBF00000000, 0x0BFD31DBF266388A, 0xCC0CDF3AF45DCCBB, 0x7461A7B2208022BF, 0x0000000000000002, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a2_0[] = { 0xFF1EF4B7671F05D0, 0x0000000013909D1F }; uint64_t a2_1[] = { 0x15A3494000000000, 0x5B20E58582540254, 0x0000000000000000 }; uint64_t a2_2[] = { 0xAB6A1B84B3D93F19, 0x00000001A7377E86, 0x0000000000000000 }; uint64_t a2_3[] = { 0x343BA48B00000000, 0xA7BEF9BB73A93AD7, 0x0000000000000007, 0x0000000000000000 }; uint64_t a2_4[] = { 0x125F499400000000, 0x495DC8B86C64AA9E, 0x00000023582E1CD8, 0x0000000000000000, 0x0000000000000000 }; uint64_t a2_5[] = { 
0xE059F4C2961059C9, 0xB4BADC979D84EDC6, 0x00000000000000A2, 0x0000000000000000, 0x0000000000000000 }; uint64_t a2_6[] = { 0xAB35AAD800000000, 0x8CC30069FF948F07, 0x000002EACC1BEF54, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a2_7[] = { 0x03E8FCD2636B34D4, 0x8D667AC5AE43FD2D, 0x0000000000000D59, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a2_8[] = { 0x2A55D7C300000000, 0x25954F2413E40612, 0x00003CE9432C456E, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a2_9[] = { 0xFF9384051DEC644F, 0x9D071B1ED25FEB3A, 0x0000000000011517, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; uint64_t a2_10[] = { 0x997617FA00000000, 0x457F9819BE75E8BB, 0x0004E8C34F644100, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 }; #endif #if 0 #include <inttypes.h> #define OUT0(T) fprintf(stderr, "%2d" \ " x = %016" PRIX64 " u = %-6" PRIuFAST32 \ " y = %016" PRIX64 " v = %-6" PRIuFAST32 \ " b = %016" PRIX64 " r = %-6" PRIuFAST32 \ "\n", T, x, u, y, v, b, R0) #else #define OUT0(T) ((void) 0) #endif static inline int glt0(uint64_t x, uint64_t b, uint_fast32_t d, uint_fast32_t N) { uint64_t y = - x; uint_fast32_t u = 1, v = 1; OUT0(0); for (;;) { if (b < x) { if (LOGMS < 64 && ((x - b) >> LOGMS) >= y) { uint64_t q = (x - b) / y - 1; TESTEND0(q >= N); x -= (unsigned int) q * y; v += (unsigned int) q * u; OUT0(10); } if (LOGMS == 0 || (LOGMS < 64 && (y >> LOGMS) > x)) { uint64_t q = y / x; TESTEND0(q >= N); y -= (unsigned int) q * x; u += (unsigned int) q * v; OUT0(11); } else while (x < y) { TESTEND0(u + v >= N); y -= x; u += v; OUT0(12); } TESTEND0(u + v >= N); x -= y; v += u; OUT0(13); } else { if (LOGMS < 64) { uint64_t diff; if (b >> 32 < d) return 0; diff = b - ((uint64_t) d << 32); if (diff > ((uint64_t) d << 32)) { diff -= (uint64_t) d << 32; if ((MIN(y, diff) >> LOGMS) >= x) { uint64_t q = MIN(y, diff) / x - 1; 
b -= (unsigned int) q * x; y -= (unsigned int) q * x; u += (unsigned int) q * v; OUT0(20); } } } b -= x; OUT0(21); if (LOGMS == 0 || (LOGMS < 64 && (x >> LOGMS) > y)) { uint64_t q = x / y; TESTEND0(q >= N); x -= (unsigned int) q * y; v += (unsigned int) q * u; OUT0(22); } else while (y < x) { TESTEND0(u + v >= N); x -= y; v += u; OUT0(23); } TESTEND0(u + v >= N); y -= x; u += v; OUT0(24); } } } int32_t test(uint32_t *buffer, char *endbuff, long pgen) { uint64_t a0, a1, a2; uint64_t i = NPTS; /* number of points */ int_fast32_t n = 0; /* number of exceptions */ if (pgen && i >> pgen > K) i = (uint64_t) K << pgen; assert(sizeof(uint64_t) == 8 && (uint64_t) -1 > 0); assert(sizeof(int_fast32_t) >= 4); assert(sizeof(mp_limb_t) * CHAR_BIT == GMP_LIMB_BITS); assert(mp_bits_per_limb == GMP_LIMB_BITS); do { uint_fast32_t h = NSSI; /* Coefficient initialization (degree 2) */ #if GMP_LIMB_BITS == 32 a0 = ((uint64_t) a0_0[4-1] << 32) + a0_0[4-2]; a1 = ((uint64_t) a1_0[4-1] << 32) + a1_0[4-2]; a2 = ((uint64_t) a2_0[4-1] << 32) + a2_0[4-2]; #else a0 = a0_0[2-1]; a1 = a1_0[2-1]; a2 = a2_0[2-1]; #endif /* Global line test on subinterval */ { uint64_t b; uint_fast32_t d; #if 0 uint_fast32_t r, s; #endif if ((uint32_t) (a2 >> 32) == 0) { b = (uint64_t) 1 << 32; d = (uint_fast32_t) 1 + 1 + ((uint32_t) a2 >> 3); } else if ((uint32_t) (a2 >> 32) == (uint32_t) -1) { d = (uint_fast32_t) 1 + 1 + ((- (uint32_t) a2) >> 3); b = (uint64_t) d << 32; } else goto subint; if (glt0(a1, b - a0, d, K)) goto next; #if 0 for (s = K; (r = glt1(a1, b - a0, d, s)) < s; ) { a0 += r * a1 + (r*(r-1)/2) * a2; a1 += r * a2; if (a0 >> 32 < 1) { uint64_t j; if ((void *) buffer == (void *) endbuff || n == INT_MAX) return -1; n++; j = (NPTS - i) + (K - (s - r)); buffer[0] = (uint32_t) j; buffer[1] = j >> 32; buffer += 2; } s -= r + 1; if (s == 0) break; a0 += a1; a1 += a2; } goto next; #endif } subint: do { int_fast32_t k = LSSI; if (LOGSS) { /* Global line test on subsubinterval */ if ((uint32_t) (a2 >> 32) == 
0) { if (!glt0(a1, ((uint64_t) 1 << 32) - a0, (uint_fast32_t) 1 + 1 + ((uint32_t) a2 >> 11), LSSI)) goto naive_method; } else if ((uint32_t) (a2 >> 32) == (uint32_t) -1) { uint_fast32_t err; err = (uint_fast32_t) 1 + 1 + ((- (uint32_t) a2) >> 11); if (!glt0(a1, ((uint64_t) err << 32) - a0, err, LSSI)) goto naive_method; } else goto naive_method; a0 += a1 << LOGL; a1 += a2 << LOGL; a0 += (a2 << (2 * LOGL - 1)) - (a2 << (LOGL - 1)); continue; } naive_method: do { if (a0 >> 32 < 1) { uint64_t j; if ((void *) buffer == (void *) endbuff || n == INT_MAX) return -1; n++; j = (NPTS - i) + (LOGSS ? ((NSSI - h) << LOGL) : 0) + (LSSI - k); buffer[0] = (uint32_t) j; buffer[1] = j >> 32; buffer += 2; } a0 += a1; a1 += a2; } while (--k); } while (LOGSS && --h); next: #if GMP_LIMB_BITS == 32 mpn_add_n((mp_limb_t *) a0_0, (mp_limb_t *) a0_0, (mp_limb_t *) a0_1 + 1, 4); mpn_add_n((mp_limb_t *) a0_1, (mp_limb_t *) a0_1, (mp_limb_t *) a0_2 + 1, 5); mpn_add_n((mp_limb_t *) a0_2, (mp_limb_t *) a0_2, (mp_limb_t *) a0_3 + 1, 6); mpn_add_n((mp_limb_t *) a0_3, (mp_limb_t *) a0_3, (mp_limb_t *) a0_4 + 2, 7); mpn_add_n((mp_limb_t *) a0_4, (mp_limb_t *) a0_4, (mp_limb_t *) a0_5 + 1, 9); mpn_add_n((mp_limb_t *) a0_5, (mp_limb_t *) a0_5, (mp_limb_t *) a0_6 + 1, 10); mpn_add_n((mp_limb_t *) a0_6, (mp_limb_t *) a0_6, (mp_limb_t *) a0_7 + 1, 11); mpn_add_n((mp_limb_t *) a0_7, (mp_limb_t *) a0_7, (mp_limb_t *) a0_8 + 1, 12); mpn_add_n((mp_limb_t *) a0_8, (mp_limb_t *) a0_8, (mp_limb_t *) a0_9 + 1, 13); mpn_add_n((mp_limb_t *) a0_9, (mp_limb_t *) a0_9, (mp_limb_t *) a0_10 + 1, 14); mpn_add_n((mp_limb_t *) a0_10, (mp_limb_t *) a0_10, (mp_limb_t *) a0_11 + 2, 15); mpn_add_n((mp_limb_t *) a0_11, (mp_limb_t *) a0_11, (mp_limb_t *) a0_12 + 0, 17); mpn_add_n((mp_limb_t *) a1_0, (mp_limb_t *) a1_0, (mp_limb_t *) a1_1 + 1, 4); mpn_add_n((mp_limb_t *) a1_1, (mp_limb_t *) a1_1, (mp_limb_t *) a1_2 + 1, 5); mpn_add_n((mp_limb_t *) a1_2, (mp_limb_t *) a1_2, (mp_limb_t *) a1_3 + 1, 6); mpn_add_n((mp_limb_t *) 
a1_3, (mp_limb_t *) a1_3, (mp_limb_t *) a1_4 + 2, 7); mpn_add_n((mp_limb_t *) a1_4, (mp_limb_t *) a1_4, (mp_limb_t *) a1_5 + 1, 9); mpn_add_n((mp_limb_t *) a1_5, (mp_limb_t *) a1_5, (mp_limb_t *) a1_6 + 1, 10); mpn_add_n((mp_limb_t *) a1_6, (mp_limb_t *) a1_6, (mp_limb_t *) a1_7 + 1, 11); mpn_add_n((mp_limb_t *) a1_7, (mp_limb_t *) a1_7, (mp_limb_t *) a1_8 + 1, 12); mpn_add_n((mp_limb_t *) a1_8, (mp_limb_t *) a1_8, (mp_limb_t *) a1_9 + 1, 13); mpn_add_n((mp_limb_t *) a1_9, (mp_limb_t *) a1_9, (mp_limb_t *) a1_10 + 1, 14); mpn_add_n((mp_limb_t *) a1_10, (mp_limb_t *) a1_10, (mp_limb_t *) a1_11 + 2, 15); mpn_add_n((mp_limb_t *) a2_0, (mp_limb_t *) a2_0, (mp_limb_t *) a2_1 + 1, 4); mpn_add_n((mp_limb_t *) a2_1, (mp_limb_t *) a2_1, (mp_limb_t *) a2_2 + 1, 5); mpn_add_n((mp_limb_t *) a2_2, (mp_limb_t *) a2_2, (mp_limb_t *) a2_3 + 1, 6); mpn_add_n((mp_limb_t *) a2_3, (mp_limb_t *) a2_3, (mp_limb_t *) a2_4 + 2, 7); mpn_add_n((mp_limb_t *) a2_4, (mp_limb_t *) a2_4, (mp_limb_t *) a2_5 + 1, 9); mpn_add_n((mp_limb_t *) a2_5, (mp_limb_t *) a2_5, (mp_limb_t *) a2_6 + 1, 10); mpn_add_n((mp_limb_t *) a2_6, (mp_limb_t *) a2_6, (mp_limb_t *) a2_7 + 1, 11); mpn_add_n((mp_limb_t *) a2_7, (mp_limb_t *) a2_7, (mp_limb_t *) a2_8 + 1, 12); mpn_add_n((mp_limb_t *) a2_8, (mp_limb_t *) a2_8, (mp_limb_t *) a2_9 + 1, 13); mpn_add_n((mp_limb_t *) a2_9, (mp_limb_t *) a2_9, (mp_limb_t *) a2_10 + 1, 14); #else mpn_add_n((mp_limb_t *) a0_0, (mp_limb_t *) a0_0, (mp_limb_t *) a0_1 + 1, 2); mpn_add_n((mp_limb_t *) a0_1, (mp_limb_t *) a0_1, (mp_limb_t *) a0_2 + 0, 3); mpn_add_n((mp_limb_t *) a0_2, (mp_limb_t *) a0_2, (mp_limb_t *) a0_3 + 1, 3); mpn_add_n((mp_limb_t *) a0_3, (mp_limb_t *) a0_3, (mp_limb_t *) a0_4 + 1, 4); mpn_add_n((mp_limb_t *) a0_4, (mp_limb_t *) a0_4, (mp_limb_t *) a0_5 + 0, 5); mpn_add_n((mp_limb_t *) a0_5, (mp_limb_t *) a0_5, (mp_limb_t *) a0_6 + 1, 5); mpn_add_n((mp_limb_t *) a0_6, (mp_limb_t *) a0_6, (mp_limb_t *) a0_7 + 0, 6); mpn_add_n((mp_limb_t *) a0_7, (mp_limb_t *) a0_7, 
(mp_limb_t *) a0_8 + 1, 6); mpn_add_n((mp_limb_t *) a0_8, (mp_limb_t *) a0_8, (mp_limb_t *) a0_9 + 0, 7); mpn_add_n((mp_limb_t *) a0_9, (mp_limb_t *) a0_9, (mp_limb_t *) a0_10 + 1, 7); mpn_add_n((mp_limb_t *) a0_10, (mp_limb_t *) a0_10, (mp_limb_t *) a0_11 + 1, 8); mpn_add_n((mp_limb_t *) a0_11, (mp_limb_t *) a0_11, (mp_limb_t *) a0_12 + 0, 9); mpn_add_n((mp_limb_t *) a1_0, (mp_limb_t *) a1_0, (mp_limb_t *) a1_1 + 1, 2); mpn_add_n((mp_limb_t *) a1_1, (mp_limb_t *) a1_1, (mp_limb_t *) a1_2 + 0, 3); mpn_add_n((mp_limb_t *) a1_2, (mp_limb_t *) a1_2, (mp_limb_t *) a1_3 + 1, 3); mpn_add_n((mp_limb_t *) a1_3, (mp_limb_t *) a1_3, (mp_limb_t *) a1_4 + 1, 4); mpn_add_n((mp_limb_t *) a1_4, (mp_limb_t *) a1_4, (mp_limb_t *) a1_5 + 0, 5); mpn_add_n((mp_limb_t *) a1_5, (mp_limb_t *) a1_5, (mp_limb_t *) a1_6 + 1, 5); mpn_add_n((mp_limb_t *) a1_6, (mp_limb_t *) a1_6, (mp_limb_t *) a1_7 + 0, 6); mpn_add_n((mp_limb_t *) a1_7, (mp_limb_t *) a1_7, (mp_limb_t *) a1_8 + 1, 6); mpn_add_n((mp_limb_t *) a1_8, (mp_limb_t *) a1_8, (mp_limb_t *) a1_9 + 0, 7); mpn_add_n((mp_limb_t *) a1_9, (mp_limb_t *) a1_9, (mp_limb_t *) a1_10 + 1, 7); mpn_add_n((mp_limb_t *) a1_10, (mp_limb_t *) a1_10, (mp_limb_t *) a1_11 + 1, 8); mpn_add_n((mp_limb_t *) a2_0, (mp_limb_t *) a2_0, (mp_limb_t *) a2_1 + 1, 2); mpn_add_n((mp_limb_t *) a2_1, (mp_limb_t *) a2_1, (mp_limb_t *) a2_2 + 0, 3); mpn_add_n((mp_limb_t *) a2_2, (mp_limb_t *) a2_2, (mp_limb_t *) a2_3 + 1, 3); mpn_add_n((mp_limb_t *) a2_3, (mp_limb_t *) a2_3, (mp_limb_t *) a2_4 + 1, 4); mpn_add_n((mp_limb_t *) a2_4, (mp_limb_t *) a2_4, (mp_limb_t *) a2_5 + 0, 5); mpn_add_n((mp_limb_t *) a2_5, (mp_limb_t *) a2_5, (mp_limb_t *) a2_6 + 1, 5); mpn_add_n((mp_limb_t *) a2_6, (mp_limb_t *) a2_6, (mp_limb_t *) a2_7 + 0, 6); mpn_add_n((mp_limb_t *) a2_7, (mp_limb_t *) a2_7, (mp_limb_t *) a2_8 + 1, 6); mpn_add_n((mp_limb_t *) a2_8, (mp_limb_t *) a2_8, (mp_limb_t *) a2_9 + 0, 7); mpn_add_n((mp_limb_t *) a2_9, (mp_limb_t *) a2_9, (mp_limb_t *) a2_10 + 1, 7); #endif } 
while (i -= K); return n; } int main(int argc, char **argv) { uint32_t *buffer; int32_t n; long pgen = 0; #ifndef NOTIMING struct tms tbuf; double usertime = -1.0; #endif assert(sizeof(uint32_t) == 4 && (unsigned long) (uint32_t) -1 == 4294967295UL); if (argc > 1) { char *end; pgen = strtol(argv[1], &end, 10); if (*end != '\0' || pgen < 1 || pgen > 31) { fprintf(stderr, "Incorrect pgen argument\n"); exit(6); } } if (!pgen) { if (printf("[1234]\t(0:34)\n") < 0 || fflush(stdout) != 0) { fprintf(stderr, "Output error!\n"); exit(3); } } /* Allocate the buffer + a word */ if ((buffer = (uint32_t *) malloc(BUFFSIZE+4)) == NULL) { fprintf(stderr, "Insufficient memory!\n"); exit(1); } n = test(buffer, (char *) buffer + BUFFSIZE, pgen); if (pgen) return 0; #ifndef NOTIMING if (times(&tbuf) == (clock_t) -1 || (usertime = (double) tbuf.tms_utime / sysconf(_SC_CLK_TCK)) < 0.0) { fprintf(stderr, "Could not get user time!\n"); exit(7); } #endif if (n < 0) { fprintf(stderr, "Too many exceptions!\n"); exit(2); } printf("<%ld>\n", (long int) n); while (n--) { int ret; ret = printf("[%08X,%08X] [%u,%u]\n", buffer[0], buffer[1], buffer[0], buffer[1]); if (ret < 0) { fprintf(stderr, "Output error (return value = %d)!\n", ret); exit(4); } buffer += 2; } if (printf("{1234}") < 0 #ifndef NOTIMING || printf("\t%.2f", usertime) < 0 #endif || printf("\n") < 0) { fprintf(stderr, "Output error!\n"); exit(5); } return 0; } /* end of tmp/tst-p345-54-0.c */