Re: gcc 4.3 generates less efficient code than gcc 4.1 or 4.2

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2008-07-02 20:08:52 -0400, Michael Meissner wrote:
> Without having the code in a bug report, there is no way to say what
> the problem is. It is best if you can take some time to reduce the
> code to an example that shows clearly where the slowdown occurs. You
> can use normal -pg profiling, oprofile, or tools like Code
> Analyst/Vtune to identify where the hot spots are if you don't
> already know where the hot function is. The simpler you make the
> example, the more likely somebody will fix it (unless you pay
> somebody to fix it, and then presumably as part of the
> investigation, they will reduce it).

-pg/gprof didn't help since there's more or less a single function
(in fact there are several, but they are inlined).

In fact, -march solved the problem, and after all, it's not abnormal
that a later gcc version generates less efficient code if the
architecture is incorrect. Now, I thought that gcc would have used
sensible default values (I couldn't find anything in the manual about
the default architecture).

Also, is there a way to *automatically* get the best -march value for
the local machine? In my case, jobs run on various machines (they're
submitted by SGE): they generate C code, compile it locally and run
it. I don't even have direct ssh access to some of these machines.

I could instruct my jobs to try different options, do timings and
cache the best option for the next jobs that will run on the machine,
but if I can directly get the best one, that would be easier.

In case someone is interested, I've attached an example of generated C
source (the one I used for the timings). On one of the machines, it
was the "naive_method" part that was slower (with no -march). But on
another machine, it was a different part (I didn't determine which).
Note: this is an implementation of the algorithms described in my PhD
thesis 8 years ago and improved over the last few years.

-- 
Vincent Lefèvre <vincent@xxxxxxxxxx> - Web: <http://www.vinc17.org/>
100% accessible validated (X)HTML - Blog: <http://www.vinc17.org/blog/>
Work: CR INRIA - computer arithmetic / Arenaire project (LIP, ENS-Lyon)
/* tmp/tst-p345-54-0.c, generated by test32f 21738 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <limits.h>
#include <assert.h>

#ifndef NOTIMING
#include <unistd.h>
#include <sys/times.h>
#endif

/* test() asserts sizeof(mp_limb_t) * CHAR_BIT == GMP_LIMB_BITS and the
   buffer arithmetic in main() counts bytes, so 8-bit bytes are required. */
#if CHAR_BIT != 8
#error CHAR_BIT must be equal to 8
#endif

/* Size in bytes of the result area that main() allocates and passes to
   test().  test() stores two uint32_t words (8 bytes) per recorded point,
   so 8<<12 = 32768 bytes holds 4096 points. */
#define BUFFSIZE (8<<12)

#include <gmp.h>

/* Only 32-bit and 64-bit limbs have a matching set of tables and
   mpn_add_n calls in this file. */
#if GMP_LIMB_BITS != 32 && GMP_LIMB_BITS != 64
#error GMP_LIMB_BITS must be equal to 32 or 64
#endif

/* Reject GMP builds with nail bits (numb bits fewer than limb bits). */
#if defined(GMP_NUMB_BITS) && GMP_NUMB_BITS != GMP_LIMB_BITS
#error GMP_NUMB_BITS must be equal to GMP_LIMB_BITS (when defined)
#endif

/* NPTS: total number of points scanned by test() (0x10000000000 = 2^40).
   K:    points handled per iteration of test()'s outer loop
         (32768 = NSSI * LSSI).
   NSSI: number of inner blocks per K points (initial value of h in test()).
   LSSI: points per inner block (initial value of k in test()).
   LOGL: log2(LSSI); used as a shift count when test() advances a whole
         inner block at once.
   LOGSS: 4 = log2(NSSI); the code in this file only tests it for being
         non-zero.
   LOGMS: shift used by glt0() to decide when a division replaces repeated
         subtraction. */
#define NPTS (0x10000000000)
#define K 32768
#define NSSI 16
#define LSSI 2048
#define LOGL 11
#define LOGSS 4
#define LOGMS 3

/* Smaller of a and b.  Function-like macro: each argument may be evaluated
   twice, so do not pass expressions with side effects. */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))

/* Termination check for glt0(): when COND holds, return from the ENCLOSING
   function with the value (b >> 32 >= d).  Relies on variables named b and
   d being in scope at the point of use.
   R0 is the placeholder printed for "r" by the OUT0 trace macro. */
#define TESTEND0(COND)      \
  do                        \
    if (COND)               \
      return b >> 32 >= d;  \
  while (0)
#define R0 (0)

/* Variant that makes the enclosing function return -1 when COND holds; R1
   expands to a variable named r.
   NOTE(review): nothing in this file uses TESTEND1 or R1; presumably they
   belong to the glt1() routine that is referenced only inside an "#if 0"
   region of test() -- confirm before removing. */
#define TESTEND1(COND)      \
  do                        \
    if (COND)               \
      return -1; /* >= N */ \
  while (0)
#define R1 (r)

/*
 * Multi-limb coefficient tables, one family per working value of test():
 * a0_*, a1_* and a2_*.
 *
 * - test() loads the most significant 64 bits of a0_0, a1_0 and a2_0 (the
 *   highest-index elements) into its local a0, a1, a2, so limbs are stored
 *   least significant first, which is also the order mpn_add_n works on.
 * - After every block of K points test() adds table k+1 (starting at a small
 *   limb offset) into table k with mpn_add_n.  The tables are therefore
 *   modified in place at run time and must stay writable (not const).
 * - The array lengths are hard-coded in those mpn_add_n calls; do not change
 *   the number of elements of any table without updating test().
 *
 * First branch: 32-bit limbs.
 */
#if GMP_LIMB_BITS == 32

uint32_t a0_0[] = { 0x42954574, 0x86A0263D, 0x527F20F5, 0xDFF00C26 };
uint32_t a0_1[] = { 0x0DB3846A, 0x65610788, 0x6F49AF5C, 0x0C0808FA,
                    0xB0A18795 };
uint32_t a0_2[] = { 0x39904786, 0x5AFA1155, 0x026030B1, 0xD06ABF0A,
                    0x168FC8FE, 0x04E42748 };
uint32_t a0_3[] = { 0x94F5B78B, 0x68466878, 0xB9401795, 0x19D21418,
                    0xCA620C9B, 0x16C83961, 0x00000000 };
uint32_t a0_4[] = { 0x9FF5506F, 0x36E66055, 0x2B77A122, 0xE59AFB9A,
                    0x497B0499, 0x94C67171, 0x69CDDFA3, 0x00000000,
                    0x00000000 };
uint32_t a0_5[] = { 0xCFD34284, 0x6F76EE42, 0x18F7684A, 0x7BD772A4,
                    0x34956E4F, 0xB2E429D8, 0xE9EFBE77, 0x00000001,
                    0x00000000, 0x00000000 };
uint32_t a0_6[] = { 0xE9DAC7E1, 0x5AF36977, 0x0F40CD56, 0x0D292B1C,
                    0x225F252E, 0xBF34CF06, 0xD60B875E, 0x00000008,
                    0x00000000, 0x00000000, 0x00000000 };
uint32_t a0_7[] = { 0x756193CC, 0x144D7FAC, 0x6BBFC224, 0xCCEDF9C9,
                    0x1338BBAE, 0x98F2D180, 0xAD2EB7E0, 0x00000028,
                    0x00000000, 0x00000000, 0x00000000, 0x00000000 };
uint32_t a0_8[] = { 0x496229CB, 0x00FBC966, 0xFCFB2183, 0x4ED17D0A,
                    0x7AEA009E, 0x7FDD995D, 0xB306FF2B, 0x000000BA,
                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000 };
uint32_t a0_9[] = { 0x20D2B101, 0x4735D743, 0xEC9CB2E1, 0x3451D3B0,
                    0x35A887A1, 0x9DE774D6, 0x6359ADEB, 0x00000356,
                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000 };
uint32_t a0_10[] = { 0x88462A01, 0x62A08AB6, 0xEEABD553, 0xFC793408,
                     0xAE24E1A0, 0xE61B663A, 0x50CB56A0, 0x00000F3A,
                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000, 0x00000000, 0x00000000 };
uint32_t a0_11[] = { 0x25FDE8D6, 0xDEDDD071, 0x103E7414, 0xA6E8304F,
                     0xEB0BB407, 0xB0C4A447, 0x140F635C, 0xE74300F6,
                     0x00004545, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000 };
uint32_t a0_12[] = { 0x25DF8300, 0x1C452D59, 0x98EDF933, 0xE65D85FE,
                     0x6F9D7A2E, 0x115FE606, 0xD3D91040, 0x00013A30,
                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000 };

uint32_t a1_0[] = { 0x7D7C36F2, 0x6EC64873, 0xF1AA5F40, 0x19D75C5E };
uint32_t a1_1[] = { 0x702CF201, 0x7263402A, 0x862C8FBF, 0x4E901657,
                    0x000009C8 };
uint32_t a1_2[] = { 0x8BA80E38, 0xE0BB7DDA, 0x0D2FA4C0, 0x72C32AF7,
                    0x00002D90, 0x00000000 };
uint32_t a1_3[] = { 0x93B1DC07, 0x4898CCCE, 0xF85040F7, 0xBF453FA0,
                    0x0000D39B, 0x00000000, 0x00000000 };
uint32_t a1_4[] = { 0x09627E73, 0x1C4B0EF7, 0x56BE4DA3, 0x7883D681,
                    0x7CE68FCE, 0x0003D3DF, 0x00000000, 0x00000000,
                    0x00000000 };
uint32_t a1_5[] = { 0x4373E578, 0xAFA64D8A, 0x4373C426, 0x41059171,
                    0x0E94D18C, 0x0011AC17, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000 };
uint32_t a1_6[] = { 0x740AFCDF, 0xC8D21C77, 0x327B2C6C, 0x0C1C35CF,
                    0x6F068054, 0x00515A5D, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000, 0x00000000 };
uint32_t a1_7[] = { 0xD2F2CCE1, 0xB92BF1F8, 0xA75FDF8F, 0x55A975D6,
                    0xFB00A30E, 0x0175660D, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000, 0x00000000, 0x00000000 };
uint32_t a1_8[] = { 0xFF60C84D, 0xC55428BE, 0x114E4780, 0x62D04218,
                    0x4C9D0978, 0x06ACC6B3, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000 };
uint32_t a1_9[] = { 0x71A36B1B, 0xB197D062, 0x38BDBF1D, 0x6B78B857,
                    0x67FC6F80, 0x1E74A196, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000 };
uint32_t a1_10[] = { 0xA75A938C, 0xCD1B5BD0, 0x85493286, 0x5E2B70A9,
                     0xC7BDC8A7, 0x8A8BCE84, 0x00000000, 0x00000000,
                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000, 0x00000000, 0x00000000 };
uint32_t a1_11[] = { 0x5AB24BBF, 0xF266388A, 0x0BFD31DB, 0xF45DCCBB,
                     0xCC0CDF3A, 0x208022BF, 0x7461A7B2, 0x00000002,
                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000 };

uint32_t a2_0[] = { 0x671F05D0, 0xFF1EF4B7, 0x13909D1F, 0x00000000 };
uint32_t a2_1[] = { 0x15A34940, 0x82540254, 0x5B20E585, 0x00000000,
                    0x00000000 };
uint32_t a2_2[] = { 0xB3D93F19, 0xAB6A1B84, 0xA7377E86, 0x00000001,
                    0x00000000, 0x00000000 };
uint32_t a2_3[] = { 0x343BA48B, 0x73A93AD7, 0xA7BEF9BB, 0x00000007,
                    0x00000000, 0x00000000, 0x00000000 };
uint32_t a2_4[] = { 0x125F4994, 0x6C64AA9E, 0x495DC8B8, 0x582E1CD8,
                    0x00000023, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000 };
uint32_t a2_5[] = { 0x961059C9, 0xE059F4C2, 0x9D84EDC6, 0xB4BADC97,
                    0x000000A2, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000 };
uint32_t a2_6[] = { 0xAB35AAD8, 0xFF948F07, 0x8CC30069, 0xCC1BEF54,
                    0x000002EA, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000, 0x00000000 };
uint32_t a2_7[] = { 0x636B34D4, 0x03E8FCD2, 0xAE43FD2D, 0x8D667AC5,
                    0x00000D59, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000, 0x00000000, 0x00000000 };
uint32_t a2_8[] = { 0x2A55D7C3, 0x13E40612, 0x25954F24, 0x432C456E,
                    0x00003CE9, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000 };
uint32_t a2_9[] = { 0x1DEC644F, 0xFF938405, 0xD25FEB3A, 0x9D071B1E,
                    0x00011517, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000, 0x00000000, 0x00000000,
                    0x00000000, 0x00000000 };
uint32_t a2_10[] = { 0x997617FA, 0xBE75E8BB, 0x457F9819, 0x4F644100,
                     0x0004E8C3, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000, 0x00000000, 0x00000000, 0x00000000,
                     0x00000000, 0x00000000, 0x00000000 };

#else

/*
 * 64-bit limbs: the same 32-bit words packed two per limb (low word in the
 * low half).  Where the 32-bit table has an odd number of words, the lowest
 * limb carries 32 zero bits in its low half (compare a0_1 in both branches),
 * which is why the limb offsets used by test() differ between the branches.
 */
uint64_t a0_0[] = { 0x86A0263D42954574, 0xDFF00C26527F20F5 };
uint64_t a0_1[] = { 0x0DB3846A00000000, 0x6F49AF5C65610788,
                    0xB0A187950C0808FA };
uint64_t a0_2[] = { 0x5AFA115539904786, 0xD06ABF0A026030B1,
                    0x04E42748168FC8FE };
uint64_t a0_3[] = { 0x94F5B78B00000000, 0xB940179568466878,
                    0xCA620C9B19D21418, 0x0000000016C83961 };
uint64_t a0_4[] = { 0x9FF5506F00000000, 0x2B77A12236E66055,
                    0x497B0499E59AFB9A, 0x69CDDFA394C67171,
                    0x0000000000000000 };
uint64_t a0_5[] = { 0x6F76EE42CFD34284, 0x7BD772A418F7684A,
                    0xB2E429D834956E4F, 0x00000001E9EFBE77,
                    0x0000000000000000 };
uint64_t a0_6[] = { 0xE9DAC7E100000000, 0x0F40CD565AF36977,
                    0x225F252E0D292B1C, 0xD60B875EBF34CF06,
                    0x0000000000000008, 0x0000000000000000 };
uint64_t a0_7[] = { 0x144D7FAC756193CC, 0xCCEDF9C96BBFC224,
                    0x98F2D1801338BBAE, 0x00000028AD2EB7E0,
                    0x0000000000000000, 0x0000000000000000 };
uint64_t a0_8[] = { 0x496229CB00000000, 0xFCFB218300FBC966,
                    0x7AEA009E4ED17D0A, 0xB306FF2B7FDD995D,
                    0x00000000000000BA, 0x0000000000000000,
                    0x0000000000000000 };
uint64_t a0_9[] = { 0x4735D74320D2B101, 0x3451D3B0EC9CB2E1,
                    0x9DE774D635A887A1, 0x000003566359ADEB,
                    0x0000000000000000, 0x0000000000000000,
                    0x0000000000000000 };
uint64_t a0_10[] = { 0x88462A0100000000, 0xEEABD55362A08AB6,
                     0xAE24E1A0FC793408, 0x50CB56A0E61B663A,
                     0x0000000000000F3A, 0x0000000000000000,
                     0x0000000000000000, 0x0000000000000000 };
uint64_t a0_11[] = { 0x25FDE8D600000000, 0x103E7414DEDDD071,
                     0xEB0BB407A6E8304F, 0x140F635CB0C4A447,
                     0x00004545E74300F6, 0x0000000000000000,
                     0x0000000000000000, 0x0000000000000000,
                     0x0000000000000000 };
uint64_t a0_12[] = { 0x25DF830000000000, 0x98EDF9331C452D59,
                     0x6F9D7A2EE65D85FE, 0xD3D91040115FE606,
                     0x0000000000013A30, 0x0000000000000000,
                     0x0000000000000000, 0x0000000000000000,
                     0x0000000000000000 };

uint64_t a1_0[] = { 0x6EC648737D7C36F2, 0x19D75C5EF1AA5F40 };
uint64_t a1_1[] = { 0x702CF20100000000, 0x862C8FBF7263402A,
                    0x000009C84E901657 };
uint64_t a1_2[] = { 0xE0BB7DDA8BA80E38, 0x72C32AF70D2FA4C0,
                    0x0000000000002D90 };
uint64_t a1_3[] = { 0x93B1DC0700000000, 0xF85040F74898CCCE,
                    0x0000D39BBF453FA0, 0x0000000000000000 };
uint64_t a1_4[] = { 0x09627E7300000000, 0x56BE4DA31C4B0EF7,
                    0x7CE68FCE7883D681, 0x000000000003D3DF,
                    0x0000000000000000 };
uint64_t a1_5[] = { 0xAFA64D8A4373E578, 0x410591714373C426,
                    0x0011AC170E94D18C, 0x0000000000000000,
                    0x0000000000000000 };
uint64_t a1_6[] = { 0x740AFCDF00000000, 0x327B2C6CC8D21C77,
                    0x6F0680540C1C35CF, 0x0000000000515A5D,
                    0x0000000000000000, 0x0000000000000000 };
uint64_t a1_7[] = { 0xB92BF1F8D2F2CCE1, 0x55A975D6A75FDF8F,
                    0x0175660DFB00A30E, 0x0000000000000000,
                    0x0000000000000000, 0x0000000000000000 };
uint64_t a1_8[] = { 0xFF60C84D00000000, 0x114E4780C55428BE,
                    0x4C9D097862D04218, 0x0000000006ACC6B3,
                    0x0000000000000000, 0x0000000000000000,
                    0x0000000000000000 };
uint64_t a1_9[] = { 0xB197D06271A36B1B, 0x6B78B85738BDBF1D,
                    0x1E74A19667FC6F80, 0x0000000000000000,
                    0x0000000000000000, 0x0000000000000000,
                    0x0000000000000000 };
uint64_t a1_10[] = { 0xA75A938C00000000, 0x85493286CD1B5BD0,
                     0xC7BDC8A75E2B70A9, 0x000000008A8BCE84,
                     0x0000000000000000, 0x0000000000000000,
                     0x0000000000000000, 0x0000000000000000 };
uint64_t a1_11[] = { 0x5AB24BBF00000000, 0x0BFD31DBF266388A,
                     0xCC0CDF3AF45DCCBB, 0x7461A7B2208022BF,
                     0x0000000000000002, 0x0000000000000000,
                     0x0000000000000000, 0x0000000000000000,
                     0x0000000000000000 };

uint64_t a2_0[] = { 0xFF1EF4B7671F05D0, 0x0000000013909D1F };
uint64_t a2_1[] = { 0x15A3494000000000, 0x5B20E58582540254,
                    0x0000000000000000 };
uint64_t a2_2[] = { 0xAB6A1B84B3D93F19, 0x00000001A7377E86,
                    0x0000000000000000 };
uint64_t a2_3[] = { 0x343BA48B00000000, 0xA7BEF9BB73A93AD7,
                    0x0000000000000007, 0x0000000000000000 };
uint64_t a2_4[] = { 0x125F499400000000, 0x495DC8B86C64AA9E,
                    0x00000023582E1CD8, 0x0000000000000000,
                    0x0000000000000000 };
uint64_t a2_5[] = { 0xE059F4C2961059C9, 0xB4BADC979D84EDC6,
                    0x00000000000000A2, 0x0000000000000000,
                    0x0000000000000000 };
uint64_t a2_6[] = { 0xAB35AAD800000000, 0x8CC30069FF948F07,
                    0x000002EACC1BEF54, 0x0000000000000000,
                    0x0000000000000000, 0x0000000000000000 };
uint64_t a2_7[] = { 0x03E8FCD2636B34D4, 0x8D667AC5AE43FD2D,
                    0x0000000000000D59, 0x0000000000000000,
                    0x0000000000000000, 0x0000000000000000 };
uint64_t a2_8[] = { 0x2A55D7C300000000, 0x25954F2413E40612,
                    0x00003CE9432C456E, 0x0000000000000000,
                    0x0000000000000000, 0x0000000000000000,
                    0x0000000000000000 };
uint64_t a2_9[] = { 0xFF9384051DEC644F, 0x9D071B1ED25FEB3A,
                    0x0000000000011517, 0x0000000000000000,
                    0x0000000000000000, 0x0000000000000000,
                    0x0000000000000000 };
uint64_t a2_10[] = { 0x997617FA00000000, 0x457F9819BE75E8BB,
                     0x0004E8C34F644100, 0x0000000000000000,
                     0x0000000000000000, 0x0000000000000000,
                     0x0000000000000000, 0x0000000000000000 };

#endif

/* Debug trace for glt0().  Change "#if 0" to "#if 1" to print, on stderr,
   the tag T followed by the glt0() locals x, u, y, v, b and the R0
   placeholder after each step.  The macro reads those names directly from
   the scope where it is used.  When disabled it expands to a no-op
   expression, so the OUT0(...) statements cost nothing. */
#if 0
#include <inttypes.h>
#define OUT0(T) fprintf(stderr, "%2d" \
  "  x = %016" PRIX64 "  u = %-6" PRIuFAST32 \
  "  y = %016" PRIX64 "  v = %-6" PRIuFAST32 \
  "  b = %016" PRIX64 "  r = %-6" PRIuFAST32 \
  "\n", T, x, u, y, v, b, R0)
#else
#define OUT0(T) ((void) 0)
#endif

/*
 * "Global line test" (the name used at the call sites in test()).
 *
 * Parameters, as used by the code below:
 *   x  64-bit step value; y starts as its unsigned negation (-x mod 2^64).
 *   b  64-bit offset, compared against x and reduced as the loop runs.
 *   d  threshold compared against the upper 32 bits of b.
 *   N  bound on the counters: the loop stops as soon as u + v, or one of
 *      the checked quotients q, reaches N.
 *
 * Returns 1 if (b >> 32) >= d when the loop stops, 0 otherwise; 0 is also
 * returned early from the "b >= x" branch when (b >> 32) < d.  test() skips
 * its point-by-point scan of an interval when the result is non-zero.
 *
 * The loop is a subtractive, Euclid-style reduction of the pair (x, y) with
 * companion counters (v, u).  Where one value is at least 2^LOGMS times the
 * other, a single division replaces the run of subtractions.  Every return
 * inside the loop comes from the TESTEND0 macro or the explicit "return 0".
 *
 * NOTE(review): presumably the algorithm from the author's thesis mentioned
 * in the accompanying mail; the result depends on the exact statement order,
 * so confirm against that reference before restructuring anything here.
 */
static inline int
glt0(uint64_t x, uint64_t b, uint_fast32_t d, uint_fast32_t N)
{
  uint64_t y = - x;  /* unsigned negation: 2^64 - x, modulo 2^64 */
  uint_fast32_t u = 1, v = 1;

  OUT0(0);  /* OUT0 is a debug trace; it expands to nothing unless enabled */
  for (;;)
    {
      if (b < x)
        {
          /* "LOGMS < 64" is a compile-time guard that keeps the shift count
             valid; it is always true for the LOGMS defined in this file. */
          if (LOGMS < 64 && ((x - b) >> LOGMS) >= y)
            {
              /* x - b is much larger than y: remove q copies of y from x
                 in one step instead of looping. */
              uint64_t q = (x - b) / y - 1;
              TESTEND0(q >= N);
              x -= (unsigned int) q * y;
              v += (unsigned int) q * u;
              OUT0(10);
            }
          if (LOGMS == 0 || (LOGMS < 64 && (y >> LOGMS) > x))
            {
              /* y is much larger than x: reduce y by division. */
              uint64_t q = y / x;
              TESTEND0(q >= N);
              y -= (unsigned int) q * x;
              u += (unsigned int) q * v;
              OUT0(11);
            }
          else
            /* Otherwise reduce y by repeated subtraction of x. */
            while (x < y)
              {
                TESTEND0(u + v >= N);
                y -= x;
                u += v;
                OUT0(12);
              }
          TESTEND0(u + v >= N);
          x -= y;
          v += u;
          OUT0(13);
        }
      else
        {
          if (LOGMS < 64)
            {
              uint64_t diff;
              /* Early exit: the upper 32 bits of b are already below d. */
              if (b >> 32 < d)
                return 0;
              diff = b - ((uint64_t) d << 32);
              if (diff > ((uint64_t) d << 32))
                {
                  diff -= (uint64_t) d << 32;
                  if ((MIN(y, diff) >> LOGMS) >= x)
                    {
                      /* Both y and the remaining margin are much larger
                         than x: take q steps of size x at once.
                         NOTE(review): unlike the other division shortcuts,
                         q is not checked against N before being truncated
                         to unsigned int -- confirm this is intended. */
                      uint64_t q = MIN(y, diff) / x - 1;
                      b -= (unsigned int) q * x;
                      y -= (unsigned int) q * x;
                      u += (unsigned int) q * v;
                      OUT0(20);
                    }
                }
            }
          b -= x;
          OUT0(21);
          if (LOGMS == 0 || (LOGMS < 64 && (x >> LOGMS) > y))
            {
              /* x is much larger than y: reduce x by division. */
              uint64_t q = x / y;
              TESTEND0(q >= N);
              x -= (unsigned int) q * y;
              v += (unsigned int) q * u;
              OUT0(22);
            }
          else
            /* Otherwise reduce x by repeated subtraction of y. */
            while (y < x)
              {
                TESTEND0(u + v >= N);
                x -= y;
                v += u;
                OUT0(23);
              }
          TESTEND0(u + v >= N);
          y -= x;
          u += v;
          OUT0(24);
        }
    }
}

/*
 * Scan the points in blocks of K and record every point at which the
 * working value a0 has its upper 32 bits equal to zero (a0 >> 32 < 1).
 *
 *   buffer   output area; each recorded point takes two uint32_t words:
 *            the low 32 bits of the point index j, then its high 32 bits.
 *   endbuff  end of the output area; the function fails when buffer
 *            reaches it and another point has to be stored.
 *   pgen     when non-zero and NPTS >> pgen > K, the number of points
 *            scanned is reduced from NPTS to K << pgen.
 *
 * Returns the number of recorded points ("exceptions"), or -1 if the buffer
 * is full or the count has reached INT_MAX.
 *
 * Per block of K points:
 *   1. a0, a1, a2 are loaded from the most significant 64 bits of the
 *      tables a0_0, a1_0, a2_0.  Within the block the values are stepped
 *      with "a0 += a1; a1 += a2", one step per point.
 *   2. If the upper 32 bits of a2 are all zeros or all ones, glt0() is tried
 *      on the whole block; a non-zero result skips the block.
 *   3. Otherwise the block is processed as NSSI inner blocks of LSSI points.
 *      Each inner block is either skipped in closed form (glt0() non-zero)
 *      or scanned point by point ("naive_method").
 *   4. The tables are advanced for the next block: every table k gets table
 *      k+1 added to it with mpn_add_n, starting at a per-table limb offset.
 *
 * Side effect: the global a0_*, a1_*, a2_* tables are modified in place, so
 * a second call continues from where the first one stopped.
 */
int32_t test(uint32_t *buffer, char *endbuff, long pgen)
{
  uint64_t a0, a1, a2;
  uint64_t i = NPTS;  /* number of points */
  int_fast32_t n = 0;  /* number of exceptions */

  if (pgen && i >> pgen > K)
    i = (uint64_t) K << pgen;

  /* Run-time sanity checks on the integer and GMP limb representations. */
  assert(sizeof(uint64_t) == 8 && (uint64_t) -1 > 0);
  assert(sizeof(int_fast32_t) >= 4);
  assert(sizeof(mp_limb_t) * CHAR_BIT == GMP_LIMB_BITS);
  assert(mp_bits_per_limb == GMP_LIMB_BITS);

  do
    {
      uint_fast32_t h = NSSI;  /* inner blocks left in this block of K */

      /* Coefficient initialization (degree 2) */
#if GMP_LIMB_BITS == 32
      a0 = ((uint64_t) a0_0[4-1] << 32) + a0_0[4-2];
      a1 = ((uint64_t) a1_0[4-1] << 32) + a1_0[4-2];
      a2 = ((uint64_t) a2_0[4-1] << 32) + a2_0[4-2];
#else
      a0 = a0_0[2-1];
      a1 = a1_0[2-1];
      a2 = a2_0[2-1];
#endif

      /* Global line test on subinterval */
      {
        uint64_t b;
        uint_fast32_t d;
#if 0
        uint_fast32_t r, s;
#endif

        /* The test is only attempted when the upper 32 bits of a2 are all
           zeros or all ones; b and d are set up differently in each case. */
        if ((uint32_t) (a2 >> 32) == 0)
          {
            b = (uint64_t) 1 << 32;
            d = (uint_fast32_t) 1 + 1 + ((uint32_t) a2 >> 3);
          }
        else if ((uint32_t) (a2 >> 32) == (uint32_t) -1)
          {
            d = (uint_fast32_t) 1 + 1 + ((- (uint32_t) a2) >> 3);
            b = (uint64_t) d << 32;
          }
        else
          goto subint;

        /* Non-zero result: nothing is recorded for this block of K points. */
        if (glt0(a1, b - a0, d, K))
          goto next;

        /* Disabled alternative path; it calls glt1(), which is not defined
           in this file. */
#if 0
        for (s = K; (r = glt1(a1, b - a0, d, s)) < s; )
          {
            a0 += r * a1 + (r*(r-1)/2) * a2;
            a1 += r * a2;

            if (a0 >> 32 < 1)
              {
                uint64_t j;
                if ((void *) buffer == (void *) endbuff || n == INT_MAX)
                  return -1;
                n++;
                j = (NPTS - i) + (K - (s - r));
                buffer[0] = (uint32_t) j;
                buffer[1] = j >> 32;
                buffer += 2;
              }

            s -= r + 1;
            if (s == 0)
              break;

            a0 += a1;
            a1 += a2;
          }
        goto next;
#endif
      }

    subint:
      do
        {
          int_fast32_t k = LSSI;  /* points left in this inner block */

          if (LOGSS)
            {
              /* Global line test on subsubinterval */
              if ((uint32_t) (a2 >> 32) == 0)
                {
                  if (!glt0(a1, ((uint64_t) 1 << 32) - a0,
                            (uint_fast32_t) 1 + 1 + ((uint32_t) a2 >> 11),
                            LSSI))
                    goto naive_method;
                }
              else if ((uint32_t) (a2 >> 32) == (uint32_t) -1)
                {
                  uint_fast32_t err;
                  err = (uint_fast32_t) 1 + 1 + ((- (uint32_t) a2) >> 11);
                  if (!glt0(a1, ((uint64_t) err << 32) - a0, err, LSSI))
                    goto naive_method;
                }
              else
                goto naive_method;

              /* Test passed: advance a0 and a1 by L = 2^LOGL single steps
                 in closed form, i.e. a0 += L*a1 + (L*(L-1)/2)*a2 and
                 a1 += L*a2.  The first line must use the old a1. */
              a0 += a1 << LOGL;
              a1 += a2 << LOGL;
              a0 += (a2 << (2 * LOGL - 1)) - (a2 << (LOGL - 1));
              continue;  /* jumps to the "LOGSS && --h" loop condition */
            }

        naive_method:
          /* Point-by-point scan of the LSSI points of this inner block. */
          do
            {
              if (a0 >> 32 < 1)
                {
                  uint64_t j;
                  if ((void *) buffer == (void *) endbuff || n == INT_MAX)
                    return -1;
                  n++;
                  /* Index of the point: offset of the current block of K,
                     plus offset of the inner block, plus position in it. */
                  j = (NPTS - i) + (LOGSS ? ((NSSI - h) << LOGL) : 0)
                    + (LSSI - k);
                  buffer[0] = (uint32_t) j;
                  buffer[1] = j >> 32;
                  buffer += 2;
                }
              a0 += a1;
              a1 += a2;
            }
          while (--k);
        }
      while (LOGSS && --h);

    next:
      /* Advance the coefficient tables to the next block of K points:
         table k += table k+1, read from the given limb offset.  The order
         matters (each table must be updated before the next one changes),
         and the limb counts are the lengths of the destination tables.
         The carry returned by mpn_add_n is ignored. */
#if GMP_LIMB_BITS == 32
      mpn_add_n((mp_limb_t *) a0_0, (mp_limb_t *) a0_0,
                (mp_limb_t *) a0_1 + 1, 4);
      mpn_add_n((mp_limb_t *) a0_1, (mp_limb_t *) a0_1,
                (mp_limb_t *) a0_2 + 1, 5);
      mpn_add_n((mp_limb_t *) a0_2, (mp_limb_t *) a0_2,
                (mp_limb_t *) a0_3 + 1, 6);
      mpn_add_n((mp_limb_t *) a0_3, (mp_limb_t *) a0_3,
                (mp_limb_t *) a0_4 + 2, 7);
      mpn_add_n((mp_limb_t *) a0_4, (mp_limb_t *) a0_4,
                (mp_limb_t *) a0_5 + 1, 9);
      mpn_add_n((mp_limb_t *) a0_5, (mp_limb_t *) a0_5,
                (mp_limb_t *) a0_6 + 1, 10);
      mpn_add_n((mp_limb_t *) a0_6, (mp_limb_t *) a0_6,
                (mp_limb_t *) a0_7 + 1, 11);
      mpn_add_n((mp_limb_t *) a0_7, (mp_limb_t *) a0_7,
                (mp_limb_t *) a0_8 + 1, 12);
      mpn_add_n((mp_limb_t *) a0_8, (mp_limb_t *) a0_8,
                (mp_limb_t *) a0_9 + 1, 13);
      mpn_add_n((mp_limb_t *) a0_9, (mp_limb_t *) a0_9,
                (mp_limb_t *) a0_10 + 1, 14);
      mpn_add_n((mp_limb_t *) a0_10, (mp_limb_t *) a0_10,
                (mp_limb_t *) a0_11 + 2, 15);
      mpn_add_n((mp_limb_t *) a0_11, (mp_limb_t *) a0_11,
                (mp_limb_t *) a0_12 + 0, 17);
      mpn_add_n((mp_limb_t *) a1_0, (mp_limb_t *) a1_0,
                (mp_limb_t *) a1_1 + 1, 4);
      mpn_add_n((mp_limb_t *) a1_1, (mp_limb_t *) a1_1,
                (mp_limb_t *) a1_2 + 1, 5);
      mpn_add_n((mp_limb_t *) a1_2, (mp_limb_t *) a1_2,
                (mp_limb_t *) a1_3 + 1, 6);
      mpn_add_n((mp_limb_t *) a1_3, (mp_limb_t *) a1_3,
                (mp_limb_t *) a1_4 + 2, 7);
      mpn_add_n((mp_limb_t *) a1_4, (mp_limb_t *) a1_4,
                (mp_limb_t *) a1_5 + 1, 9);
      mpn_add_n((mp_limb_t *) a1_5, (mp_limb_t *) a1_5,
                (mp_limb_t *) a1_6 + 1, 10);
      mpn_add_n((mp_limb_t *) a1_6, (mp_limb_t *) a1_6,
                (mp_limb_t *) a1_7 + 1, 11);
      mpn_add_n((mp_limb_t *) a1_7, (mp_limb_t *) a1_7,
                (mp_limb_t *) a1_8 + 1, 12);
      mpn_add_n((mp_limb_t *) a1_8, (mp_limb_t *) a1_8,
                (mp_limb_t *) a1_9 + 1, 13);
      mpn_add_n((mp_limb_t *) a1_9, (mp_limb_t *) a1_9,
                (mp_limb_t *) a1_10 + 1, 14);
      mpn_add_n((mp_limb_t *) a1_10, (mp_limb_t *) a1_10,
                (mp_limb_t *) a1_11 + 2, 15);
      mpn_add_n((mp_limb_t *) a2_0, (mp_limb_t *) a2_0,
                (mp_limb_t *) a2_1 + 1, 4);
      mpn_add_n((mp_limb_t *) a2_1, (mp_limb_t *) a2_1,
                (mp_limb_t *) a2_2 + 1, 5);
      mpn_add_n((mp_limb_t *) a2_2, (mp_limb_t *) a2_2,
                (mp_limb_t *) a2_3 + 1, 6);
      mpn_add_n((mp_limb_t *) a2_3, (mp_limb_t *) a2_3,
                (mp_limb_t *) a2_4 + 2, 7);
      mpn_add_n((mp_limb_t *) a2_4, (mp_limb_t *) a2_4,
                (mp_limb_t *) a2_5 + 1, 9);
      mpn_add_n((mp_limb_t *) a2_5, (mp_limb_t *) a2_5,
                (mp_limb_t *) a2_6 + 1, 10);
      mpn_add_n((mp_limb_t *) a2_6, (mp_limb_t *) a2_6,
                (mp_limb_t *) a2_7 + 1, 11);
      mpn_add_n((mp_limb_t *) a2_7, (mp_limb_t *) a2_7,
                (mp_limb_t *) a2_8 + 1, 12);
      mpn_add_n((mp_limb_t *) a2_8, (mp_limb_t *) a2_8,
                (mp_limb_t *) a2_9 + 1, 13);
      mpn_add_n((mp_limb_t *) a2_9, (mp_limb_t *) a2_9,
                (mp_limb_t *) a2_10 + 1, 14);
#else
      mpn_add_n((mp_limb_t *) a0_0, (mp_limb_t *) a0_0,
                (mp_limb_t *) a0_1 + 1, 2);
      mpn_add_n((mp_limb_t *) a0_1, (mp_limb_t *) a0_1,
                (mp_limb_t *) a0_2 + 0, 3);
      mpn_add_n((mp_limb_t *) a0_2, (mp_limb_t *) a0_2,
                (mp_limb_t *) a0_3 + 1, 3);
      mpn_add_n((mp_limb_t *) a0_3, (mp_limb_t *) a0_3,
                (mp_limb_t *) a0_4 + 1, 4);
      mpn_add_n((mp_limb_t *) a0_4, (mp_limb_t *) a0_4,
                (mp_limb_t *) a0_5 + 0, 5);
      mpn_add_n((mp_limb_t *) a0_5, (mp_limb_t *) a0_5,
                (mp_limb_t *) a0_6 + 1, 5);
      mpn_add_n((mp_limb_t *) a0_6, (mp_limb_t *) a0_6,
                (mp_limb_t *) a0_7 + 0, 6);
      mpn_add_n((mp_limb_t *) a0_7, (mp_limb_t *) a0_7,
                (mp_limb_t *) a0_8 + 1, 6);
      mpn_add_n((mp_limb_t *) a0_8, (mp_limb_t *) a0_8,
                (mp_limb_t *) a0_9 + 0, 7);
      mpn_add_n((mp_limb_t *) a0_9, (mp_limb_t *) a0_9,
                (mp_limb_t *) a0_10 + 1, 7);
      mpn_add_n((mp_limb_t *) a0_10, (mp_limb_t *) a0_10,
                (mp_limb_t *) a0_11 + 1, 8);
      mpn_add_n((mp_limb_t *) a0_11, (mp_limb_t *) a0_11,
                (mp_limb_t *) a0_12 + 0, 9);
      mpn_add_n((mp_limb_t *) a1_0, (mp_limb_t *) a1_0,
                (mp_limb_t *) a1_1 + 1, 2);
      mpn_add_n((mp_limb_t *) a1_1, (mp_limb_t *) a1_1,
                (mp_limb_t *) a1_2 + 0, 3);
      mpn_add_n((mp_limb_t *) a1_2, (mp_limb_t *) a1_2,
                (mp_limb_t *) a1_3 + 1, 3);
      mpn_add_n((mp_limb_t *) a1_3, (mp_limb_t *) a1_3,
                (mp_limb_t *) a1_4 + 1, 4);
      mpn_add_n((mp_limb_t *) a1_4, (mp_limb_t *) a1_4,
                (mp_limb_t *) a1_5 + 0, 5);
      mpn_add_n((mp_limb_t *) a1_5, (mp_limb_t *) a1_5,
                (mp_limb_t *) a1_6 + 1, 5);
      mpn_add_n((mp_limb_t *) a1_6, (mp_limb_t *) a1_6,
                (mp_limb_t *) a1_7 + 0, 6);
      mpn_add_n((mp_limb_t *) a1_7, (mp_limb_t *) a1_7,
                (mp_limb_t *) a1_8 + 1, 6);
      mpn_add_n((mp_limb_t *) a1_8, (mp_limb_t *) a1_8,
                (mp_limb_t *) a1_9 + 0, 7);
      mpn_add_n((mp_limb_t *) a1_9, (mp_limb_t *) a1_9,
                (mp_limb_t *) a1_10 + 1, 7);
      mpn_add_n((mp_limb_t *) a1_10, (mp_limb_t *) a1_10,
                (mp_limb_t *) a1_11 + 1, 8);
      mpn_add_n((mp_limb_t *) a2_0, (mp_limb_t *) a2_0,
                (mp_limb_t *) a2_1 + 1, 2);
      mpn_add_n((mp_limb_t *) a2_1, (mp_limb_t *) a2_1,
                (mp_limb_t *) a2_2 + 0, 3);
      mpn_add_n((mp_limb_t *) a2_2, (mp_limb_t *) a2_2,
                (mp_limb_t *) a2_3 + 1, 3);
      mpn_add_n((mp_limb_t *) a2_3, (mp_limb_t *) a2_3,
                (mp_limb_t *) a2_4 + 1, 4);
      mpn_add_n((mp_limb_t *) a2_4, (mp_limb_t *) a2_4,
                (mp_limb_t *) a2_5 + 0, 5);
      mpn_add_n((mp_limb_t *) a2_5, (mp_limb_t *) a2_5,
                (mp_limb_t *) a2_6 + 1, 5);
      mpn_add_n((mp_limb_t *) a2_6, (mp_limb_t *) a2_6,
                (mp_limb_t *) a2_7 + 0, 6);
      mpn_add_n((mp_limb_t *) a2_7, (mp_limb_t *) a2_7,
                (mp_limb_t *) a2_8 + 1, 6);
      mpn_add_n((mp_limb_t *) a2_8, (mp_limb_t *) a2_8,
                (mp_limb_t *) a2_9 + 0, 7);
      mpn_add_n((mp_limb_t *) a2_9, (mp_limb_t *) a2_9,
                (mp_limb_t *) a2_10 + 1, 7);
#endif
    }
  while (i -= K);  /* stops when the remaining point count reaches zero */

  return n;
}

/*
 * Program entry point.
 *
 * Usage: prog [pgen]
 *   pgen  optional decimal integer in 1..31.  When given, test() is run on
 *         the reduced number of points it derives from pgen and nothing is
 *         printed on stdout.
 *
 * Without an argument the program prints a "[1234]" header line, runs
 * test(), then prints the number of recorded points as "<n>", one
 * "[LOW,HIGH]\t[low,high]" line per point (hexadecimal, then decimal), and
 * a "{1234}" footer followed, unless NOTIMING is defined, by the user CPU
 * time in seconds.
 *
 * Exit status: 0 on success, 1 out of memory, 2 too many points for the
 * buffer, 3/4/5 output error (header / result lines / footer), 6 bad
 * argument, 7 user time unavailable.
 */
int main(int argc, char **argv)
{
  uint32_t *buffer;  /* start of the result area; kept so it can be freed */
  uint32_t *entry;   /* read cursor over the recorded points */
  int32_t n;
  long pgen = 0;
  int ret;
#ifndef NOTIMING
  struct tms tbuf;
  double usertime = -1.0;
#endif

  assert(sizeof(uint32_t) == 4
      && (unsigned long) (uint32_t) -1 == 4294967295UL);

  if (argc > 1)
    {
      char *end;
      pgen = strtol(argv[1], &end, 10);
      /* An empty or non-numeric argument yields 0 and an out-of-range one
         saturates, so the range check below rejects all of them. */
      if (*end != '\0' || pgen < 1 || pgen > 31)
        {
          fprintf(stderr, "Incorrect pgen argument\n");
          exit(6);
        }
    }

  if (!pgen)
    {
      /* Flush the header right away so that it is visible before the
         (long) computation starts. */
      if (printf("[1234]\t(0:34)\n") < 0 ||
          fflush(stdout) != 0)
        { fprintf(stderr, "Output error!\n"); exit(3); }
    }

  /* Allocate the buffer + a word */
  buffer = malloc(BUFFSIZE+4);
  if (buffer == NULL)
    { fprintf(stderr, "Insufficient memory!\n"); exit(1); }

  n = test(buffer, (char *) buffer + BUFFSIZE, pgen);

  if (pgen)
    {
      free(buffer);
      return 0;
    }

#ifndef NOTIMING
  if (times(&tbuf) == (clock_t) -1 ||
      (usertime = (double) tbuf.tms_utime / sysconf(_SC_CLK_TCK)) < 0.0)
    {
      fprintf(stderr, "Could not get user time!\n");
      exit(7);
    }
#endif
  if (n < 0)
    { fprintf(stderr, "Too many exceptions!\n"); exit(2); }

  /* The count line is part of the result listing, so a failure here is
     reported like a failure on one of the result lines. */
  ret = printf("<%ld>\n", (long int) n);
  if (ret < 0)
    {
      fprintf(stderr, "Output error (return value = %d)!\n", ret);
      exit(4);
    }

  for (entry = buffer; n > 0; n--, entry += 2)
    {
      /* uint32_t is not guaranteed to be unsigned int, so convert to
         unsigned long and use the matching conversion specifiers; the
         printed text is unchanged. */
      ret = printf("[%08lX,%08lX]\t[%lu,%lu]\n",
                   (unsigned long) entry[0], (unsigned long) entry[1],
                   (unsigned long) entry[0], (unsigned long) entry[1]);
      if (ret < 0)
        {
          fprintf(stderr, "Output error (return value = %d)!\n", ret);
          exit(4);
        }
    }

  if (printf("{1234}") < 0
#ifndef NOTIMING
   || printf("\t%.2f", usertime) < 0
#endif
   || printf("\n") < 0)
    { fprintf(stderr, "Output error!\n"); exit(5); }

  free(buffer);
  return 0;
}

/* end of tmp/tst-p345-54-0.c */

[Index of Archives]     [Linux C Programming]     [Linux Kernel]     [eCos]     [Fedora Development]     [Fedora Announce]     [Autoconf]     [The DWARVES Debugging Tools]     [Yosemite Campsites]     [Yosemite News]     [Linux GCC]

  Powered by Linux