Re: Unexpected interval comparison

Kyotaro HORIGUCHI <horiguchi.kyotaro@xxxxxxxxxxxxx> · Tue, 04 Apr 2017 17:15:03 +0900 (Tokyo Standard Time)

Thank you for the comment.

At Mon, 03 Apr 2017 11:35:25 -0400, Tom Lane <tgl@xxxxxxxxxxxxx> wrote in <23053.1491233725@xxxxxxxxxxxxx>
> Kyotaro HORIGUCHI <horiguchi.kyotaro@xxxxxxxxxxxxx> writes:
> > Ok, the attached patch changes the result type of
> > interval_cmp_value from TimeOffset(=int64) to new 128 bit
> > LinearInterval. The value is hidden under the functions
> > interval_eq/ge.../cmp and all other stuff seems to use the
> > functions.
> 
> Looking at this now ... why isn't the INT64_AU32 macro just
> 
> #define INT64_AU32(i64) ((i64) >> 32)
> 
> ?  The business with subtracting and re-adding 1 seems unnecessary, and it
> also creates a risk of overflow with the minimum possible int64 value.

It is equivalent to "i64 / (1<<32)" except for -INT64_MAX.

INT64_AU32 gives the value for the first term in the following
polynomial.

(int64)INT64_AU32(i64) * (2^32) + (int64)INT64_AL32(i64) = i64

The previous expression intended to avoid decimal arithmetic, but
gcc optimizes the simple division better (using cmovns-add-sar)
than the current INT64_AU32 (jmp-sar) so I changed it. This
doesn't suffer overflow.

-#define INT64_AU32(i64) (((i64) < 0 ? (((i64) - 1) >> 32) + 1: ((i64) >> 32)))
+#define INT64_AU32(i64) ((i64) / (1LL<<32))

In summation of terms in 128bit multiplication expression, I
noticed that the value of the second term's lower 32bit loses MSB
for certain cases. I changed LINEARINTERVAL_ADD_INT64 to accept
the MSB (as the 65th bit) separately.

The first attached is the revised patch and the second is
temporary sanity check code for non-128bit environment code. (but
works only on 128 bit environment)

regards,

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From d79fb456ed551fcaedea3eb3420fd4d8e56bd48c Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@xxxxxxxxxxxxx>
Date: Mon, 3 Apr 2017 19:41:09 +0900
Subject: [PATCH 1/2] Fix overflow during interval comparison.

The values in interval are compared by TimeOffset results of
interval_cmp_value but it is so narrow that overflows quite easily.
This patch widen the output of the function to 128 bit. For platforms
without 128 bit arithmetic, a pair of 64 bit intergers is used
instead.
---
 src/backend/utils/adt/timestamp.c      | 127 ++++++++++++++++++++++++++++++---
 src/include/datatype/timestamp.h       |  13 ++++
 src/test/regress/expected/interval.out |  36 ++++++++++
 src/test/regress/sql/interval.sql      |  17 +++++
 4 files changed, 184 insertions(+), 9 deletions(-)

diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c
index 4be1999..2411bd7 100644
--- a/src/backend/utils/adt/timestamp.c
+++ b/src/backend/utils/adt/timestamp.c
@@ -2289,25 +2289,122 @@ timestamptz_cmp_timestamp(PG_FUNCTION_ARGS)
 /*
  *		interval_relop	- is interval1 relop interval2
  */
-static inline TimeOffset
+#ifdef HAVE_INT128
+static inline LinearInterval
 interval_cmp_value(const Interval *interval)
 {
-	TimeOffset	span;
+	LinearInterval	span;
 
-	span = interval->time;
-	span += interval->month * INT64CONST(30) * USECS_PER_DAY;
-	span += interval->day * INT64CONST(24) * USECS_PER_HOUR;
+	span = (int128)interval->time;
+	span += (int128)interval->month * INT64CONST(30) * USECS_PER_DAY;
+	span += (int128)interval->day * INT64CONST(24) * USECS_PER_HOUR;
 
 	return span;
 }
+#else
+/*
+ * arithmetic 32 bit extraction from int64
+ *
+ * INT64_AU32 extracts significant 32 bit of int64 as a int64, and INT64_AL32
+ * extracts non-siginificant 32 bit as a int64. Both macros extends sign bits
+ * according to the given value. The values of these macros and the parameter
+ * value are in the following relationship.
+ *
+ * i64 = (int64)INT64_AU32(i64) * (2^32) + (int64)INT64_AL32(i64)
+ */
+#define INT64_AU32(i64) ((i64) / (1LL<<32))
+#define INT64_AL32(i64) (((i64) & 0xffffffff) | ((i64) < 0 ? 0xffffffff00000000 : 0))
+
+/*
+ * Adds signed 65 bit integer into LinearInterval variable. If s is not zero,
+ * its sign is used as v's sign.
+ */
+#define LINEARINTERVAL_ADD_INT65(li, v, s) \
+{ \
+	uint64 t = (uint64)(v); \
+	uint64 p = (li).lo;	\
+	(li).lo += t; \
+	if (s < 0 || (s == 0 && v < 0))	\
+		(li).hi --; \
+	if ((li).lo < p) \
+		(li).hi ++; \
+}
+
+static inline LinearInterval
+interval_cmp_value(const Interval *interval)
+{
+	LinearInterval	span = {0, 0};
+	int64	dayfraction;
+	int64	days, tmp;
+
+	/* days cannot overflow here */
+	dayfraction = interval->time % USECS_PER_DAY;
+	days = interval->time / USECS_PER_DAY;
+	days += interval->month * INT64CONST(30);
+	days += interval->day;
+
+	/* we assume arithmetic shift here */
+	Assert(-1 >> 1 == -1);
+
+	/*
+	 * perform 128 bit multiplication using 64 bit variables.
+	 *
+	 *   x * y = ((x.hi << 32) + x.lo) * (((y.hi << 32) + y.lo)
+	 *         = (x.hi * y.hi) << 64 +
+	 *           ((x.hi * y.lo) + (x.lo * y.hi)) << 32 +
+	 *           x.lo * y.lo
+	 */
+
+	/* We don't bother calculation results in zero */
+	if (days != 0)
+	{
+		/*
+		 * Here, the possible maximum number of days is 0xf_865ce7d8 and the
+		 * constant USECS_PER_DAY is 0x14_1dd76000. So the all term in the
+		 * formula can be calculated safely in 64 bit arithmetic.
+		 */
+		int64 days_u32 = INT64_AU32(days);
+		int64 days_l32 = INT64_AL32(days);
+
+		/* the first term */
+		span.hi = days_u32 * (USECS_PER_DAY >> 32);
+
+		/* the second term */
+		tmp = days_l32 * (USECS_PER_DAY >> 32)
+			+ days_u32 * (USECS_PER_DAY & 0xffffffff);
+		span.hi += INT64_AU32(tmp);
+
+		/* this shift may push out MSB. supply it explicitly */
+		LINEARINTERVAL_ADD_INT65(span, INT64_AL32(tmp) << 32, tmp);
+
+		/* the third term */
+		tmp = days_l32 * (USECS_PER_DAY & 0xffffffff);
+		LINEARINTERVAL_ADD_INT65(span, tmp, 0);
+	}
+
+	/* add dayfraction */
+	LINEARINTERVAL_ADD_INT65(span, dayfraction, 0);
+
+	return span;
+}
+#endif
 
 static int
 interval_cmp_internal(Interval *interval1, Interval *interval2)
 {
-	TimeOffset	span1 = interval_cmp_value(interval1);
-	TimeOffset	span2 = interval_cmp_value(interval2);
+	LinearInterval	span1 = interval_cmp_value(interval1);
+	LinearInterval	span2 = interval_cmp_value(interval2);
 
+#ifdef HAVE_INT128
 	return ((span1 < span2) ? -1 : (span1 > span2) ? 1 : 0);
+#else
+	if (span1.hi < span2.hi)
+		return -1;
+	if (span1.hi > span2.hi)
+		return 1;
+	return (span1.lo < span2.lo ? -1 :
+			(span1.lo > span2.lo ? 1 : 0));
+#endif
 }
 
 Datum
@@ -2384,9 +2481,21 @@ Datum
 interval_hash(PG_FUNCTION_ARGS)
 {
 	Interval   *interval = PG_GETARG_INTERVAL_P(0);
-	TimeOffset	span = interval_cmp_value(interval);
+	LinearInterval	span = interval_cmp_value(interval);
+	int64 span64;
+
+	/*
+	 * taking non-siginificant 64 bits for hashing. it is not necessary to
+	 * fold the significant half of the bits into the rest for this
+	 * usage. Anyway it is zero for most cases.
+	 */
+#ifdef HAVE_INT128
+	span64 = span & 0xffffffffffffffff;
+#else
+	span64 = span.lo;
+#endif
 
-	return DirectFunctionCall1(hashint8, Int64GetDatumFast(span));
+	return DirectFunctionCall1(hashint8, Int64GetDatumFast(span64));
 }
 
 /* overlaps_timestamp() --- implements the SQL OVERLAPS operator.
diff --git a/src/include/datatype/timestamp.h b/src/include/datatype/timestamp.h
index 16e1b4c..fd24987 100644
--- a/src/include/datatype/timestamp.h
+++ b/src/include/datatype/timestamp.h
@@ -48,6 +48,19 @@ typedef struct
 	int32		month;			/* months and years, after time for alignment */
 } Interval;
 
+#ifdef HAVE_INT128
+typedef int128	LinearInterval;
+#else
+/*
+ * LinearInterval's alternative defeinition for the environments without
+ * int128 arithmetics. See interval_cmp_value for datails.
+ */
+typedef struct
+{
+	int64	hi;	/* holds significant 64 bits including a sign bit */
+	uint64	lo; /* holds the lower 64 bits without sign */
+} LinearInterval;
+#endif
 
 /* Limits on the "precision" option (typmod) for these data types */
 #define MAX_TIMESTAMP_PRECISION 6
diff --git a/src/test/regress/expected/interval.out b/src/test/regress/expected/interval.out
index 946c97a..d4e6a00 100644
--- a/src/test/regress/expected/interval.out
+++ b/src/test/regress/expected/interval.out
@@ -207,6 +207,42 @@ SELECT '' AS fortyfive, r1.*, r2.*
            | 34 years        | 6 years
 (45 rows)
 
+-- intervals out of 64bit linear value in microseconds
+CREATE TABLE INTERVAL_TBL_OF (f1 interval);
+INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('2147483647 days 2147483647 months');
+INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('1 year');
+INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('-2147483648 days -2147483648 months');
+SELECT '' AS "64bits", r1.*, r2.*
+   FROM INTERVAL_TBL_OF r1, INTERVAL_TBL_OF r2
+   WHERE r1.f1 > r2.f1
+   ORDER BY r1.f1, r2.f1;
+ 64bits |                   f1                   |                    f1                     
+--------+----------------------------------------+-------------------------------------------
+        | 1 year                                 | -178956970 years -8 mons -2147483648 days
+        | 178956970 years 7 mons 2147483647 days | -178956970 years -8 mons -2147483648 days
+        | 178956970 years 7 mons 2147483647 days | 1 year
+(3 rows)
+
+CREATE INDEX ON INTERVAL_TBL_OF USING btree (f1);
+SET enable_seqscan to false;
+EXPLAIN SELECT '' AS "64bits_2", f1
+  FROM INTERVAL_TBL_OF r1 ORDER BY f1;
+                                               QUERY PLAN                                               
+--------------------------------------------------------------------------------------------------------
+ Index Only Scan using interval_tbl_of_f1_idx on interval_tbl_of r1  (cost=0.13..12.18 rows=3 width=48)
+(1 row)
+
+SELECT '' AS "64bits_2", f1
+  FROM INTERVAL_TBL_OF r1 ORDER BY f1;
+ 64bits_2 |                    f1                     
+----------+-------------------------------------------
+          | -178956970 years -8 mons -2147483648 days
+          | 1 year
+          | 178956970 years 7 mons 2147483647 days
+(3 rows)
+
+SET enable_seqscan to default;
+DROP TABLE INTERVAL_TBL_OF;
 -- Test multiplication and division with intervals.
 -- Floating point arithmetic rounding errors can lead to unexpected results,
 -- though the code attempts to do the right thing and round up to days and
diff --git a/src/test/regress/sql/interval.sql b/src/test/regress/sql/interval.sql
index cff9ada..aa21673 100644
--- a/src/test/regress/sql/interval.sql
+++ b/src/test/regress/sql/interval.sql
@@ -59,6 +59,23 @@ SELECT '' AS fortyfive, r1.*, r2.*
    WHERE r1.f1 > r2.f1
    ORDER BY r1.f1, r2.f1;
 
+-- intervals out of 64bit linear value in microseconds
+CREATE TABLE INTERVAL_TBL_OF (f1 interval);
+INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('2147483647 days 2147483647 months');
+INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('1 year');
+INSERT INTO INTERVAL_TBL_OF (f1) VALUES ('-2147483648 days -2147483648 months');
+SELECT '' AS "64bits", r1.*, r2.*
+   FROM INTERVAL_TBL_OF r1, INTERVAL_TBL_OF r2
+   WHERE r1.f1 > r2.f1
+   ORDER BY r1.f1, r2.f1;
+CREATE INDEX ON INTERVAL_TBL_OF USING btree (f1);
+SET enable_seqscan to false;
+EXPLAIN SELECT '' AS "64bits_2", f1
+  FROM INTERVAL_TBL_OF r1 ORDER BY f1;
+SELECT '' AS "64bits_2", f1
+  FROM INTERVAL_TBL_OF r1 ORDER BY f1;
+SET enable_seqscan to default;
+DROP TABLE INTERVAL_TBL_OF;
 
 -- Test multiplication and division with intervals.
 -- Floating point arithmetic rounding errors can lead to unexpected results,
-- 
2.9.2

>From db899ea2bbee38aed367c522f898a3e0fe1ceffd Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyotaro@xxxxxxxxxxxxx>
Date: Tue, 4 Apr 2017 16:44:03 +0900
Subject: [PATCH 2/2] sanity check code

---
 src/backend/utils/adt/timestamp.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c
index 2411bd7..47af4cf 100644
--- a/src/backend/utils/adt/timestamp.c
+++ b/src/backend/utils/adt/timestamp.c
@@ -2385,6 +2385,20 @@ interval_cmp_value(const Interval *interval)
 	/* add dayfraction */
 	LINEARINTERVAL_ADD_INT65(span, dayfraction, 0);
 
+	{
+		__int128 v1 = (((__int128)span.hi)<<64) + (__int128)span.lo;
+		__int128 v2 = (__int128)interval->time;
+		v2 += (__int128)interval->month * INT64CONST(30) * USECS_PER_DAY;
+		v2 += (__int128)interval->day * INT64CONST(24) * USECS_PER_HOUR;
+		if (v1 != v2)
+			elog(LOG, "#### %lx:%lx != %lx:%lx for (%ld, %d, %d)",
+				 (uint64)(v1>>64),
+				 (uint64)(v1 & 0xffffffffffffffff),
+				 (uint64)(v2>>64),
+				 (uint64)(v2 & 0xffffffffffffffff),
+				interval->time, interval->month, interval->day);
+	}
+
 	return span;
 }
 #endif
-- 
2.9.2

-- 
Sent via pgsql-general mailing list (pgsql-general@xxxxxxxxxxxxxx)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general