Re: GiST index performance

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, 22 Apr 2009, Matthew Wakeling wrote:
I will post a patch when I have ported my bioseg code over to the seg data type.

Here is my patch ported over to the seg contrib package, attached. Apply it to seg.c and all should be well. A similar thing needs to be done to cube, but I haven't looked at that.

Matthew

--
An optimist sees the glass as half full, a pessimist as half empty,
and an engineer as having redundant storage capacity.
--- seg.c	2006-09-10 18:36:51.000000000 +0100
+++ seg.c_new	2009-04-22 17:03:13.000000000 +0100
@@ -69,7 +69,6 @@
 bool		seg_right(SEG * a, SEG * b);
 bool		seg_over_right(SEG * a, SEG * b);
 SEG		   *seg_union(SEG * a, SEG * b);
-SEG		   *seg_inter(SEG * a, SEG * b);
 void		rt_seg_size(SEG * a, float *sz);
 float	   *seg_size(SEG * a);
 
@@ -269,14 +268,22 @@
 gseg_penalty(GISTENTRY *origentry, GISTENTRY *newentry, float *result)
 {
 	SEG		   *ud;
+    SEG        *origrange;
+    SEG        *newrange;
 	float		tmp1,
 				tmp2;
 
-	ud = seg_union((SEG *) DatumGetPointer(origentry->key),
-				   (SEG *) DatumGetPointer(newentry->key));
+    origrange = (SEG *) DatumGetPointer(origentry->key);
+    newrange = (SEG *) DatumGetPointer(newentry->key);
+	ud = seg_union(origrange, newrange);
 	rt_seg_size(ud, &tmp1);
-	rt_seg_size((SEG *) DatumGetPointer(origentry->key), &tmp2);
-	*result = tmp1 - tmp2;
+	rt_seg_size(origrange, &tmp2);
+    if (tmp1 == tmp2) {
+        rt_seg_size(newrange, &tmp1);
+        *result = (tmp2 - tmp1) / tmp2;
+    } else {
+        *result = tmp1 - tmp2 + 1.0;
+    }
 
 #ifdef GIST_DEBUG
 	fprintf(stderr, "penalty\n");
@@ -297,27 +304,16 @@
 			   GIST_SPLITVEC *v)
 {
 	OffsetNumber i,
-				j;
 	SEG		   *datum_alpha,
-			   *datum_beta;
 	SEG		   *datum_l,
 			   *datum_r;
-	SEG		   *union_d,
-			   *union_dl,
-			   *union_dr;
-	SEG		   *inter_d;
-	bool		firsttime;
-	float		size_alpha,
-				size_beta,
-				size_union,
-				size_inter;
-	float		size_waste,
-				waste;
-	float		size_l,
-				size_r;
+    bool        firsttime;
+	float		lowest,
+				highest,
+                midpoint,
+                split,
+                midpointsum;
 	int			nbytes;
-	OffsetNumber seed_1 = 1,
-				seed_2 = 2;
 	OffsetNumber *left,
 			   *right;
 	OffsetNumber maxoff;
@@ -326,107 +322,64 @@
 	fprintf(stderr, "picksplit\n");
 #endif
 
-	maxoff = entryvec->n - 2;
+	maxoff = entryvec->n - 1;
 	nbytes = (maxoff + 2) * sizeof(OffsetNumber);
 	v->spl_left = (OffsetNumber *) palloc(nbytes);
 	v->spl_right = (OffsetNumber *) palloc(nbytes);
 
+    midpointsum = 0.0;
 	firsttime = true;
-	waste = 0.0;
+    lowest = 0.0;
+    highest = 0.0;
 
-	for (i = FirstOffsetNumber; i < maxoff; i = OffsetNumberNext(i))
+	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
 	{
 		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
-		for (j = OffsetNumberNext(i); j <= maxoff; j = OffsetNumberNext(j))
-		{
-			datum_beta = (SEG *) DatumGetPointer(entryvec->vector[j].key);
-
-			/* compute the wasted space by unioning these guys */
-			/* size_waste = size_union - size_inter; */
-			union_d = seg_union(datum_alpha, datum_beta);
-			rt_seg_size(union_d, &size_union);
-			inter_d = seg_inter(datum_alpha, datum_beta);
-			rt_seg_size(inter_d, &size_inter);
-			size_waste = size_union - size_inter;
-
-			/*
-			 * are these a more promising split that what we've already seen?
-			 */
-			if (size_waste > waste || firsttime)
-			{
-				waste = size_waste;
-				seed_1 = i;
-				seed_2 = j;
-				firsttime = false;
-			}
-		}
+        midpoint = (datum_alpha->lower + datum_alpha->upper) / 2;
+        midpointsum += midpoint;
+        if (firsttime || (midpoint < lowest))
+        {
+            lowest = midpoint;
+        }
+        if (firsttime || (midpoint > highest))
+        {
+            highest = midpoint;
+        }
+        firsttime = false;
 	}
 
+    split = midpointsum / maxoff;
 	left = v->spl_left;
 	v->spl_nleft = 0;
 	right = v->spl_right;
 	v->spl_nright = 0;
 
-	datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[seed_1].key);
-	datum_l = seg_union(datum_alpha, datum_alpha);
-	rt_seg_size(datum_l, &size_l);
-	datum_beta = (SEG *) DatumGetPointer(entryvec->vector[seed_2].key);
-	datum_r = seg_union(datum_beta, datum_beta);
-	rt_seg_size(datum_r, &size_r);
+    datum_l = (SEG *) palloc(sizeof(*datum_l));
+    datum_l->lower = lowest;
+    datum_l->upper = lowest;
+    datum_r = (SEG *) palloc(sizeof(*datum_r));
+    datum_r->lower = highest;
+    datum_r->upper = highest;
 
 	/*
-	 * Now split up the regions between the two seeds.	An important property
-	 * of this split algorithm is that the split vector v has the indices of
-	 * items to be split in order in its left and right vectors.  We exploit
-	 * this property by doing a merge in the code that actually splits the
-	 * page.
-	 *
-	 * For efficiency, we also place the new index tuple in this loop. This is
-	 * handled at the very end, when we have placed all the existing tuples
-	 * and i == maxoff + 1.
+	 * Now split up the regions between the two seeds.
 	 */
 
-	maxoff = OffsetNumberNext(maxoff);
 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
 	{
-		/*
-		 * If we've already decided where to place this item, just put it on
-		 * the right list.	Otherwise, we need to figure out which page needs
-		 * the least enlargement in order to store the item.
-		 */
-
-		if (i == seed_1)
-		{
-			*left++ = i;
-			v->spl_nleft++;
-			continue;
-		}
-		else if (i == seed_2)
-		{
-			*right++ = i;
-			v->spl_nright++;
-			continue;
-		}
-
-		/* okay, which page needs least enlargement? */
 		datum_alpha = (SEG *) DatumGetPointer(entryvec->vector[i].key);
-		union_dl = seg_union(datum_l, datum_alpha);
-		union_dr = seg_union(datum_r, datum_alpha);
-		rt_seg_size(union_dl, &size_alpha);
-		rt_seg_size(union_dr, &size_beta);
+        midpoint = (datum_alpha->lower + datum_alpha->upper) / 2;
 
 		/* pick which page to add it to */
-		if (size_alpha - size_l < size_beta - size_r)
+        if (mipoint <= split)
 		{
-			datum_l = union_dl;
-			size_l = size_alpha;
+			datum_l = seg_union(datum_l, datum_alpha);
 			*left++ = i;
 			v->spl_nleft++;
 		}
 		else
 		{
-			datum_r = union_dr;
-			size_r = size_alpha;
+			datum_r = seg_union(datum_r, datum_alpha);
 			*right++ = i;
 			v->spl_nright++;
 		}
@@ -665,45 +618,6 @@
 	return (n);
 }
 
-
-SEG *
-seg_inter(SEG * a, SEG * b)
-{
-	SEG		   *n;
-
-	n = (SEG *) palloc(sizeof(*n));
-
-	/* take min of upper endpoints */
-	if (a->upper < b->upper)
-	{
-		n->upper = a->upper;
-		n->u_sigd = a->u_sigd;
-		n->u_ext = a->u_ext;
-	}
-	else
-	{
-		n->upper = b->upper;
-		n->u_sigd = b->u_sigd;
-		n->u_ext = b->u_ext;
-	}
-
-	/* take max of lower endpoints */
-	if (a->lower > b->lower)
-	{
-		n->lower = a->lower;
-		n->l_sigd = a->l_sigd;
-		n->l_ext = a->l_ext;
-	}
-	else
-	{
-		n->lower = b->lower;
-		n->l_sigd = b->l_sigd;
-		n->l_ext = b->l_ext;
-	}
-
-	return (n);
-}
-
 void
 rt_seg_size(SEG * a, float *size)
 {
-- 
Sent via pgsql-performance mailing list (pgsql-performance@xxxxxxxxxxxxxx)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-performance

[Postgresql General]     [Postgresql PHP]     [PHP Users]     [PHP Home]     [PHP on Windows]     [Kernel Newbies]     [PHP Classes]     [PHP Books]     [PHP Databases]     [Yosemite]

  Powered by Linux