Search Postgresql Archives

Re: [pg_trgm] Making similarity(?, ?) < ? use an index

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Artur, thanks for your help. I managed to add the new strategy to the index. Hurray! Along the way I also discovered a bug, which I reported via the form.

I still have a few questions:

1. Naming - pg_trgm_match, match, threshold, trgm_check_match, ThresholdStrategyNumber - are these good names?
2. I made trgm_check_match IMMUTABLE. Are there any other modifiers that should be there?
3. I defined % (text, pg_trgm_match) but didn't provide a commutator and other helper procedures. Which of them should I implement?
4. Can I obtain query and nlimit with less code?
5. The attached patch replaced "res = (*(int *) &tmpsml == *(int *) &nlimit || tmpsml > nlimit);" with "res = (tmpsml >= nlimit);" to fix the bug on my machine. I'm not sure whether that's the long-term fix we want to have. It's just there to help me make progress with trigrams.

Thanks for your help.

Cheers
Greg
diff --git a/contrib/pg_trgm/pg_trgm--1.3.sql b/contrib/pg_trgm/pg_trgm--1.3.sql
index b279f7d..faa1fce 100644
--- a/contrib/pg_trgm/pg_trgm--1.3.sql
+++ b/contrib/pg_trgm/pg_trgm--1.3.sql
@@ -3,6 +3,8 @@
 -- complain if script is sourced in psql, rather than via CREATE EXTENSION
 \echo Use "CREATE EXTENSION pg_trgm" to load this file. \quit
 
+CREATE TYPE pg_trgm_match AS (match TEXT, threshold REAL);
+
 -- Deprecated function
 CREATE FUNCTION set_limit(float4)
 RETURNS float4
@@ -108,6 +110,18 @@ CREATE OPERATOR <->> (
         COMMUTATOR = '<<->'
 );
 
+CREATE OR REPLACE FUNCTION trgm_check_match(string TEXT, match pg_trgm_match) RETURNS bool AS $$
+BEGIN
+    RETURN similarity(match.match, string) >= match.threshold;
+END;
+$$ LANGUAGE plpgsql IMMUTABLE;
+
+CREATE OPERATOR %(
+	leftarg = text,
+	rightarg = pg_trgm_match,
+	procedure = trgm_check_match
+);
+
 -- gist key
 CREATE FUNCTION gtrgm_in(cstring)
 RETURNS gtrgm
@@ -126,7 +140,7 @@ CREATE TYPE gtrgm (
 );
 
 -- support functions for gist
-CREATE FUNCTION gtrgm_consistent(internal,text,smallint,oid,internal)
+CREATE FUNCTION gtrgm_consistent(internal,anynonarray,smallint,oid,internal)
 RETURNS bool
 AS 'MODULE_PATHNAME'
 LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
@@ -171,7 +185,7 @@ CREATE OPERATOR CLASS gist_trgm_ops
 FOR TYPE text USING gist
 AS
         OPERATOR        1       % (text, text),
-        FUNCTION        1       gtrgm_consistent (internal, text, smallint, oid, internal),
+        FUNCTION        1       gtrgm_consistent (internal, anynonarray, smallint, oid, internal),
         FUNCTION        2       gtrgm_union (internal, internal),
         FUNCTION        3       gtrgm_compress (internal),
         FUNCTION        4       gtrgm_decompress (internal),
@@ -252,3 +266,6 @@ LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
 ALTER OPERATOR FAMILY gin_trgm_ops USING gin ADD
         OPERATOR        7       %> (text, text),
         FUNCTION        6      (text,text) gin_trgm_triconsistent (internal, int2, text, int4, internal, internal, internal);
+
+ALTER OPERATOR FAMILY gist_trgm_ops USING gist ADD
+        OPERATOR        9       % (text, pg_trgm_match);
diff --git a/contrib/pg_trgm/trgm.h b/contrib/pg_trgm/trgm.h
index 8cd88e7..f2b6008 100644
--- a/contrib/pg_trgm/trgm.h
+++ b/contrib/pg_trgm/trgm.h
@@ -34,6 +34,7 @@
 #define RegExpICaseStrategyNumber		6
 #define WordSimilarityStrategyNumber	7
 #define WordDistanceStrategyNumber		8
+#define ThresholdStrategyNumber			9
 
 typedef char trgm[3];
 
diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
index 3a5aff9..3884b13 100644
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -5,7 +5,10 @@
 
 #include "trgm.h"
 
+#include "access/htup.h"
+#include "access/htup_details.h"
 #include "access/stratnum.h"
+#include "utils/typcache.h"
 #include "fmgr.h"
 
 
@@ -181,7 +184,7 @@ Datum
 gtrgm_consistent(PG_FUNCTION_ARGS)
 {
 	GISTENTRY  *entry = (GISTENTRY *) PG_GETARG_POINTER(0);
-	text	   *query = PG_GETARG_TEXT_P(1);
+	text	   *query;
 	StrategyNumber strategy = (StrategyNumber) PG_GETARG_UINT16(2);
 
 	/* Oid		subtype = PG_GETARG_OID(3); */
@@ -189,10 +192,43 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 	TRGM	   *key = (TRGM *) DatumGetPointer(entry->key);
 	TRGM	   *qtrg;
 	bool		res;
-	Size		querysize = VARSIZE(query);
+	Size		querysize;
 	gtrgm_consistent_cache *cache;
 	double		nlimit;
 
+	HeapTupleHeader	query_match;
+	Oid				tupType;
+	int32			tupTypmod;
+	TupleDesc		tupdesc;
+	HeapTupleData	tuple;
+	bool			isnull;
+
+	if (strategy == ThresholdStrategyNumber)
+	{
+		query_match = PG_GETARG_HEAPTUPLEHEADER(1);
+		tupType = HeapTupleHeaderGetTypeId(query_match);
+		tupTypmod = HeapTupleHeaderGetTypMod(query_match);
+		tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod);
+
+		tuple.t_len = HeapTupleHeaderGetDatumLength(query_match);
+		ItemPointerSetInvalid(&(tuple.t_self));
+		tuple.t_tableOid = InvalidOid;
+		tuple.t_data = query_match;
+
+		query = DatumGetTextP(fastgetattr(&tuple, 1, tupdesc, &isnull));
+		querysize = VARSIZE(query);
+		nlimit = DatumGetFloat4(fastgetattr(&tuple, 2, tupdesc, &isnull));
+
+		ReleaseTupleDesc(tupdesc);
+	}
+	else
+	{
+		query = PG_GETARG_TEXT_P(1);
+		querysize = VARSIZE(query);
+		nlimit = (strategy == SimilarityStrategyNumber) ?
+				 similarity_threshold : word_similarity_threshold;
+	}
+
 	/*
 	 * We keep the extracted trigrams in cache, because trigram extraction is
 	 * relatively CPU-expensive.  When trying to reuse a cached value, check
@@ -220,6 +256,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 		{
 			case SimilarityStrategyNumber:
 			case WordSimilarityStrategyNumber:
+			case ThresholdStrategyNumber:
 				qtrg = generate_trgm(VARDATA(query),
 									 querysize - VARHDRSZ);
 				break;
@@ -289,10 +326,9 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 	{
 		case SimilarityStrategyNumber:
 		case WordSimilarityStrategyNumber:
+		case ThresholdStrategyNumber:
 			/* Similarity search is exact. Word similarity search is inexact */
 			*recheck = (strategy == WordSimilarityStrategyNumber);
-			nlimit = (strategy == SimilarityStrategyNumber) ?
-				similarity_threshold : word_similarity_threshold;
 
 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */
@@ -305,7 +341,7 @@ gtrgm_consistent(PG_FUNCTION_ARGS)
 				float4 volatile tmpsml = cnt_sml(qtrg, key, *recheck);
 
 				/* strange bug at freebsd 5.2.1 and gcc 3.3.3 */
-				res = (*(int *) &tmpsml == *(int *) &nlimit || tmpsml > nlimit);
+				res = (tmpsml >= nlimit);
 			}
 			else if (ISALLTRUE(key))
 			{					/* non-leaf contains signature */
@@ -474,6 +510,7 @@ gtrgm_distance(PG_FUNCTION_ARGS)
 	{
 		case DistanceStrategyNumber:
 		case WordDistanceStrategyNumber:
+		case ThresholdStrategyNumber:
 			*recheck = strategy == WordDistanceStrategyNumber;
 			if (GIST_LEAF(entry))
 			{					/* all leafs contains orig trgm */
-- 
Sent via pgsql-general mailing list (pgsql-general@xxxxxxxxxxxxxx)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Postgresql Jobs]     [Postgresql Admin]     [Postgresql Performance]     [Linux Clusters]     [PHP Home]     [PHP on Windows]     [Kernel Newbies]     [PHP Classes]     [PHP Books]     [PHP Databases]     [Postgresql & PHP]     [Yosemite]
  Powered by Linux