Search Postgresql Archives

Re: Fragments in tsearch2 headline

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

I've ported the patch of Sushant Sinha for fragmented headlines to pg8.3.1
(http://archives.postgresql.org/pgsql-general/2007-11/msg00508.php)

With respect to http://archives.postgresql.org/pgsql-general/2008-03/msg00806.php,
I can continue the work until this becomes an acceptable patch for PostgreSQL.

Pierre-yves.
diff -Nrub postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.c postgresql-8.3.1/contrib/tsearch2/tsearch2.c
--- postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.c	2008-01-01 20:45:45.000000000 +0100
+++ postgresql-8.3.1/contrib/tsearch2/tsearch2.c	2008-05-22 11:35:07.000000000 +0200
@@ -82,6 +82,7 @@
 Datum		tsa_to_tsquery_name(PG_FUNCTION_ARGS);
 Datum		tsa_plainto_tsquery_name(PG_FUNCTION_ARGS);
 Datum		tsa_headline_byname(PG_FUNCTION_ARGS);
+Datum           tsa_headline_with_fragments(PG_FUNCTION_ARGS);
 Datum		tsa_ts_stat(PG_FUNCTION_ARGS);
 Datum		tsa_tsearch2(PG_FUNCTION_ARGS);
 Datum		tsa_rewrite_accum(PG_FUNCTION_ARGS);
@@ -101,6 +102,7 @@
 PG_FUNCTION_INFO_V1(tsa_to_tsquery_name);
 PG_FUNCTION_INFO_V1(tsa_plainto_tsquery_name);
 PG_FUNCTION_INFO_V1(tsa_headline_byname);
+PG_FUNCTION_INFO_V1(tsa_headline_with_fragments);
 PG_FUNCTION_INFO_V1(tsa_ts_stat);
 PG_FUNCTION_INFO_V1(tsa_tsearch2);
 PG_FUNCTION_INFO_V1(tsa_rewrite_accum);
@@ -358,6 +360,24 @@
 	return result;
 }
 
+/*
+ * tsa_headline_with_fragments(text, text, tsquery, text)
+ * tsearch2 compatibility entry point: turns the configuration name in
+ * argument 0 into a regconfig OID and passes the other arguments through.
+ */
+Datum
+tsa_headline_with_fragments(PG_FUNCTION_ARGS)
+{
+	text	   *cfgname = PG_GETARG_TEXT_P(0);
+	Datum		arg1 = PG_GETARG_DATUM(1);
+	Datum		arg2 = PG_GETARG_DATUM(2);
+	Datum		arg3 = PG_GETARG_DATUM(3);
+	Oid			config_oid = TextGetObjectId(regconfigin, cfgname);
+
+	return DirectFunctionCall4(ts_headline_with_fragments,
+							   ObjectIdGetDatum(config_oid), arg1, arg2, arg3);
+}
+
 /*
  * tsearch2 version of update trigger
  *
diff -Nrub postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.sql.in postgresql-8.3.1/contrib/tsearch2/tsearch2.sql.in
--- postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.sql.in	2007-11-28 20:33:04.000000000 +0100
+++ postgresql-8.3.1/contrib/tsearch2/tsearch2.sql.in	2008-05-22 11:55:51.000000000 +0200
@@ -384,6 +384,11 @@
 	LANGUAGE INTERNAL
         RETURNS NULL ON NULL INPUT IMMUTABLE;
 
+CREATE FUNCTION headline_with_fragments(text, text, tsquery, text) -- config name, document, query, options
+        RETURNS text
+        AS 'MODULE_PATHNAME', 'tsa_headline_with_fragments'
+        LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
+
 -- CREATE the OPERATOR class
 CREATE OPERATOR CLASS gist_tsvector_ops
 FOR TYPE tsvector USING gist
diff -Nrub postgresql-8.3.1-orig/src/backend/tsearch/ts_parse.c postgresql-8.3.1/src/backend/tsearch/ts_parse.c
--- postgresql-8.3.1-orig/src/backend/tsearch/ts_parse.c	2008-01-01 20:45:52.000000000 +0100
+++ postgresql-8.3.1/src/backend/tsearch/ts_parse.c	2008-05-22 13:02:39.000000000 +0200
@@ -578,6 +578,111 @@
 	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
 }
 
+#define COVER_SEP     "..."
+#define COVER_SEP_LEN (sizeof(COVER_SEP)-1)
+
+/*
+ * Parse "in" with the parser of configuration cfgId and append to prs only
+ * the words inside the covers whose "in" flag is set (bounds inclusive,
+ * positions counted from 1).  Covers are visited in array order and the
+ * parser is never rewound, so they are expected in ascending position
+ * order.  A "..." word precedes each fragment that starts beyond the
+ * current position.  Every appended word gets "in" set; words carrying a
+ * query item also get "selected".
+ */
+void
+hlparsetext_with_covers(Oid                 cfgId,
+                        HeadlineParsedText *prs,
+                        TSQuery             query,
+                        text               *in,
+                        struct coverpos    *covers,
+                        int4                numcovers)
+{
+	TSConfigCacheEntry *cfg = lookup_ts_config_cache(cfgId);
+	TSParserCacheEntry *prsobj = lookup_ts_parser_cache(cfg->prsId);
+	void	   *prsdata;
+	LexizeData	ldata;
+	ParsedLex  *lexs;
+	TSLexeme   *norms;
+	char	   *lemm = NULL;
+	int4		lenlemm = 0;
+	int4		type;
+	int4		currentpos = 0;	/* word position reached by the parser */
+	int4		icover;
+
+	prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
+										PointerGetDatum(VARDATA(in)),
+										Int32GetDatum(VARSIZE(in) - VARHDRSZ)));
+
+	LexizeInit(&ldata, cfg);
+
+	for (icover = 0; icover < numcovers; icover++)
+	{
+		int4		startpos = covers[icover].startpos;
+		int4		endpos = covers[icover].endpos;
+
+		/* skip unselected covers and ones the parser already ran past */
+		if (!covers[icover].in || currentpos > endpos)
+			continue;
+
+		/* mark the gap between the previous fragment and this one */
+		if (currentpos < startpos && startpos > 0)
+		{
+			hladdword(prs, COVER_SEP, COVER_SEP_LEN, 3);
+			prs->words[prs->curwords - 1].in = 1;
+		}
+
+		do
+		{
+			type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+											   PointerGetDatum(prsdata),
+											   PointerGetDatum(&lemm),
+											   PointerGetDatum(&lenlemm)));
+
+			LexizeAddLemm(&ldata, type, lemm, lenlemm);
+
+			do
+			{
+				/* one position per result, plus one per TSL_ADDPOS lexeme */
+				if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
+				{
+					TSLexeme   *ptr;
+
+					currentpos++;
+					for (ptr = norms; ptr->lexeme; ptr++)
+						if (ptr->flags & TSL_ADDPOS)
+							currentpos++;
+				}
+
+				/*
+				 * Emit only from this cover's own start.  The test is made
+				 * per cover: a "started" flag shared by all covers would
+				 * also emit the text lying between two fragments.
+				 * NOTE(review): lexs/norms of skipped tokens are not handed
+				 * to addHLParsedLex(); confirm nothing else must free them.
+				 */
+				if (currentpos >= startpos)
+				{
+					int4		i = prs->curwords;
+
+					addHLParsedLex(prs, query, lexs, norms);
+					for (; i < prs->curwords; i++)
+					{
+						prs->words[i].in = 1;
+						if (prs->words[i].item)
+							prs->words[i].selected = 1;
+					}
+				}
+			} while (norms && currentpos < endpos);
+
+			if (currentpos >= endpos)
+				break;
+		} while (type > 0);
+	}
+
+	FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
+
 text *
 generateHeadline(HeadlineParsedText *prs)
 {
diff -Nrub postgresql-8.3.1-orig/src/backend/tsearch/wparser.c postgresql-8.3.1/src/backend/tsearch/wparser.c
--- postgresql-8.3.1-orig/src/backend/tsearch/wparser.c	2008-01-15 19:22:47.000000000 +0100
+++ postgresql-8.3.1/src/backend/tsearch/wparser.c	2008-05-22 13:02:39.000000000 +0200
@@ -370,3 +370,34 @@
 										PG_GETARG_DATUM(1),
 										PG_GETARG_DATUM(2)));
 }
+
+/*
+ * SQL-callable front end: deserialize the optional option string (arg 3),
+ * let prsd_headline_with_fragments mark up args 0..2, render, then free.
+ */
+Datum
+ts_headline_with_fragments(PG_FUNCTION_ARGS)
+{
+	HeadlineParsedText *hl;
+	List	   *optlist = NIL;
+	text	   *options = NULL;
+	text	   *result;
+
+	if (PG_NARGS() > 3 && PG_GETARG_POINTER(3))
+		options = PG_GETARG_TEXT_P(3);
+	if (options)
+		optlist = deserialize_deflist(PointerGetDatum(options));
+
+	hl = (HeadlineParsedText *) DatumGetPointer(
+		DirectFunctionCall4(prsd_headline_with_fragments,
+							PG_GETARG_DATUM(0), PG_GETARG_DATUM(1),
+							PG_GETARG_DATUM(2), PointerGetDatum(optlist)));
+	result = generateHeadline(hl);
+	if (options)
+		PG_FREE_IF_COPY(options, 3);
+	pfree(hl->words);
+	pfree(hl->startsel);
+	pfree(hl->stopsel);
+	pfree(hl);
+	PG_RETURN_POINTER(result);
+}
diff -Nrub postgresql-8.3.1-orig/src/backend/tsearch/wparser_def.c postgresql-8.3.1/src/backend/tsearch/wparser_def.c
--- postgresql-8.3.1-orig/src/backend/tsearch/wparser_def.c	2008-01-01 20:45:52.000000000 +0100
+++ postgresql-8.3.1/src/backend/tsearch/wparser_def.c	2008-05-22 13:02:39.000000000 +0200
@@ -19,6 +19,7 @@
 #include "tsearch/ts_public.h"
 #include "tsearch/ts_type.h"
 #include "tsearch/ts_utils.h"
+#include "tsearch/ts_rank.h"
 #include "utils/builtins.h"
 
 
@@ -1886,3 +1887,191 @@
 
 	PG_RETURN_POINTER(prs);
 }
+
+/*
+ * prsd_headline_with_fragments
+ *
+ * Arguments: configuration OID, document text, tsquery, and a List of
+ * DefElem options (MaxCoverSize, MinCoverSize, MaxWords, StartSel, StopSel).
+ * Returns a palloc'd HeadlineParsedText; the caller frees it together with
+ * its words, startsel and stopsel.  It has no words when get_docrep()
+ * returns NULL.
+ *
+ * Steps:
+ *	1. turn the document into a tsvector and collect every cover of the
+ *	   query that Cover() reports (bounds are inclusive word positions);
+ *	2. repeatedly take the shortest unused cover, pad it towards
+ *	   MaxCoverSize or trim it to what is left of MaxWords, for as long as
+ *	   more than MinCoverSize words of budget remain;
+ *	3. let hlparsetext_with_covers() emit the words of the chosen covers.
+ */
+Datum
+prsd_headline_with_fragments(PG_FUNCTION_ARGS)
+{
+	Oid			cfgId = PG_GETARG_OID(0);
+	text	   *in = PG_GETARG_TEXT_P(1);
+	TSQuery		query = PG_GETARG_TSQUERY(2);
+	List	   *prsoptions = (List *) PG_GETARG_POINTER(3);
+	TSVector	t;
+	ListCell   *l;
+	HeadlineParsedText *prs;
+	DocRepresentation *doc;
+	Extention	ext;
+	QueryRepresentation qr;
+	struct coverpos *covers;
+	int4		doclen;
+	int4		numcovers = 0, maxcovers = 32;
+	int4		numWords = 0;	/* words already granted to chosen covers */
+	int			maxcoverSize = 20, mincoverSize = 5, maxWords = 40;
+
+	t = (TSVector) DatumGetPointer(DirectFunctionCall2(to_tsvector_byid,
+													   ObjectIdGetDatum(cfgId),
+													   PointerGetDatum(in)));
+
+	prs = (HeadlineParsedText *) palloc0(sizeof(HeadlineParsedText));
+	prs->lenwords = 32;
+	prs->words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs->lenwords);
+	prs->startsel = NULL;
+	prs->stopsel = NULL;
+
+	foreach(l, prsoptions)
+	{
+		DefElem    *defel = (DefElem *) lfirst(l);
+		char	   *val = defGetString(defel);
+
+		if (pg_strcasecmp(defel->defname, "MaxCoverSize") == 0)
+			maxcoverSize = pg_atoi(val, sizeof(int32), 0);
+		else if (pg_strcasecmp(defel->defname, "MinCoverSize") == 0)
+			mincoverSize = pg_atoi(val, sizeof(int32), 0);
+		else if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+			maxWords = pg_atoi(val, sizeof(int32), 0);
+		else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+			prs->startsel = pstrdup(val);
+		else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+			prs->stopsel = pstrdup(val);
+		else
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("unrecognized headline parameter: \"%s\"",
+							defel->defname)));
+	}
+
+	if (mincoverSize >= maxcoverSize)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("MinCoverSize should be less than MaxCoverSize")));
+	if (mincoverSize <= 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("MinCoverSize should be positive")));
+	if (maxWords < 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("MaxWords should be non-negative")));
+
+	if (!prs->startsel)
+		prs->startsel = pstrdup("<b>");
+	if (!prs->stopsel)
+		prs->stopsel = pstrdup("</b>");
+	prs->startsellen = strlen(prs->startsel);
+	prs->stopsellen = strlen(prs->stopsel);
+
+	qr.query = query;
+	qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size);
+
+	/* start generating covers for the query */
+	doc = get_docrep(t, &qr, &doclen);
+	if (!doc)
+	{
+		/* nothing to cover: hand back the empty word list */
+		pfree(t);
+		pfree(qr.operandexist);
+		PG_FREE_IF_COPY(in, 1);
+		PG_FREE_IF_COPY(query, 2);
+		PG_RETURN_POINTER(prs);
+	}
+
+	/* get all covers */
+	covers = palloc(maxcovers * sizeof(struct coverpos));
+	MemSet(&ext, 0, sizeof(Extention));
+	while (Cover(doc, doclen, &qr, &ext))
+	{
+		if (numcovers >= maxcovers)
+		{
+			maxcovers *= 2;
+			covers = repalloc(covers, sizeof(struct coverpos) * maxcovers);
+		}
+		covers[numcovers].startpos = ext.p;
+		covers[numcovers].endpos = ext.q;
+		covers[numcovers].in = 0;
+		numcovers++;
+	}
+
+	/* choose best covers while the budget can still hold a minimal one */
+	while (maxWords - numWords > mincoverSize)
+	{
+		int4		budget = maxWords - numWords;
+		int4		best = -1;
+		int4		startpos, endpos, coverlen;
+		int4		i;
+
+		/* shortest cover not yet chosen; ties go to the earliest one */
+		for (i = 0; i < numcovers; i++)
+		{
+			if (covers[i].in)
+				continue;
+			if (best < 0 ||
+				covers[i].endpos - covers[i].startpos <
+				covers[best].endpos - covers[best].startpos)
+				best = i;
+		}
+		if (best < 0)
+			break;				/* every cover is already in use */
+
+		covers[best].in = 1;
+		startpos = covers[best].startpos;
+		endpos = covers[best].endpos;
+		coverlen = endpos - startpos + 1;
+
+		if (coverlen > budget)
+		{
+			/* trim to exactly the remaining budget (bounds are inclusive) */
+			endpos = startpos + budget - 1;
+		}
+		else if (coverlen > maxcoverSize)
+		{
+			/* keep the first maxcoverSize words so the start lexeme shows */
+			endpos = startpos + maxcoverSize - 1;
+		}
+		else
+		{
+			/* pad both sides evenly, up to min(maxcoverSize, budget) */
+			int4		maxstretch = Min(maxcoverSize, budget);
+			int4		pad = (maxstretch - coverlen) / 2;
+
+			startpos -= pad;
+			endpos += pad;
+			if (startpos < 1)
+				startpos = 1;
+			/* endpos may pass the document end; rendering stops at EOF */
+		}
+
+		covers[best].startpos = startpos;
+		covers[best].endpos = endpos;
+		numWords += endpos - startpos + 1;
+	}
+
+	/* Render the headline */
+	if (maxWords > 0)
+		hlparsetext_with_covers(cfgId, prs, query, in, covers, numcovers);
+
+	/* clean up; prsoptions is a List owned by the caller, not a varlena */
+	pfree(covers);
+	pfree(doc);
+	pfree(t);
+	pfree(qr.operandexist);
+	PG_FREE_IF_COPY(in, 1);
+	PG_FREE_IF_COPY(query, 2);
+
+	PG_RETURN_POINTER(prs);
+}
diff -Nrub postgresql-8.3.1-orig/src/backend/utils/adt/tsrank.c postgresql-8.3.1/src/backend/utils/adt/tsrank.c
--- postgresql-8.3.1-orig/src/backend/utils/adt/tsrank.c	2008-01-01 20:45:53.000000000 +0100
+++ postgresql-8.3.1/src/backend/utils/adt/tsrank.c	2008-05-22 13:02:39.000000000 +0200
@@ -17,6 +17,7 @@
 
 #include "tsearch/ts_type.h"
 #include "tsearch/ts_utils.h"
+#include "tsearch/ts_rank.h"
 #include "utils/array.h"
 #include "miscadmin.h"
 
@@ -463,14 +464,6 @@
 	PG_RETURN_FLOAT4(res);
 }
 
-typedef struct
-{
-	QueryItem **item;
-	int16		nitem;
-	uint8		wclass;
-	int32		pos;
-} DocRepresentation;
-
 static int
 compareDocR(const void *va, const void *vb)
 {
@@ -482,12 +475,6 @@
 	return (a->pos > b->pos) ? 1 : -1;
 }
 
-typedef struct
-{
-	TSQuery		query;
-	bool	   *operandexist;
-} QueryRepresentation;
-
 #define QR_GET_OPERAND_EXISTS(q, v)		( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] )
 #define QR_SET_OPERAND_EXISTS(q, v)  QR_GET_OPERAND_EXISTS(q,v) = true
 
@@ -499,17 +486,7 @@
 	return QR_GET_OPERAND_EXISTS(qr, val);
 }
 
-typedef struct
-{
-	int			pos;
-	int			p;
-	int			q;
-	DocRepresentation *begin;
-	DocRepresentation *end;
-} Extention;
-
-
-static bool
+bool
 Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, Extention *ext)
 {
 	DocRepresentation *ptr;
@@ -590,7 +567,7 @@
 	return Cover(doc, len, qr, ext);
 }
 
-static DocRepresentation *
+DocRepresentation *
 get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
 {
 	QueryItem  *item = GETQUERY(qr->query);
diff -Nrub postgresql-8.3.1-orig/src/include/tsearch/ts_rank.h postgresql-8.3.1/src/include/tsearch/ts_rank.h
--- postgresql-8.3.1-orig/src/include/tsearch/ts_rank.h	1970-01-01 01:00:00.000000000 +0100
+++ postgresql-8.3.1/src/include/tsearch/ts_rank.h	2008-05-22 13:02:39.000000000 +0200
@@ -0,0 +1,36 @@
+#ifndef TS_RANK_H
+#define TS_RANK_H
+
+#include "tsearch/ts_type.h"
+#include "tsearch/ts_cache.h"
+
+/* Element of the array returned by get_docrep(). */
+typedef struct
+{
+	QueryItem **item;
+	int16		nitem;
+	uint8		wclass;
+	int32		pos;
+} DocRepresentation;
+
+/* A query plus one "operand seen" flag per query item. */
+typedef struct
+{
+	TSQuery		query;
+	bool	   *operandexist;
+} QueryRepresentation;
+
+/* Cover() search state; p and q bound the cover most recently found. */
+typedef struct
+{
+	int			pos;
+	int			p;
+	int			q;
+	DocRepresentation *begin;
+	DocRepresentation *end;
+} Extention;
+
+extern bool Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, Extention *ext);
+extern DocRepresentation *get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen);
+
+#endif   /* TS_RANK_H */
diff -Nrub postgresql-8.3.1-orig/src/include/tsearch/ts_utils.h postgresql-8.3.1/src/include/tsearch/ts_utils.h
--- postgresql-8.3.1-orig/src/include/tsearch/ts_utils.h	2008-01-01 20:45:59.000000000 +0100
+++ postgresql-8.3.1/src/include/tsearch/ts_utils.h	2008-05-22 13:02:39.000000000 +0200
@@ -14,6 +14,7 @@
 
 #include "tsearch/ts_type.h"
 #include "tsearch/ts_public.h"
+#include "tsearch/ts_rank.h"
 #include "nodes/pg_list.h"
 
 /*
@@ -95,8 +96,25 @@
  *	3 generateHeadline to generate result text
  */
 
+struct coverpos
+{
+	int4 startpos;	/* first word position of the fragment */
+	int4 endpos;	/* last word position of the fragment (inclusive) */
+	int4 in;		/* nonzero once the fragment is chosen for the headline */
+};
+
 extern void hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query,
 			char *buf, int4 buflen);
+
+
+extern void
+hlparsetext_with_covers(Oid                 cfgId,
+                        HeadlineParsedText *prs,
+                        TSQuery             query,
+                        text               *in,
+                        struct coverpos    *covers,
+                        int4                numcovers);
+
 extern text *generateHeadline(HeadlineParsedText *prs);
 
 /*
@@ -227,6 +245,7 @@
 extern Datum prsd_end(PG_FUNCTION_ARGS);
 extern Datum prsd_headline(PG_FUNCTION_ARGS);
 extern Datum prsd_lextype(PG_FUNCTION_ARGS);
+extern Datum prsd_headline_with_fragments(PG_FUNCTION_ARGS);
 
 /*
  * Dictionary interface to SQL
@@ -264,6 +283,7 @@
 extern Datum ts_headline_byid(PG_FUNCTION_ARGS);
 extern Datum ts_headline(PG_FUNCTION_ARGS);
 extern Datum ts_headline_opt(PG_FUNCTION_ARGS);
+extern Datum ts_headline_with_fragments(PG_FUNCTION_ARGS);
 
 /*
  * current cfg
diff -Nrub postgresql-8.3.1-orig/src/interfaces/libpq/libpq.rc postgresql-8.3.1/src/interfaces/libpq/libpq.rc
--- postgresql-8.3.1-orig/src/interfaces/libpq/libpq.rc	2008-03-15 04:24:54.000000000 +0100
+++ postgresql-8.3.1/src/interfaces/libpq/libpq.rc	2008-05-22 13:13:16.000000000 +0200
@@ -1,8 +1,8 @@
 #include <winver.h>
 
 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 8,3,1,8075
- PRODUCTVERSION 8,3,1,8075
+ FILEVERSION 8,3,1,8143
+ PRODUCTVERSION 8,3,1,8143
  FILEFLAGSMASK 0x3fL
  FILEFLAGS 0
  FILEOS VOS__WINDOWS32

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Postgresql Jobs]     [Postgresql Admin]     [Postgresql Performance]     [Linux Clusters]     [PHP Home]     [PHP on Windows]     [Kernel Newbies]     [PHP Classes]     [PHP Books]     [PHP Databases]     [Postgresql & PHP]     [Yosemite]
  Powered by Linux