Re: Fragments in tsearch2 headline

"Sushant Sinha" <sushant354@xxxxxxxxx> · Sun, 11 Nov 2007 22:46:50 -0500

I wrote a headline generation function for my app and have attached
the patch (against CVS head). It generates multiple contexts in
which the query appears. Essentially, it uses the Cover function to
generate all covers, chooses the smallest covers, and stretches each
selected cover according to the chosen parameters. Ideally, I think the
changes should be made to the prsd_headline function, but I couldn't
understand that segment of code well.

The sql interface is

headline_with_fragments(text parser, tsvector docvector, text doc,
tsquery queryin, int4 maxcoverSize, int4 mincoverSize, int4 maxWords)
 RETURNS text

This will generate a headline that contains up to maxWords words, with
each cover stretched to maxcoverSize. It will not add any fragment with
fewer than mincoverSize words.
I am running my app with maxcoverSize = 20, mincoverSize = 5, maxWords = 40,
so it shows roughly two fragments per query.

If Teodor or Oleg want to add this to the main branch, I will be happy to
clean it up and test it better.

-Sushant.

On Oct 31, 2007 6:26 PM, Catalin Marinas <catalin.marinas@xxxxxxxxx> wrote:
> On 30/10/2007, Oleg Bartunov <oleg@xxxxxxxxxx> wrote:
> > ok, then you have to formalize many things - how long should be excerpts,
> > how much excerpts to show, etc. In tsearch2 we have get_covers() function,
> > which produces all excerpts like:
> >
> > =# select get_covers(to_tsvector('1 2 3 4 5 3 4 abc x y z 2 3'), '2&3'::tsquery);
> >                     get_covers
> > ------------------------------------------------
> >   1 {1 2 3 }1 4 5 {2 3 4 abc x y z {3 2 }2 3 }3
> > (1 row)
>
> This function generates the lexemes, so cannot be used directly, but
> it is probably a good starting point.
>
> > Once you formalize your requirements, you can look on it and adapt to your
> > needs (and share with people). I think it could be nice contrib module.
>
> It seems that Sushant already wants to implement this function. He
> would probably be faster than me :-) (I'm relatively new to db stuff).
> Since I mainly rely on whatever a web hosting company provides, I'll
> probably stick with a Python implementation outside the SQL query.
>
> Thanks for your answers.
>
> --
> Catalin
>
> ---------------------------(end of broadcast)---------------------------
>
> TIP 5: don't forget to increase your free space map settings
>
? GNUmakefile
? config.log
? config.status
? contrib/tsearch2/rank.h
? src/Makefile.global
? src/include/pg_config.h
? src/include/stamp-h
? src/interfaces/ecpg/include/ecpg_config.h
Index: contrib/tsearch2/rank.c
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/tsearch2/rank.c,v
retrieving revision 1.23
diff -c -r1.23 rank.c
*** contrib/tsearch2/rank.c	27 Feb 2007 23:48:06 -0000	1.23
--- contrib/tsearch2/rank.c	12 Nov 2007 03:28:20 -0000
***************
*** 21,26 ****
--- 21,27 ----
  #include "tsvector.h"
  #include "query.h"
  #include "common.h"
+ #include "rank.h"

  PG_FUNCTION_INFO_V1(rank);
  Datum		rank(PG_FUNCTION_ARGS);
***************
*** 419,433 ****
  }

- typedef struct
- {
- 	ITEM	  **item;
- 	int16		nitem;
- 	bool		needfree;
- 	uint8		wclass;
- 	int32		pos;
- }	DocRepresentation;
- 
  static int
  compareDocR(const void *a, const void *b)
  {
--- 420,425 ----
***************
*** 457,473 ****
  	}
  }

! typedef struct
! {
! 	int			pos;
! 	int			p;
! 	int			q;
! 	DocRepresentation *begin;
! 	DocRepresentation *end;
! }	Extention;
! 
! 
! static bool
  Cover(DocRepresentation * doc, int len, QUERYTYPE * query, Extention * ext)
  {
  	DocRepresentation *ptr;
--- 449,455 ----
  	}
  }

! bool
  Cover(DocRepresentation * doc, int len, QUERYTYPE * query, Extention * ext)
  {
  	DocRepresentation *ptr;
***************
*** 538,544 ****
  	return Cover(doc, len, query, ext);
  }

! static DocRepresentation *
  get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
  {
  	ITEM	   *item = GETQUERY(query);
--- 520,526 ----
  	return Cover(doc, len, query, ext);
  }

! DocRepresentation *
  get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
  {
  	ITEM	   *item = GETQUERY(query);
Index: contrib/tsearch2/ts_cfg.c
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/tsearch2/ts_cfg.c,v
retrieving revision 1.24
diff -c -r1.24 ts_cfg.c
*** contrib/tsearch2/ts_cfg.c	6 Apr 2007 04:21:41 -0000	1.24
--- contrib/tsearch2/ts_cfg.c	12 Nov 2007 03:28:20 -0000
***************
*** 646,648 ****
--- 646,715 ----
  	ts_error(NOTICE, "TSearch cache cleaned");
  	PG_RETURN_VOID();
  }
+ /* Append one cover (word positions startpos..endpos) to the headline being built in prs; *currentpos is the running word position kept by the caller across calls. */
+ void
+ add_cover_to_hl(WParserInfo *prsobj, HLPRSTEXT *prs, QUERYTYPE *query, LexizeData *ldata, int4 *currentpos, int4 startpos, int4 endpos)
+ {
+ 	char      *lemm = NULL;
+ 	int4      lenlemm = 0;
+    	ParsedLex *lexs;
+   	int4       type, startHL = 0;
+    	TSLexeme  *norms;
+    	char      *coversep = " ... ";
+    	int4      coverseplen = strlen(coversep);
+    	int4      oldnumwords, newnumwords, i;
+    	if (*currentpos > endpos)
+    	{
+     	/* XXX - something wrong: the running position is already past this cover, so nothing is added */
+     	return;
+    	}
+    	/* add the " ... " cover separator when the running position is still before startpos */
+    	if (*currentpos < startpos && startpos > 0)
+    	{
+        	hladdword(prs, coversep, coverseplen, 3);	/* NOTE(review): 3 is the token type given to the separator - confirm it is the intended type */
+        	prs->words[prs->curwords - 1].in = 1;	/* flag the separator with the same "in" mark used for cover words below */
+    	}
+ 	/* outer loop: fetch one token from the parser per iteration, until endpos is reached or type <= 0 */
+   	do
+    	{
+        	type = DatumGetInt32(FunctionCall3(
+         									&(prsobj->getlexeme_info),
+ 											PointerGetDatum(prsobj->prs),
+ 											PointerGetDatum(&lemm),
+ 				 							PointerGetDatum(&lenlemm)));
+ 		LexizeAddLemm(ldata, type, lemm, lenlemm);
+        	do 	/* keep calling LexizeExec while it returns lexemes and endpos is not reached */
+        	{
+ 			if ((norms = LexizeExec(ldata, &lexs)) != NULL)
+            	{
+             	TSLexeme *ptr = norms;
+                	*currentpos += 1;	/* every non-NULL result advances the position by one ... */
+                	while(ptr->lexeme)
+                	{
+ 					if (ptr->flags & TSL_ADDPOS)	/* ... plus one more for each lexeme flagged TSL_ADDPOS */
+ 						*currentpos += 1;
+ 					ptr++;
+   				}       
+ 			}
+ 			/* start emitting words once the running position has reached startpos */
+ 			if (!startHL && *currentpos >= startpos)
+ 				startHL = 1;
+ 
+ 		 	if (startHL)
+ 			{
+ 				oldnumwords = prs->curwords;
+ 				addHLParsedLex(prs, query, lexs, norms);	/* NOTE(review): also called when norms is NULL */
+ 				newnumwords = prs->curwords;
+ 				for (i = oldnumwords; i < newnumwords; i++)	/* visit only the words the call above appended */
+ 				{
+ 					prs->words[i].in = 1;
+ 					if (prs->words[i].item)	/* presumably set for words matching a query item - TODO confirm */
+ 						prs->words[i].selected = 1;
+ 				}
+ 			}
+ 		} while(norms && *currentpos < endpos);
+ 		if (*currentpos >= endpos)
+ 			break;
+ 	} while (type > 0);
+ }
+ 
Index: contrib/tsearch2/tsearch.sql.in
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/tsearch2/tsearch.sql.in,v
retrieving revision 1.21
diff -c -r1.21 tsearch.sql.in
*** contrib/tsearch2/tsearch.sql.in	11 Nov 2007 03:25:34 -0000	1.21
--- contrib/tsearch2/tsearch.sql.in	12 Nov 2007 03:28:20 -0000
***************
*** 571,576 ****
--- 571,582 ----
  AS 'MODULE_PATHNAME', 'rank_cd_def'
  LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;

+ CREATE FUNCTION headline_with_fragments(text, tsvector, text, tsquery, int4, int4, int4)
+ RETURNS text 
+ AS 'MODULE_PATHNAME', 'headline_with_fragments'
+ LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
+ 
+ 
  CREATE OR REPLACE FUNCTION headline(oid, text, tsquery, text)
  RETURNS text
  AS 'MODULE_PATHNAME', 'headline'
Index: contrib/tsearch2/wparser.c
===================================================================
RCS file: /projects/cvsroot/pgsql/contrib/tsearch2/wparser.c,v
retrieving revision 1.13
diff -c -r1.13 wparser.c
*** contrib/tsearch2/wparser.c	2 Apr 2007 11:42:04 -0000	1.13
--- contrib/tsearch2/wparser.c	12 Nov 2007 03:28:20 -0000
***************
*** 18,23 ****
--- 18,27 ----
  #include "snmap.h"
  #include "common.h"

+ #include "rank.h"
+ #include "tsvector.h"
+ #include "dict.h"
+ 
  /*********top interface**********/

  static Oid	current_parser_id = InvalidOid;
***************
*** 609,611 ****
--- 613,799 ----
  				(PG_NARGS() > 2) ? PG_GETARG_DATUM(2) : PointerGetDatum(NULL)
  										));
  }
+ 
+ 
+        
+ /* headline generation 
+  * Input:  configuration name (its parser is used), tsvector of a doc, doc, tsquery, max size of an excerpt, min size of an excerpt, maxWords in a headline
+  * Output: multiple excerpts of doc that contain query words
+  */
+ 
+ struct coverpos{	/* one cover found by Cover(), as used by headline_with_fragments */
+   	int4 startpos;	/* first position; initialised from Extention.p, may be moved when the cover is stretched */
+    	int4 endpos;	/* last position; initialised from Extention.q, may be moved when stretched or truncated */
+    	int4 in;	/* 0 until the cover is picked for the headline, then 1 */
+ };
+ 
+ PG_FUNCTION_INFO_V1(headline_with_fragments);
+ Datum      headline_with_fragments(PG_FUNCTION_ARGS);
+ /* SQL-callable: collects all covers of the query, picks the shortest ones within a maxWords budget, resizes them toward maxcoverSize and renders them with genhl(). */
+ Datum
+ headline_with_fragments(PG_FUNCTION_ARGS)
+ {
+    	DocRepresentation* doc;
+    	Extention          ext;
+    	char*              textdata;
+    	int4               coverlen, doclen, textlen;
+    	int4               startpos = 0, endpos = 0, currentpos = 0;
+    	int4               numWords = 0;
+    	TSCfgInfo*         cfg;
+    	WParserInfo*       prsobj;
+    	text*              out;
+    	LexizeData         ldata;
+    	HLPRSTEXT          prs;
+   	int4               i, numcovers = 0, maxcovers = 32, maxstretch;
+    	int4               min, minI = 0;
+    	struct coverpos* covers = palloc(maxcovers*sizeof(struct coverpos));
+    	/* input parameters; name is resolved with name2id_cfg() below, and the parser is taken from that configuration */
+    	text       *name     = PG_GETARG_TEXT_P(0);
+    	tsvector   *t        = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
+    	text       *in       = PG_GETARG_TEXT_P(2);
+    	QUERYTYPE  *query    = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(3));
+    	int4       maxcoverSize = PG_GETARG_INT32(4);
+    	int4       mincoverSize = PG_GETARG_INT32(5);
+    	int4       maxWords  = PG_GETARG_INT32(6);
+ 
+   	SET_FUNCOID();
+    	cfg = findcfg(name2id_cfg(name));
+    	prsobj = findprs(cfg->prs_id);
+ 
+    	textdata = VARDATA(in);
+    	textlen  = VARSIZE(in) - VARHDRSZ;
+    	/* build the document representation that Cover() walks below */
+    	doc = get_docrep(t, query, &doclen);
+   	if (!doc)
+    	{
+ 		pfree(covers);
+   		PG_FREE_IF_COPY(name, 0);
+   		PG_FREE_IF_COPY(t, 1);
+   		PG_FREE_IF_COPY(in, 2);
+  		PG_FREE_IF_COPY(query, 3);
+  		/* no document representation: return a 4-byte text (assumes 4 == VARHDRSZ, i.e. an empty string - TODO confirm) */
+  		out = (text*) palloc(4);
+ 		SET_VARSIZE(out, 4);
+ 		PG_RETURN_POINTER(out);
+  	}
+ 
+ 	/* headline state: 32 word slots to start with, "<b>" / "</b>" as startsel / stopsel */
+   	memset(&prs, 0, sizeof(HLPRSTEXT));
+   	prs.lenwords = 32;
+    	prs.words = (HLWORD *) palloc(sizeof(HLWORD) * prs.lenwords);
+    	prs.startsel = "<b>";
+    	prs.stopsel  = "</b>";
+    	prs.startsellen = strlen(prs.startsel);
+    	prs.stopsellen = strlen(prs.stopsel);
+ 	/* start the parser on the raw document text; add_cover_to_hl() pulls tokens from it later */
+    	prsobj->prs = (void *) DatumGetPointer(
+    											FunctionCall2(
+ 															&(prsobj->start_info),
+ 															PointerGetDatum(textdata),
+ 															Int32GetDatum(textlen)));
+ 
+ 	LexizeInit(&ldata, cfg);
+ 
+    	MemSet(&ext, 0, sizeof(Extention));
+   	/* record every cover Cover() reports, doubling the array whenever it is full */
+    	while (Cover(doc, doclen, query, &ext))
+    	{
+   		if (numcovers >= maxcovers)
+ 		{
+ 	 		maxcovers *= 2;
+ 			covers     = repalloc(covers, sizeof(struct coverpos) * maxcovers);
+ 		}   
+ 		covers[numcovers].startpos = ext.p;
+ 		covers[numcovers].endpos   = ext.q;
+ 		covers[numcovers].in       = 0;
+  		numcovers ++;
+   	}
+    	/* pick the shortest not-yet-picked cover, repeatedly, while more than mincoverSize words of budget remain */
+    	while (maxWords - numWords > mincoverSize)
+    	{
+ 		min = 9999999;/* XXX - sentinel: covers spanning 9999999 positions or more are never picked */
+  		for (i = 0; i < numcovers; i ++)
+ 		{
+ 			coverlen = covers[i].endpos - covers[i].startpos + 1;
+ 			if (!covers[i].in && min > coverlen)
+ 			{
+ 				min  = coverlen; 
+ 				minI = i;
+ 			}   
+ 		}
+ 		if (min < 9999999)
+ 		{
+ 			covers[minI].in = 1;
+ 			/* adjust the size of the cover (len = its length, numWords = positions used so far)
+ 			* if maxcoverSize >= len 
+ 			*      then widen by (maxstretch - len)/2 on each side, maxstretch = min(maxcoverSize, maxWords - numWords)
+ 			* if maxcoverSize < len 
+ 			*      then headline from startpos to startpos + maxcoverSize 
+ 			*      (ensures starting lexeme is in the headline)
+ 			*/         
+ 			/* start from the picked cover's own bounds */
+ 			startpos = covers[minI].startpos;
+ 			endpos   = covers[minI].endpos;
+ 			coverlen = endpos - startpos + 1;
+ 	
+ 			/* truncate the cover if it exceeds the remaining budget; NOTE(review): the inclusive span becomes maxWords - numWords + 1 */
+ 			if(numWords + coverlen > maxWords)
+ 				endpos = startpos + maxWords - numWords;
+ 			else
+ 			{
+ 				if (maxcoverSize >= coverlen)
+ 				{
+ 					/* what is the max we can stretch: min of 
+ 					* 1. maxcoverSize
+ 					* 2. maxWords - numWords
+ 					*/
+ 					if (maxcoverSize > maxWords - numWords)
+ 						maxstretch = maxWords - numWords;
+ 					else    
+ 						maxstretch = maxcoverSize;
+ 	
+ 					/* divide the stretch on both sides of cover */
+ 					startpos -= (maxstretch - coverlen)/2;
+ 					endpos   += (maxstretch - coverlen)/2;
+ 					if (startpos < 1)	/* clamp: the stretched start is never below position 1 */
+ 						startpos = 1;
+ 					/* XXX - do we need to check whether endpos crosses the document?
+ 					* add_cover_to_hl returns when the parser runs out of tokens or
+ 					* endpos is reached.
+ 					* Dropping this check for the time being 
+ 					*/	
+ 				}       
+ 				else if (maxcoverSize < coverlen)
+ 					endpos   = startpos + maxcoverSize;	/* NOTE(review): inclusive span is maxcoverSize + 1 positions */
+ 			}
+ 			covers[minI].startpos = startpos;
+ 			covers[minI].endpos   = endpos;
+ 			numWords += endpos - startpos + 1;	/* both ends are counted */
+  		}
+        	else 
+ 			break;
+   	}   
+ 
+    	/* render the picked covers in covers[] order (the order Cover() produced them) */
+    	numWords = 0;	/* NOTE(review): not updated in the loop below, so its numWords < maxWords test only matters when maxWords <= 0 */
+ 	for (i = 0; i < numcovers && numWords < maxWords; i++)
+ 	{
+ 		if (covers[i].in)
+ 			add_cover_to_hl(prsobj, &prs, query, &ldata, &currentpos, covers[i].startpos, covers[i].endpos);
+ 	}
+    
+    	FunctionCall1(
+ 					&(prsobj->end_info),
+ 					PointerGetDatum(prsobj->prs)
+ 				);
+    
+    	out = genhl(&prs);
+    	/* clean up; NOTE(review): doc returned by get_docrep() is not freed anywhere in this function */
+   	pfree(covers);
+    	PG_FREE_IF_COPY(name, 0);
+    	PG_FREE_IF_COPY(t, 1);
+    	PG_FREE_IF_COPY(in, 2);
+    	PG_FREE_IF_COPY(query, 3);
+ 	
+    	PG_RETURN_POINTER(out);
+ }
---------------------------(end of broadcast)---------------------------
TIP 6: explain analyze is your friend