Thanks, Pierre, for porting this! I just tested it with my application and it works. There was one small bug: startHL has to be reset to 0 for each chosen cover. I fixed that and attached the new patch. Teodor did not want a separate function; he wanted this as an extension to ts_headline. One way to do that would be to invoke the new code only when options like MaxCoverSize are used. It would be slightly ugly, though. The patch still seems to have bugs; I will try to clean them up. -Sushant. On Thu, 2008-05-22 at 13:31 +0200, Pierre-Yves Strub wrote: > Hi, > > I've ported the patch of Sushant Sinha for fragmented headlines to pg8.3.1 > (http://archives.postgresql.org/pgsql-general/2007-11/msg00508.php) > > W.r.t, http://archives.postgresql.org/pgsql-general/2008-03/msg00806.php > I can continue the work until this becomes an acceptable patch for pg. > > Pierre-yves.
diff -Nurb postgresql-8.3.1/contrib/tsearch2/tsearch2.c postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.c --- postgresql-8.3.1/contrib/tsearch2/tsearch2.c 2008-01-01 14:45:45.000000000 -0500 +++ postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.c 2008-05-22 18:44:16.000000000 -0400 @@ -82,6 +82,7 @@ Datum tsa_to_tsquery_name(PG_FUNCTION_ARGS); Datum tsa_plainto_tsquery_name(PG_FUNCTION_ARGS); Datum tsa_headline_byname(PG_FUNCTION_ARGS); +Datum tsa_headline_with_fragments(PG_FUNCTION_ARGS); Datum tsa_ts_stat(PG_FUNCTION_ARGS); Datum tsa_tsearch2(PG_FUNCTION_ARGS); Datum tsa_rewrite_accum(PG_FUNCTION_ARGS); @@ -101,6 +102,7 @@ PG_FUNCTION_INFO_V1(tsa_to_tsquery_name); PG_FUNCTION_INFO_V1(tsa_plainto_tsquery_name); PG_FUNCTION_INFO_V1(tsa_headline_byname); +PG_FUNCTION_INFO_V1(tsa_headline_with_fragments); PG_FUNCTION_INFO_V1(tsa_ts_stat); PG_FUNCTION_INFO_V1(tsa_tsearch2); PG_FUNCTION_INFO_V1(tsa_rewrite_accum); @@ -358,6 +360,24 @@ return result; } +/* tsa_headline_with_fragments(text, tsvector, text, tsquery, text) */ +Datum +tsa_headline_with_fragments(PG_FUNCTION_ARGS) +{ + text *cfgname = PG_GETARG_TEXT_P(0); + Datum arg1 = PG_GETARG_DATUM(1); + Datum arg2 = PG_GETARG_DATUM(2); + Datum arg3 = PG_GETARG_DATUM(3); + Datum arg4 = PG_GETARG_DATUM(4); + Oid config_oid; + + config_oid = TextGetObjectId(regconfigin, cfgname); + + return DirectFunctionCall5(ts_headline_with_fragments, + ObjectIdGetDatum(config_oid), + arg1, arg2, arg3, arg4); +} + /* * tsearch2 version of update trigger * diff -Nurb postgresql-8.3.1/contrib/tsearch2/tsearch2.sql.in postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.sql.in --- postgresql-8.3.1/contrib/tsearch2/tsearch2.sql.in 2007-11-28 14:33:04.000000000 -0500 +++ postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.sql.in 2008-05-22 18:44:16.000000000 -0400 @@ -384,6 +384,11 @@ LANGUAGE INTERNAL RETURNS NULL ON NULL INPUT IMMUTABLE; +CREATE FUNCTION headline_with_fragments(text, text, tsquery, text) + RETURNS text + AS 'MODULE_PATHNAME', 
'tsa_headline_with_fragments' + LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE; + -- CREATE the OPERATOR class CREATE OPERATOR CLASS gist_tsvector_ops FOR TYPE tsvector USING gist diff -Nurb postgresql-8.3.1/src/backend/tsearch/ts_parse.c postgresql-8.3.1-orig/src/backend/tsearch/ts_parse.c --- postgresql-8.3.1/src/backend/tsearch/ts_parse.c 2008-01-01 14:45:52.000000000 -0500 +++ postgresql-8.3.1-orig/src/backend/tsearch/ts_parse.c 2008-05-22 18:49:53.000000000 -0400 @@ -578,6 +578,112 @@ FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); } +#define COVER_SEP "..." +#define COVER_SEP_LEN (sizeof(COVER_SEP)-1) + +void +hlparsetext_with_covers(Oid cfgId, + HeadlineParsedText *prs, + TSQuery query, + text *in, + struct coverpos *covers, + int4 numcovers) +{ + TSParserCacheEntry *prsobj; + TSConfigCacheEntry *cfg; + void *prsdata; + LexizeData ldata; + int4 icover, startpos, endpos, currentpos = 0; + + char *lemm = NULL; + int4 lenlemm = 0; + ParsedLex *lexs; + int4 type, startHL = 0; + TSLexeme *norms; + int4 oldnumwords, newnumwords, i; + + cfg = lookup_ts_config_cache(cfgId); + prsobj = lookup_ts_parser_cache(cfg->prsId); + + prsdata = (void*) DatumGetPointer(FunctionCall2(&(prsobj->prsstart), + PointerGetDatum(VARDATA(in)), + Int32GetDatum(VARSIZE(in) - VARHDRSZ))); + + LexizeInit(&ldata, cfg); + + for (icover = 0; icover < numcovers; icover++) + { + if (!covers[icover].in) + continue; + + startpos = covers[icover].startpos; + endpos = covers[icover].endpos; + + if (currentpos > endpos) + { + /* XXX - something wrong ... 
we have gone past the cover */ + continue; + } + + /* see if we need to add a cover seperator */ + if (currentpos < startpos && startpos > 0) + { + hladdword(prs, COVER_SEP, COVER_SEP_LEN, 3); + prs->words[prs->curwords - 1].in = 1; + } + /* add words to the headline only when currentpos crosses the startpos */ + startHL = 0; + do + { + type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken), + PointerGetDatum(prsdata), + PointerGetDatum(&lemm), + PointerGetDatum(&lenlemm))); + + LexizeAddLemm(&ldata, type, lemm, lenlemm); + + do + { + if ((norms = LexizeExec(&ldata, &lexs)) != NULL) + { + TSLexeme *ptr = norms; + + currentpos += 1; + while (ptr->lexeme) + { + if (ptr->flags & TSL_ADDPOS) + currentpos += 1; + ptr++; + } + } + + // start check + if (!startHL && currentpos >= startpos) + startHL = 1; + + if (startHL) + { + oldnumwords = prs->curwords; + addHLParsedLex(prs, query, lexs, norms); + newnumwords = prs->curwords; + + for (i = oldnumwords; i < newnumwords; i++) + { + prs->words[i].in = 1; + if (prs->words[i].item) + prs->words[i].selected = 1; + } + } + } while(norms && currentpos < endpos); + + if (currentpos >= endpos) + break; + } while (type > 0); + } + + FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata)); +} + text * generateHeadline(HeadlineParsedText *prs) { diff -Nurb postgresql-8.3.1/src/backend/tsearch/wparser.c postgresql-8.3.1-orig/src/backend/tsearch/wparser.c --- postgresql-8.3.1/src/backend/tsearch/wparser.c 2008-01-15 13:22:47.000000000 -0500 +++ postgresql-8.3.1-orig/src/backend/tsearch/wparser.c 2008-05-22 18:44:16.000000000 -0400 @@ -370,3 +370,34 @@ PG_GETARG_DATUM(1), PG_GETARG_DATUM(2))); } + +Datum +ts_headline_with_fragments(PG_FUNCTION_ARGS) +{ + text *opt = (PG_NARGS() > 3 && PG_GETARG_POINTER(3)) ? 
PG_GETARG_TEXT_P(3) : NULL; + HeadlineParsedText *prs; + List *prsoptions; + text *out; + + if (opt) + prsoptions = deserialize_deflist(PointerGetDatum(opt)); + else + prsoptions = NIL; + + prs = (HeadlineParsedText*) DatumGetPointer(DirectFunctionCall4(prsd_headline_with_fragments, + PG_GETARG_DATUM(0), + PG_GETARG_DATUM(1), + PG_GETARG_DATUM(2), + PointerGetDatum(prsoptions))); + + out = generateHeadline(prs); + + if (opt) + PG_FREE_IF_COPY(opt, 3); + pfree(prs->words); + pfree(prs->startsel); + pfree(prs->stopsel); + pfree(prs); + + PG_RETURN_POINTER(out); +} diff -Nurb postgresql-8.3.1/src/backend/tsearch/wparser_def.c postgresql-8.3.1-orig/src/backend/tsearch/wparser_def.c --- postgresql-8.3.1/src/backend/tsearch/wparser_def.c 2008-01-01 14:45:52.000000000 -0500 +++ postgresql-8.3.1-orig/src/backend/tsearch/wparser_def.c 2008-05-22 18:44:16.000000000 -0400 @@ -19,6 +19,7 @@ #include "tsearch/ts_public.h" #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" +#include "tsearch/ts_rank.h" #include "utils/builtins.h" @@ -1886,3 +1887,191 @@ PG_RETURN_POINTER(prs); } + +Datum +prsd_headline_with_fragments(PG_FUNCTION_ARGS) +{ + Oid cfgId = PG_GETARG_OID(0); + text *in = PG_GETARG_TEXT_P(1); + TSQuery query = PG_GETARG_TSQUERY(2); + List *prsoptions = (List *) PG_GETARG_POINTER(3); + + TSVector t = (TSVector) DatumGetPointer(DirectFunctionCall2(to_tsvector_byid, + ObjectIdGetDatum(cfgId), + PointerGetDatum(in))); + + ListCell *l; + HeadlineParsedText* prs = NULL; + DocRepresentation* doc; + Extention ext; + int4 coverlen, doclen; + int4 startpos = 0, endpos = 0; + int4 numWords = 0; + QueryRepresentation qr; + int4 i, numcovers = 0, maxcovers = 32, maxstretch; + int maxcoverSize = 20, mincoverSize = 5, maxWords = 40; + int4 min, minI = 0; + + struct coverpos* covers = palloc(maxcovers*sizeof(struct coverpos)); + + prs = (HeadlineParsedText*) palloc0(sizeof(HeadlineParsedText)); + prs->lenwords = 32; + prs->words = (HeadlineWordEntry *) 
palloc(sizeof(HeadlineWordEntry) * prs->lenwords); + prs->startsel = NULL; + prs->stopsel = NULL; + + foreach(l, prsoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + char *val = defGetString(defel); + + if (pg_strcasecmp(defel->defname, "MaxCoverSize") == 0) + maxcoverSize = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "MinCoverSize") == 0) + mincoverSize = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "MaxWords") == 0) + maxWords = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "StartSel") == 0) + prs->startsel = pstrdup(val); + else if (pg_strcasecmp(defel->defname, "StopSel") == 0) + prs->stopsel = pstrdup(val); + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized headline parameter: \"%s\"", + defel->defname))); + } + + if (mincoverSize >= maxcoverSize) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MinCoverSize should be less than MaxCoverSize"))); + if (mincoverSize <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MinCoverSize should be positive"))); + if (maxWords < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MaxWords should be non-negative"))); + + if (!prs->startsel) + prs->startsel = pstrdup("<b>"); + if (!prs->stopsel) + prs->stopsel = pstrdup("</b>"); + prs->startsellen = strlen(prs->startsel); + prs->stopsellen = strlen(prs->stopsel); + + qr.query = query; + qr.operandexist = (bool*) palloc0(sizeof(bool) * query->size); + + /* start generating covers for the query */ + doc = get_docrep(t, &qr, &doclen); + if (!doc) + { + pfree(covers); + pfree(t); + pfree(qr.operandexist); + PG_FREE_IF_COPY(in, 1); + PG_FREE_IF_COPY(query, 2); + PG_FREE_IF_COPY(prsoptions, 3); + + /* cannot do anything */ + PG_RETURN_POINTER(prs); + } + + /* get all covers */ + MemSet(&ext, 0, sizeof(Extention)); + while (Cover(doc, doclen, &qr, &ext)) + { + if (numcovers >= 
maxcovers) + { + maxcovers *= 2; + covers = repalloc(covers, sizeof(struct coverpos) * maxcovers); + } + covers[numcovers].startpos = ext.p; + covers[numcovers].endpos = ext.q; + covers[numcovers].in = 0; + numcovers ++; + } + + /* choose best covers */ + while (maxWords - numWords > mincoverSize) + { + min = 9999999;/* XXX - will not display headlines that exceed 9999999 */ + for (i = 0; i < numcovers; i ++) + { + coverlen = covers[i].endpos - covers[i].startpos + 1; + if (!covers[i].in && min > coverlen) + { + min = coverlen; + minI = i; + } + } + if (min < 9999999) + { + covers[minI].in = 1; + /* adjust the size of cover + * if maxcoverSize >= len + * then headline from ext.p - (maxcoverSize-len)/2 to ext.q + (maxcoverSize-len) /2 + * if maxcoverSize < len + * then headline from ext.p to ext.p + maxcoverSize + * (ensures starting lexeme is in the headline) + */ + /* cut down endpos if it crosses maxWords */ + startpos = covers[minI].startpos; + endpos = covers[minI].endpos; + coverlen = endpos - startpos + 1; + + /* truncate the cover if it exceeds max words */ + if(numWords + coverlen > maxWords) + endpos = startpos + maxWords - numWords; + else + { + if (maxcoverSize >= coverlen) + { + /* what is the max we can stretch: min of + * 1. maxcoverSize + * 2. maxWords - numWords + */ + if (maxcoverSize > maxWords - numWords) + maxstretch = maxWords - numWords; + else + maxstretch = maxcoverSize; + + /* divide the stretch on both sides of cover */ + startpos -= (maxstretch - coverlen)/2; + endpos += (maxstretch - coverlen)/2; + if (startpos < 1) + startpos = 1; + /* XXX - do we need to check whether endpos crosses the document + * the other function would return if the document ends or the + * endpos is reached. 
+ * Dropping this check for time being + */ + } + else if (maxcoverSize < coverlen) + endpos = startpos + maxcoverSize; + } + covers[minI].startpos = startpos; + covers[minI].endpos = endpos; + numWords += endpos - startpos + 1; + } + else + break; + } + + /* Render the headline */ + if (maxWords > 0) + hlparsetext_with_covers(cfgId, prs, query, in, covers, numcovers); + + /* clean up */ + pfree(covers); + pfree(t); + pfree(qr.operandexist); + PG_FREE_IF_COPY(in, 1); + PG_FREE_IF_COPY(query, 2); + PG_FREE_IF_COPY(prsoptions, 3); + + PG_RETURN_POINTER(prs); +} diff -Nurb postgresql-8.3.1/src/backend/utils/adt/tsrank.c postgresql-8.3.1-orig/src/backend/utils/adt/tsrank.c --- postgresql-8.3.1/src/backend/utils/adt/tsrank.c 2008-01-01 14:45:53.000000000 -0500 +++ postgresql-8.3.1-orig/src/backend/utils/adt/tsrank.c 2008-05-22 18:44:16.000000000 -0400 @@ -17,6 +17,7 @@ #include "tsearch/ts_type.h" #include "tsearch/ts_utils.h" +#include "tsearch/ts_rank.h" #include "utils/array.h" #include "miscadmin.h" @@ -463,14 +464,6 @@ PG_RETURN_FLOAT4(res); } -typedef struct -{ - QueryItem **item; - int16 nitem; - uint8 wclass; - int32 pos; -} DocRepresentation; - static int compareDocR(const void *va, const void *vb) { @@ -482,12 +475,6 @@ return (a->pos > b->pos) ? 
1 : -1; } -typedef struct -{ - TSQuery query; - bool *operandexist; -} QueryRepresentation; - #define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] ) #define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true @@ -499,17 +486,7 @@ return QR_GET_OPERAND_EXISTS(qr, val); } -typedef struct -{ - int pos; - int p; - int q; - DocRepresentation *begin; - DocRepresentation *end; -} Extention; - - -static bool +bool Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, Extention *ext) { DocRepresentation *ptr; @@ -590,7 +567,7 @@ return Cover(doc, len, qr, ext); } -static DocRepresentation * +DocRepresentation * get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen) { QueryItem *item = GETQUERY(qr->query); diff -Nurb postgresql-8.3.1/src/include/tsearch/ts_rank.h postgresql-8.3.1-orig/src/include/tsearch/ts_rank.h --- postgresql-8.3.1/src/include/tsearch/ts_rank.h 1969-12-31 19:00:00.000000000 -0500 +++ postgresql-8.3.1-orig/src/include/tsearch/ts_rank.h 2008-05-22 18:44:16.000000000 -0400 @@ -0,0 +1,36 @@ +#ifndef __TSRANK_H__ +#define __TSRANK_H__ + +#include "ts_type.h" +#include "ts_cache.h" + +typedef struct +{ + QueryItem **item; + int16 nitem; + uint8 wclass; + int32 pos; +} DocRepresentation; + +typedef struct +{ + TSQuery query; + bool *operandexist; +} QueryRepresentation; + +typedef struct +{ + int pos; + int p; + int q; + DocRepresentation *begin; + DocRepresentation *end; +} Extention; + +bool +Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, Extention *ext); + +DocRepresentation * +get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen); + +#endif /* __TSRANK_H__ */ diff -Nurb postgresql-8.3.1/src/include/tsearch/ts_utils.h postgresql-8.3.1-orig/src/include/tsearch/ts_utils.h --- postgresql-8.3.1/src/include/tsearch/ts_utils.h 2008-01-01 14:45:59.000000000 -0500 +++ postgresql-8.3.1-orig/src/include/tsearch/ts_utils.h 2008-05-22 18:44:16.000000000 -0400 @@ -14,6 
+14,7 @@ #include "tsearch/ts_type.h" #include "tsearch/ts_public.h" +#include "tsearch/ts_rank.h" #include "nodes/pg_list.h" /* @@ -95,8 +96,25 @@ * 3 generateHeadline to generate result text */ +struct coverpos +{ + int4 startpos; + int4 endpos; + int4 in; +}; + extern void hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query, char *buf, int4 buflen); + + +extern void +hlparsetext_with_covers(Oid cfgId, + HeadlineParsedText *prs, + TSQuery query, + text *in, + struct coverpos *covers, + int4 numcovers); + extern text *generateHeadline(HeadlineParsedText *prs); /* @@ -227,6 +245,7 @@ extern Datum prsd_end(PG_FUNCTION_ARGS); extern Datum prsd_headline(PG_FUNCTION_ARGS); extern Datum prsd_lextype(PG_FUNCTION_ARGS); +extern Datum prsd_headline_with_fragments(PG_FUNCTION_ARGS); /* * Dictionary interface to SQL @@ -264,6 +283,7 @@ extern Datum ts_headline_byid(PG_FUNCTION_ARGS); extern Datum ts_headline(PG_FUNCTION_ARGS); extern Datum ts_headline_opt(PG_FUNCTION_ARGS); +extern Datum ts_headline_with_fragments(PG_FUNCTION_ARGS); /* * current cfg diff -Nurb postgresql-8.3.1/src/interfaces/libpq/libpq.rc postgresql-8.3.1-orig/src/interfaces/libpq/libpq.rc --- postgresql-8.3.1/src/interfaces/libpq/libpq.rc 2008-03-14 23:24:54.000000000 -0400 +++ postgresql-8.3.1-orig/src/interfaces/libpq/libpq.rc 2008-05-22 18:44:16.000000000 -0400 @@ -1,8 +1,8 @@ #include <winver.h> VS_VERSION_INFO VERSIONINFO - FILEVERSION 8,3,1,8075 - PRODUCTVERSION 8,3,1,8075 + FILEVERSION 8,3,1,8143 + PRODUCTVERSION 8,3,1,8143 FILEFLAGSMASK 0x3fL FILEFLAGS 0 FILEOS VOS__WINDOWS32