Search Postgresql Archives

Per-document document statistics

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




I'm trying to figure out how to compile text search statistics on a per-document basis.
While I successfully compute text search statistics for the entire corpus with a call
to ts_stat after having inserted all documents, what I also want is to run ts_stat on
the tsvector for each row so as to get the term frequency per document.

Sample code and comments follow.


-- Dumped from database version 9.5.7

-- Create a dedicated database for the example.
CREATE DATABASE nlp;

-- psql meta-command: switch the session to the new database so that
-- everything below is created inside it.
\connect nlp

-- Make sure PL/pgSQL is available; the trigger functions below are
-- declared with LANGUAGE plpgsql.
CREATE EXTENSION IF NOT EXISTS plpgsql WITH SCHEMA pg_catalog;

-- Unqualified object names in the rest of the script are looked up in
-- "public" first, then in pg_catalog.
SET search_path = public, pg_catalog;

-- Corpus table: one row per document.
--   document_id     surrogate key, assigned from a sequence
--   content         the raw document text (required)
--   document_vector the tsvector for the content; nullable in the DDL

CREATE TABLE document (
    document_id     SERIAL,
    content         TEXT NOT NULL,
    document_vector TSVECTOR,
    CONSTRAINT document_pkey PRIMARY KEY (document_id)
);

-- This is the table I need help with how to populate
-- with term frequency per document.
--
-- It holds one row per (document, word) pair, so the primary key must
-- cover both columns. A key on document_id alone would allow only a
-- single word to be recorded for each document.

CREATE TABLE document_statistics (
    document_id integer NOT NULL,
    word text NOT NULL,
    ndoc bigint, /* this will be one, since there is only one document */
    nentry bigint, /* this is the number of interest */
    CONSTRAINT document_statistics_pkey PRIMARY KEY (document_id, word)
);


-- Every statistics row must point at an existing document.
ALTER TABLE ONLY document_statistics
    ADD CONSTRAINT document_statistics_document_id_fkey
        FOREIGN KEY (document_id) REFERENCES document (document_id);


-- Row-level trigger function: derives the row's search vector from its
-- text. Returns the (modified) NEW row so the computed vector is what
-- gets stored.
CREATE FUNCTION document_bit() RETURNS trigger
    LANGUAGE plpgsql
    AS $body$
BEGIN
    -- The 'simple' text search configuration is used to parse the content.
    NEW.document_vector := to_tsvector('simple', NEW.content);
    RETURN NEW;
END;
$body$;

-- Run document_bit() for each row before it is inserted or updated.
CREATE TRIGGER document_bit
    BEFORE INSERT OR UPDATE
    ON document
    FOR EACH ROW
    EXECUTE PROCEDURE document_bit();


-- Sample data
-- (The doubled quote in coo''s is an escaped apostrophe. The VALUES
-- list must not end with a comma, or the statement fails to parse.)

INSERT INTO document (content) VALUES ('Hello World!');
INSERT INTO document (content) VALUES ('The quick brown dog jumped over the lazy dog');
INSERT INTO document (content) VALUES ('One flew over the coo coo''s nest');

-- Once all the individual documents are inserted, then
-- calculate overall corpus statistics.
--
-- ts_stat() returns one row per distinct word with the columns
-- (word text, ndoc integer, nentry integer). The target table is
-- created here if the script has not defined it yet.

CREATE TABLE IF NOT EXISTS corpus_statistics (
    word text PRIMARY KEY,
    ndoc integer NOT NULL,   -- number of documents containing the word
    nentry integer NOT NULL  -- total occurrences of the word in the corpus
);

INSERT INTO corpus_statistics (word, ndoc, nentry)
SELECT word, ndoc, nentry
FROM ts_stat('select document_vector from document');


-- After-row trigger function that records the term frequency of the
-- document that was just written.
--
-- ts_stat() takes the text of a query returning a single tsvector
-- column; it does not take a tsvector value. The row's own vector is
-- therefore embedded in that query as a quoted literal (format's %L
-- handles the quoting) and cast back to tsvector, so ts_stat() sees
-- exactly one document.

-- The function inserts one row per distinct word of a document, so the
-- key of document_statistics has to be (document_id, word). These two
-- statements are safe to run whether or not the key already has that shape.
ALTER TABLE document_statistics
    DROP CONSTRAINT IF EXISTS document_statistics_pkey;
ALTER TABLE document_statistics
    ADD CONSTRAINT document_statistics_pkey PRIMARY KEY (document_id, word);

CREATE FUNCTION document_ait() RETURNS trigger
    LANGUAGE plpgsql
    AS $$
BEGIN
    -- If this function is ever fired for an UPDATE, discard the rows
    -- computed from the previous content before inserting fresh ones.
    IF TG_OP = 'UPDATE' THEN
        DELETE FROM public.document_statistics
        WHERE document_id = OLD.document_id;
    END IF;

    -- One row per distinct word; ndoc is 1 because ts_stat() is fed a
    -- single document, nentry is the word's occurrence count in it.
    INSERT INTO public.document_statistics (document_id, word, ndoc, nentry)
    SELECT
        NEW.document_id,
        stat.word,
        stat.ndoc,
        stat.nentry
    FROM ts_stat(format('SELECT %L::tsvector', NEW.document_vector)) AS stat;

    -- The return value of an AFTER row trigger is ignored.
    RETURN NEW;
END;
$$;

-- Fire on UPDATE as well as INSERT: the BEFORE INSERT OR UPDATE trigger
-- on this table recomputes document_vector on every update, so anything
-- derived from the vector needs the same set of events to stay current.
CREATE TRIGGER document_ait
    AFTER INSERT OR UPDATE ON document
    FOR EACH ROW EXECUTE PROCEDURE document_ait();




[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Index of Archives]     [Postgresql Jobs]     [Postgresql Admin]     [Postgresql Performance]     [Linux Clusters]     [PHP Home]     [PHP on Windows]     [Kernel Newbies]     [PHP Classes]     [PHP Books]     [PHP Databases]     [Postgresql & PHP]     [Yosemite]

  Powered by Linux