Re: Multi-parameter aggregates.

Berend Tober <btober@xxxxxxxxxxxxxxxx> · Fri, 25 Nov 2005 05:13:34 -0500

Michael Fuhr wrote:

On Mon, Nov 21, 2005 at 11:03:22AM -0500, Berend Tober wrote:

I'm interested in defining a covariance aggregate function. (As a 

...aggregates must take a single value, so the above won't
work as written.  However, in PostgreSQL 8.0 or later you could
define the aggregate's base type to be a composite type ...

Thanks for you help. Here is what I came up with, using the existing 
POINT data type, which when you think about it makes a lot of sense, and 
even thoug having to explicitly cast a pair of columns as a point type 
is a little cludgy, I'm get the analytical tool I need, regardless.

CREATE TYPE public._regression AS (
   n integer,
   x double precision,
   y double precision,
   xy double precision
   );

CREATE OR REPLACE FUNCTION public.weighted_average(double precision, 
double precision, double precision)
   RETURNS double precision AS '
   BEGIN
   IF $1>1.0 THEN
       RAISE EXCEPTION ''Weighted average % coefficient exceeds 
unity.'', $1;
   ELSIF $1<0 THEN
       RAISE EXCEPTION ''Weighted average % coefficient less than 
zero.'', $1;
   END IF;
   RETURN $1*$2 + (1.-$1)*$3;
   END;
   'LANGUAGE 'plpgsql' IMMUTABLE STRICT;

CREATE OR REPLACE FUNCTION public.regression_accum(_regression, point)
   RETURNS _regression AS '
   SELECT
       $1.n+1,
       weighted_average($1.n/($1.n+1.), $1.x, $2[0]),
       weighted_average($1.n/($1.n+1.), $1.y, $2[1]),
       weighted_average($1.n/($1.n+1.), $1.xy, $2[0]*$2[1]);
   'LANGUAGE 'sql' IMMUTABLE STRICT;
GRANT EXECUTE ON FUNCTION public.regression_accum(_regression, point) TO 
public;
COMMENT ON FUNCTION public.regression_accum(_regression, point)
   IS 'regression aggregate transition function';

CREATE OR REPLACE FUNCTION public.covariance(_regression)
   RETURNS double precision AS '
   BEGIN
   IF $1.n = 0 THEN
       RAISE EXCEPTION ''No covariance data'';
   END IF;
   RETURN $1.xy - $1.x*$1.y;
   END;
   'LANGUAGE 'plpgsql' IMMUTABLE STRICT;
GRANT EXECUTE ON FUNCTION public.covariance(_regression) TO public;
COMMENT ON FUNCTION public.covariance(_regression)
   IS 'cov aggregate final function';

CREATE AGGREGATE public.cov(
 BASETYPE=point,
 SFUNC=regression_accum,
 STYPE=_regression,
 FINALFUNC=covariance,
 INITCOND='(0,0,0,0)'
);

-- Examples

CREATE LOCAL TEMPORARY TABLE test (
   example integer,
   x numeric,
   y numeric
) without oids;

-- Hand calculated: cov(x,y)=0.25
INSERT INTO test VALUES (1, 0.0, 0.0);
INSERT INTO test VALUES (1, 1.0, 1.0);

-- Hand calculated: cov(x,x)=0.389, cov(y,y)=1.556, cov(x,y)=0.778
INSERT INTO test VALUES (2, 0.0, 0.0);
INSERT INTO test VALUES (2, 1.0, 2.0);
INSERT INTO test VALUES (2, 1.5, 3.0);

-- Hand calculated: cov(x,x)=0.264, cov(y,y)=0.192, cov(x,y)=0.214
INSERT INTO test VALUES (3, 0.25, 0.125);
INSERT INTO test VALUES (3, 1.0, 1.0);
INSERT INTO test VALUES (3, 1.5, 1.1);

--
http://www.quantlet.com/mdstat/scripts/mva/htmlbook/mvahtmlnode22.html: 
cov(x,y) = -80.02
INSERT INTO test VALUES (4, 230, 125);
INSERT INTO test VALUES (4, 181,  99);
INSERT INTO test VALUES (4, 165,  97);
INSERT INTO test VALUES (4, 150, 115);
INSERT INTO test VALUES (4,  97, 120);
INSERT INTO test VALUES (4, 192, 100);
INSERT INTO test VALUES (4, 181,  80);
INSERT INTO test VALUES (4, 189,  90);
INSERT INTO test VALUES (4, 172,  95);
INSERT INTO test VALUES (4, 170, 125);

/*
Note that the value COV(X,X) and COV(Y,Y) do not equal output from
the native Postgresql aggregates VARIANCE(x) and VARIANCE(y),
respectively. I cannot explain why they are different, but COV()
is consistent with direct calculation.
*/

SELECT
   'Example '||example AS title,
   COUNT(*) AS N,
   COV(POINT(x,x)) AS covXX,
   COV(POINT(y,y)) AS covYY,
   COV(POINT(x,y)) AS covXY,
   VARIANCE(x),
   VARIANCE(y)
   FROM test
   GROUP BY 1
   ORDER BY 1;