Michael Fuhr wrote:
On Mon, Nov 21, 2005 at 11:03:22AM -0500, Berend Tober wrote:
I'm interested in defining a covariance aggregate function. (As a
...aggregates must take a single value, so the above won't
work as written. However, in PostgreSQL 8.0 or later you could
define the aggregate's base type to be a composite type ...
Thanks for you help. Here is what I came up with, using the existing
POINT data type, which when you think about it makes a lot of sense, and
even thoug having to explicitly cast a pair of columns as a point type
is a little cludgy, I'm get the analytical tool I need, regardless.
CREATE TYPE public._regression AS (
n integer,
x double precision,
y double precision,
xy double precision
);
CREATE OR REPLACE FUNCTION public.weighted_average(double precision,
double precision, double precision)
RETURNS double precision AS '
BEGIN
IF $1>1.0 THEN
RAISE EXCEPTION ''Weighted average % coefficient exceeds
unity.'', $1;
ELSIF $1<0 THEN
RAISE EXCEPTION ''Weighted average % coefficient less than
zero.'', $1;
END IF;
RETURN $1*$2 + (1.-$1)*$3;
END;
'LANGUAGE 'plpgsql' IMMUTABLE STRICT;
CREATE OR REPLACE FUNCTION public.regression_accum(_regression, point)
RETURNS _regression AS '
SELECT
$1.n+1,
weighted_average($1.n/($1.n+1.), $1.x, $2[0]),
weighted_average($1.n/($1.n+1.), $1.y, $2[1]),
weighted_average($1.n/($1.n+1.), $1.xy, $2[0]*$2[1]);
'LANGUAGE 'sql' IMMUTABLE STRICT;
GRANT EXECUTE ON FUNCTION public.regression_accum(_regression, point) TO
public;
COMMENT ON FUNCTION public.regression_accum(_regression, point)
IS 'regression aggregate transition function';
CREATE OR REPLACE FUNCTION public.covariance(_regression)
RETURNS double precision AS '
BEGIN
IF $1.n = 0 THEN
RAISE EXCEPTION ''No covariance data'';
END IF;
RETURN $1.xy - $1.x*$1.y;
END;
'LANGUAGE 'plpgsql' IMMUTABLE STRICT;
GRANT EXECUTE ON FUNCTION public.covariance(_regression) TO public;
COMMENT ON FUNCTION public.covariance(_regression)
IS 'cov aggregate final function';
CREATE AGGREGATE public.cov(
BASETYPE=point,
SFUNC=regression_accum,
STYPE=_regression,
FINALFUNC=covariance,
INITCOND='(0,0,0,0)'
);
-- Examples
CREATE LOCAL TEMPORARY TABLE test (
example integer,
x numeric,
y numeric
) without oids;
-- Hand calculated: cov(x,y)=0.25
INSERT INTO test VALUES (1, 0.0, 0.0);
INSERT INTO test VALUES (1, 1.0, 1.0);
-- Hand calculated: cov(x,x)=0.389, cov(y,y)=1.556, cov(x,y)=0.778
INSERT INTO test VALUES (2, 0.0, 0.0);
INSERT INTO test VALUES (2, 1.0, 2.0);
INSERT INTO test VALUES (2, 1.5, 3.0);
-- Hand calculated: cov(x,x)=0.264, cov(y,y)=0.192, cov(x,y)=0.214
INSERT INTO test VALUES (3, 0.25, 0.125);
INSERT INTO test VALUES (3, 1.0, 1.0);
INSERT INTO test VALUES (3, 1.5, 1.1);
--
http://www.quantlet.com/mdstat/scripts/mva/htmlbook/mvahtmlnode22.html:
cov(x,y) = -80.02
INSERT INTO test VALUES (4, 230, 125);
INSERT INTO test VALUES (4, 181, 99);
INSERT INTO test VALUES (4, 165, 97);
INSERT INTO test VALUES (4, 150, 115);
INSERT INTO test VALUES (4, 97, 120);
INSERT INTO test VALUES (4, 192, 100);
INSERT INTO test VALUES (4, 181, 80);
INSERT INTO test VALUES (4, 189, 90);
INSERT INTO test VALUES (4, 172, 95);
INSERT INTO test VALUES (4, 170, 125);
/*
Note that the value COV(X,X) and COV(Y,Y) do not equal output from
the native Postgresql aggregates VARIANCE(x) and VARIANCE(y),
respectively. I cannot explain why they are different, but COV()
is consistent with direct calculation.
*/
SELECT
'Example '||example AS title,
COUNT(*) AS N,
COV(POINT(x,x)) AS covXX,
COV(POINT(y,y)) AS covYY,
COV(POINT(x,y)) AS covXY,
VARIANCE(x),
VARIANCE(y)
FROM test
GROUP BY 1
ORDER BY 1;