Re: a JOIN on same table, but 'slided over'

PFC <lists@xxxxxxxxxx> · Wed, 27 Jun 2007 00:00:48 +0200



	OK, check...

test=> CREATE TABLE test (id INTEGER PRIMARY KEY);
test=> INSERT INTO test SELECT random()*5 + n*10 FROM  
generate_series( 1,100000 ) AS n;
test=> SELECT * FROM test LIMIT 10;
  id
-----
  11
  23
  31
  41
  52
  63
  70
  85
  94
 103

test=> ANALYZE test;
ANALYZE

	-- Self Join 1

test=> EXPLAIN ANALYZE SELECT t1.id AS current_id, t2.id AS next_id
FROM test t1, test t2
WHERE t2.id = ( SELECT min(id) FROM test AS t3 WHERE t3.id > t1.id )
ORDER BY t1.id;
                                                                        QUERY  
PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=26703.19..26953.19 rows=100000 width=8) (actual  
time=5240.392..5271.529 rows=99999 loops=1)
   Sort Key: t1.id
   ->  Hash Join  (cost=2691.00..18398.37 rows=100000 width=8) (actual  
time=106.588..5179.737 rows=99999 loops=1)
         Hash Cond: ((subplan) = t2.id)
         ->  Seq Scan on test t1  (cost=0.00..1441.00 rows=100000 width=4)  
(actual time=0.013..34.782 rows=100000 loops=1)
         ->  Hash  (cost=1441.00..1441.00 rows=100000 width=4) (actual  
time=106.420..106.420 rows=100000 loops=1)
               ->  Seq Scan on test t2  (cost=0.00..1441.00 rows=100000  
width=4) (actual time=0.007..43.077 rows=100000 loops=1)
         SubPlan
           ->  Result  (cost=0.03..0.04 rows=1 width=0) (actual  
time=0.023..0.023 rows=1 loops=199999)
                 InitPlan
                   ->  Limit  (cost=0.00..0.03 rows=1 width=4) (actual  
time=0.021..0.022 rows=1 loops=199999)
                         ->  Index Scan using test_pkey on test t3   
(cost=0.00..1029.59 rows=33333 width=4) (actual time=0.020..0.020 rows=1  
loops=199999)
                               Index Cond: (id > $0)
                               Filter: (id IS NOT NULL)
 Total runtime: 5295.677 ms

	-- Self Join 2

test=> set enable_hashjoin TO 0;
test=> EXPLAIN ANALYZE SELECT t1.id AS current_id, t2.id AS next_id
FROM test t1, test t2
WHERE t2.id = ( SELECT min(id) FROM test AS t3 WHERE t3.id > t1.id )
ORDER BY t1.id;
                                                                              QUERY  
PLAN
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------
 Sort  (cost=30806.48..31056.48 rows=100000 width=8) (actual  
time=2876.249..2903.011 rows=99999 loops=1)
   Sort Key: t1.id
   ->  Merge Join  (cost=9745.82..22501.66 rows=100000 width=8) (actual  
time=2547.830..2820.347 rows=99999 loops=1)
         Merge Cond: (t2.id = "inner"."?column2?")
         ->  Index Scan using test_pkey on test t2  (cost=0.00..2828.26  
rows=100000 width=4) (actual time=0.035..67.747 rows=100000 loops=1)
         ->  Sort  (cost=9745.82..9995.82 rows=100000 width=4) (actual  
time=2547.779..2582.889 rows=100000 loops=1)
               Sort Key: (subplan)
               ->  Seq Scan on test t1  (cost=0.00..1441.00 rows=100000  
width=4) (actual time=0.060..2487.728 rows=100000 loops=1)
                     SubPlan
                       ->  Result  (cost=0.03..0.04 rows=1 width=0)  
(actual time=0.023..0.023 rows=1 loops=100000)
                             InitPlan
                               ->  Limit  (cost=0.00..0.03 rows=1 width=4)  
(actual time=0.021..0.022 rows=1 loops=100000)
                                     ->  Index Scan using test_pkey on  
test t3  (cost=0.00..1029.59 rows=33333 width=4) (actual time=0.020..0.020  
rows=1 loops=100000)
                                           Index Cond: (id > $0)
                                           Filter: (id IS NOT NULL)
 Total runtime: 2923.804 ms

	-- DISTINCT ON

test=> EXPLAIN SELECT DISTINCT ON (t1.id) t1.id AS current_id, t2.id AS  
next_id
FROM test t1 JOIN test t2 ON t2.id > t1.id
ORDER BY t1.id, t2.id;
                                           QUERY PLAN
-------------------------------------------------------------------------------------------------
 Unique  (cost=729806679.75..746473346.41 rows=100000 width=8)
   ->  Sort  (cost=729806679.75..738140013.08 rows=3333333333 width=8)
         Sort Key: t1.id, t2.id
         ->  Nested Loop  (cost=0.00..100028973.00 rows=3333333333 width=8)
               ->  Seq Scan on test t1  (cost=0.00..1441.00 rows=100000  
width=4)
               ->  Index Scan using test_pkey on test t2   
(cost=0.00..583.61 rows=33333 width=4)
                     Index Cond: (t2.id > t1.id)
(7 lignes)

	This one takes much longer (I interrupted it).

	-- Using a function

CREATE TYPE test_type AS ( current_id INTEGER, next_id INTEGER );

CREATE OR REPLACE FUNCTION testfunc( )
    RETURNS SETOF test_type
    LANGUAGE plpgsql
    AS
$$
DECLARE
	_row		test_type;
BEGIN
	_row.current_id = NULL;
	
	FOR _row.next_id IN SELECT id FROM test ORDER BY id LOOP
		IF _row.current_id IS NOT NULL THEN
			RETURN NEXT _row;
		END IF;
		_row.current_id = _row.next_id;
	END LOOP;
END;
$$;

test=> EXPLAIN ANALYZE SELECT * FROM testfunc();
                                                     QUERY PLAN
--------------------------------------------------------------------------------------------------------------------
 Function Scan on testfunc  (cost=0.00..12.50 rows=1000 width=8) (actual  
time=211.702..238.322 rows=100000 loops=1)
 Total runtime: 262.369 ms

	Same results, at least 10x faster on large datasets...