Hi,
I'm having trouble with some sql statements which use an _expression_ with many columns and distinct in the column list of the select.
select distinct col1,col2,.....col20,col21
from table1 left join table2 on <join condition>,...
where
<other expressions>;
The negative result is a big sort with teporary files.
-> Sort (cost=5813649.93..5853067.63 rows=15767078 width=80) (actual time=79027.079..81556.059 rows=12076838 loops=1)
Sort Method: external sort Disk: 1086096kB
By the way - for this query I have a work_mem of 1 GB - so raising this further is not generally possible - also not for one special command, due to parallelism.
How do I get around this?
I have one idea and like to know if there any other approaches or an even known better solution to that problem. By using group by I don't need the big sort for the distinct - I reduce it (theoreticly) to the key columns.
select <list of key columns>,<non key column>
from tables1left join table2 on <join condition>,...
where
<other conditions>
group by <list of key columns>
Another question would be what's the aggregate function which needs as less as possible resources (time).
Below is a list of sql statements which shows a reduced sample of the sql and one of the originating sqls.
Any hints are welcome. They may safe hours
Best Regards,
Uwe
create table a(a_id int,a_a1 int, a_a2 int, a_a3 int, a_a4 int, a_a5 int, a_a6 int, a_a7 int, a_a8 int, a_a9 int, a_a10 int, primary key (a_id));
create table b(b_id int,b_a1 int, b_a2 int, b_a3 int, b_a4 int, b_a5 int, b_a6 int, b_a7 int, b_a8 int, b_a9 int, b_a10 int, primary key (b_id));
insert into a select generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000);
insert into b select generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000),generate_series(1,1000000);
-- current state
------------------------------
postgres=# explain analyze verbose select distinct a_id,b_id,coalesce(a_a1,0), a_a2, a_a3, a_a4, a_a5, a_a6, a_a7, a_a8
, a_a9, a_a10,b_a1, b_a2, b_a3, b_a4, b_a5, b_a6, b_a7, b_a8 , b_a9,
b_a10 from a left join b on a_id=b_id;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
-------------------------------------------------
HashAggregate (cost=128694.42..138694.64 rows=1000022 width=88) (actual time=3127.884..3647.814 rows=1000000 loops=1)
Output: a.a_id, b.b_id, (COALESCE(a.a_a1, 0)), a.a_a2, a.a_a3, a.a_a4, a.a_a5, a.a_a6, a.a_a7, a.a_a8, a.a_a9, a.a_a10, b.b_a1, b.b_a2, b.b_a3, b.b_a4, b.b
_a5, b.b_a6, b.b_a7, b.b_a8, b.b_a9, b.b_a10
-> Hash Left Join (cost=31846.50..73693.21 rows=1000022 width=88) (actual time=361.938..2010.894 rows=1000000 loops=1)
Output: a.a_id, b.b_id, COALESCE(a.a_a1, 0), a.a_a2, a.a_a3, a.a_a4, a.a_a5, a.a_a6, a.a_a7, a.a_a8, a.a_a9, a.a_a10, b.b_a1, b.b_a2, b.b_a3, b.b_a4,
b.b_a5, b.b_a6, b.b_a7, b.b_a8, b.b_a9, b.b_a10
Hash Cond: (a.a_id = b.b_id)
-> Seq Scan on a (cost=0.00..19346.22 rows=1000022 width=44) (actual time=0.014..118.918 rows=1000000 loops=1)
Output: a.a_id, a.a_a1, a.a_a2, a.a_a3, a.a_a4, a.a_a5, a.a_a6, a.a_a7, a.a_a8, a.a_a9, a.a_a10
-> Hash (cost=19346.22..19346.22 rows=1000022 width=44) (actual time=361.331..361.331 rows=1000000 loops=1)
Output: b.b_id, b.b_a1, b.b_a2, b.b_a3, b.b_a4, b.b_a5, b.b_a6, b.b_a7, b.b_a8, b.b_a9, b.b_a10
-> Seq Scan on b (cost=0.00..19346.22 rows=1000022 width=44) (actual time=0.008..119.711 rows=1000000 loops=1)
Output: b.b_id, b.b_a1, b.b_a2, b.b_a3, b.b_a4, b.b_a5, b.b_a6, b.b_a7, b.b_a8, b.b_a9, b.b_a10
Total runtime: 3695.845 ms
-- possible future state
------------------------------
postgres=# explain analyze verbose select a_id,b_id, min(coalesce(a_a1,0)), min( a_a2), min( a_a3), min( a_a4), min( a_a5), min( a_a6), min( a_a7), min( a_a8 ), min( a_a9), min( a_a10), min(b_a1), min( b_a2), min( b_a3), min( b_a4), min( b_a5), min( b_a6), min( b_a7), min( b_a8 ), min( b_a9), min( b_a10) from a left join b on a_id=b_id group by a_id,b_id;
QUERY PLAN
--------------------------------------------------------------------------------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------------------------------------------------------------
HashAggregate (cost=128694.42..188695.74 rows=1000022 width=88) (actual time=3057.096..3884.902 rows=1000000 loops=1)
Output: a.a_id, b.b_id, min(COALESCE(a.a_a1, 0)), min(a.a_a2), min(a.a_a3), min(a.a_a4), min(a.a_a5), min(a.a_a6), min(a.a_a7), min(a.a_a8), min(a.a_a9), m
in(a.a_a10), min(b.b_a1), min(b.b_a2), min(b.b_a3), min(b.b_a4), min(b.b_a5), min(b.b_a6), min(b.b_a7), min(b.b_a8), min(b.b_a9), min(b.b_a10)
-> Hash Left Join (cost=31846.50..73693.21 rows=1000022 width=88) (actual time=362.611..1809.991 rows=1000000 loops=1)
Output: a.a_id, a.a_a1, a.a_a2, a.a_a3, a.a_a4, a.a_a5, a.a_a6, a.a_a7, a.a_a8, a.a_a9, a.a_a10, b.b_id, b.b_a1, b.b_a2, b.b_a3, b.b_a4, b.b_a5, b.b_
a6, b.b_a7, b.b_a8, b.b_a9, b.b_a10
Hash Cond: (a.a_id = b.b_id)
-> Seq Scan on a (cost=0.00..19346.22 rows=1000022 width=44) (actual time=0.014..119.920 rows=1000000 loops=1)
Output: a.a_id, a.a_a1, a.a_a2, a.a_a3, a.a_a4, a.a_a5, a.a_a6, a.a_a7, a.a_a8, a.a_a9, a.a_a10
-> Hash (cost=19346.22..19346.22 rows=1000022 width=44) (actual time=362.002..362.002 rows=1000000 loops=1)
Output: b.b_id, b.b_a1, b.b_a2, b.b_a3, b.b_a4, b.b_a5, b.b_a6, b.b_a7, b.b_a8, b.b_a9, b.b_a10
-> Seq Scan on b (cost=0.00..19346.22 rows=1000022 width=44) (actual time=0.010..121.665 rows=1000000 loops=1)
Output: b.b_id, b.b_a1, b.b_a2, b.b_a3, b.b_a4, b.b_a5, b.b_a6, b.b_a7, b.b_a8, b.b_a9, b.b_a10
Total runtime: 3934.146 ms
One of my originating sql statements is:
select distinct
l.link_id, ra.bridge, ra.tunnel, ra.urban, ra.long_haul, ra.stub,
COALESCE(rac.admin_class, 0), ra.functional_class, ra.speed_category,ra.travel_direction,
ra.paved, ra.private, ra.tollway, ra.boat_ferry, ra.rail_ferry,
ra.multi_digitized, ra.in_process_data, ra.automobiles, ra.buses, ra.taxis,
ra.carpools, ra.pedestrians, ra.trucks, ra.through_traffic, ra.deliveries, ra.emergency_vehicles,
ra.ramp,ra.roundabout,ra.square, ra.parking_lot_road, ra.controlled_access, ra.frontage,
CASE WHEN COALESCE(cl.INTERSECTION_ID,0) = 0 THEN 'N' ELSE 'Y' END,
ra.TRANSPORT_VERIFIED, ra.PLURAL_JUNCTION, ra.vignette, ra.SCENIC_ROUTE, ra.four_wheel_drive
from nds.link l
join nndb.link_road_attributes lra on l.nndb_id = lra.feature_id
join nndb.road_attributes ra on lra.road_attr_id = ra.road_attr_id
left join nds.road_admin_class rac on rac.nndb_feature_id = l.nndb_id
left join nndb.complex_intersection_link cl on l.nndb_id = cl.link_id