How to improve: performance of query on postgresql 8.3 takes days

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Dear postgresql list,


I have some troubles generating data for a analysis task at hand.


I have a table (table A) containing 5 million records and 28 number of attributes. This table is 461MB big if I copy it to a csv file.


I want to create another table (table B) based on the contents of table A plus some 15 extra attributes (in pl/pgsql written functions which produce those extra attributes)


So my statement looks like this:


create tableB as (

select some attributes, function1(A.attribute1)as attributeX+1, function2(A.attribute1,A.Attribute2,A.attribute3,A.attribute4,A.attribute5) as attribute X+2......function15(A.attribute1,A.attribute9) as attributeX+15 from tableA as A)


This takes almost 60 hours to finish on my database server running debian 5.0 with XFS as filesystem containing 4GB RAM. I'm using postgresql server version 8.3 (but am seeing the same phenomena on my FreeBSD 8.0 database server running postgresql 8.4 as well)


I arrived at 15 functions because I had 7 or 8 joins in the past and saw that my disk was getting hid and I had heard someplace that RAM is faster so I rewrote those 7 or 8 joins as functions in pl/pgsql. They were just simple lookups, although some of the functions are looking stuff up in tables containing 78000 records. However, I thought this wouldn't be a problem because they are simple functions which look up the value of one variable based on a parameter. 3 of the more special functions are shown here:


CREATE OR REPLACE FUNCTION agenttype1(a character)

RETURNS integer AS

$BODY$

DECLARE

i integer;

t1_rij canxagents%ROWTYPE;

BEGIN

select * into t1_rij from canxagents where agent = a;

if NOT FOUND THEN i := 0;

ELSE

if t1_rij.aantal >= 0 and t1_rij.aantal <=499 THEN i := 1;

ELSE

if t1_rij.aantal > 500 and t1_rij.aantal <=1999 THEN i := 2;

ELSE

if t1_rij.aantal >= 2000 THEN i := 3;

END IF;

END IF;

END IF;

END IF;

return i ;

END;

$BODY$

LANGUAGE 'plpgsql' VOLATILE

COST 100;



CREATE OR REPLACE FUNCTION agenttype2(a character)

RETURNS integer AS

$BODY$

DECLARE

i integer;

t1_rij showagents%ROWTYPE;

BEGIN

select * into t1_rij from showagents where agent = a;

if NOT FOUND THEN i := 0;

ELSE

if t1_rij.aantal >= 0 and t1_rij.aantal <=499 THEN i := 1;

ELSE

if t1_rij.aantal > 500 and t1_rij.aantal <=999 THEN i := 2;

ELSE

if t1_rij.aantal >= 1000 THEN i := 3;

END IF;

END IF;

END IF;

END IF;

return i ;

END;

$BODY$

LANGUAGE 'plpgsql' VOLATILE

COST 100;



CREATE OR REPLACE FUNCTION agenttype3(a character)

RETURNS integer AS

$BODY$

DECLARE

i integer;

t1_rij noagents%ROWTYPE;

BEGIN

select * into t1_rij from noagents where agent = a;

if NOT FOUND THEN i := 0;

ELSE

if t1_rij.aantal >= 0 and t1_rij.aantal <=299 THEN i := 1;

ELSE

if t1_rij.aantal > 300 and t1_rij.aantal <=899 THEN i := 2;

ELSE

if t1_rij.aantal >= 900 THEN i := 3;

END IF;

END IF;

END IF;

END IF;

return i ;

END;

$BODY$

LANGUAGE 'plpgsql' VOLATILE

COST 100;


The interesting parts of my postgresql.conf file look like this:


#------------------------------------------------------------------------------

# RESOURCE USAGE (except WAL)

#------------------------------------------------------------------------------


# - Memory -


shared_buffers = 512MB # min 128kB or max_connections*16kB

# (change requires restart)

temp_buffers = 8MB # min 800kB

#max_prepared_transactions = 5 # can be 0 or more

# (change requires restart)

# Note: Increasing max_prepared_transactions costs ~600 bytes of shared memory

# per transaction slot, plus lock space (see max_locks_per_transaction).

work_mem = 50MB # min 64kB

maintenance_work_mem = 256MB # min 1MB

#max_stack_depth = 2MB # min 100kB


# - Free Space Map -


max_fsm_pages = 153600 # min max_fsm_relations*16, 6 bytes each

# (change requires restart)

#max_fsm_relations = 1000 # min 100, ~70 bytes each

# (change requires restart)


# - Kernel Resource Usage -


#max_files_per_process = 1000 # min 25

# (change requires restart)

#shared_preload_libraries = '' # (change requires restart)


# - Cost-Based Vacuum Delay -


#vacuum_cost_delay = 0 # 0-1000 milliseconds

#vacuum_cost_page_hit = 1 # 0-10000 credits

#vacuum_cost_page_miss = 10 # 0-10000 credits

#vacuum_cost_page_dirty = 20 # 0-10000 credits

#vacuum_cost_limit = 200 # 1-10000 credits


# - Background Writer -


#bgwriter_delay = 200ms # 10-10000ms between rounds

#bgwriter_lru_maxpages = 100 # 0-1000 max buffers written/round

#bgwriter_lru_multiplier = 2.0 # 0-10.0 multipler on buffers scanned/round



#------------------------------------------------------------------------------

# WRITE AHEAD LOG

#------------------------------------------------------------------------------


# - Settings -


#fsync = on # turns forced synchronization on or off

#synchronous_commit = on # immediate fsync at commit

#wal_sync_method = fsync # the default is the first option

# supported by the operating system:

# open_datasync

# fdatasync

# fsync

# fsync_writethrough

# open_sync

#full_page_writes = on # recover from partial page writes

#wal_buffers = 64kB # min 32kB

# (change requires restart)

#wal_writer_delay = 200ms # 1-10000 milliseconds


#commit_delay = 0 # range 0-100000, in microseconds

#commit_siblings = 5 # range 1-1000


# - Checkpoints -


#checkpoint_segments = 3 # in logfile segments, min 1, 16MB each

#checkpoint_timeout = 5min # range 30s-1h

#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0

#checkpoint_warning = 30s # 0 is off


# - Archiving -


#archive_mode = off # allows archiving to be done

# (change requires restart)

#archive_command = '' # command to use to archive a logfile segment

#archive_timeout = 0 # force a logfile segment switch after this

# time; 0 is off



#------------------------------------------------------------------------------

# QUERY TUNING

#------------------------------------------------------------------------------


# - Planner Method Configuration -


#enable_bitmapscan = on

#enable_hashagg = on

#enable_hashjoin = on

#enable_indexscan = on

#enable_mergejoin = on

#enable_nestloop = on

#enable_seqscan = on

#enable_sort = on

#enable_tidscan = on


# - Planner Cost Constants -


#seq_page_cost = 1.0 # measured on an arbitrary scale

#random_page_cost = 4.0 # same scale as above

#cpu_tuple_cost = 0.01 # same scale as above

#cpu_index_tuple_cost = 0.005 # same scale as above

#cpu_operator_cost = 0.0025 # same scale as above

effective_cache_size = 256MB # was 128


# - Genetic Query Optimizer -


#geqo = on

#geqo_threshold = 12

#geqo_effort = 5 # range 1-10

#geqo_pool_size = 0 # selects default based on effort

#geqo_generations = 0 # selects default based on effort

#geqo_selection_bias = 2.0 # range 1.5-2.0


# - Other Planner Options -


#default_statistics_target = 10 # range 1-1000

#constraint_exclusion = off

#from_collapse_limit = 8

#join_collapse_limit = 8 # 1 disables collapsing of explicit

# JOIN clauses




Questions


  1. What can I do to let the creation of table B go faster?

  2. Do you think the use of indices (but where) would help me? I didn't go that route because in fact I don't have a where clause in the create table B statement. I could put indices on the little tables I'm using in the functions.

  3. What about the functions? Should I code them differently?

  4. What about my server configuration. What could be done over there?


Thanks in advanced



[Postgresql General]     [Postgresql PHP]     [PHP Users]     [PHP Home]     [PHP on Windows]     [Kernel Newbies]     [PHP Classes]     [PHP Books]     [PHP Databases]     [Yosemite]

  Powered by Linux