Re: Problems with pg_locks explosion

Armand du Plessis <adp@xxxxxxx> · Tue, 2 Apr 2013 01:09:42 +0200

Thanks for the reply. 
I've now updated the background writer settings to:

# - Background Writer -

bgwriter_delay = 200ms                  # 10-10000ms between rounds

bgwriter_lru_maxpages = 400             # 0-1000 max buffers written/round
bgwriter_lru_multiplier = 2.0           # 0-10.0 multipler on buffers scanned/round

checkpoint_segments = 128

checkpoint_timeout = 25min  

It's still happening at the moment, this time without any checkpoint entries in the log :(

Below the output from vmstat. I'm not sure what to look for in there?

Thanks again,

Armand

$ sudo vmstat 5
procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----

 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 2  0      0 485800   4224 44781700    0    0   167    91    1    0 21  1 78  0  0	

 7  0      0 353920   4224 44836176    0    0  6320    54 21371 12921 11  2 87  0  0	
32  0      0 352220   4232 44749544    0    0  1110     8 19414 9620  6 42 52  0  0	

 3  0      0 363044   4232 44615772    0    0    59  1943 11185 3774  0 81 18  0  0	
48  0      0 360076   4240 44550744    0    0     0    34 9563 5210  0 74 26  0  0	

33  0      0 413708   4240 44438248    0    0    92   962 11250 8169  0 61 39  0  0	
109  0      0 418080   4240 44344596    0    0   605  3490 10098 6216  1 49 50  0  0	

58  0      0 425388   4240 44286528    0    0     5    10 10794 2470  1 91  8  0  0	
53  0      0 435864   4240 44243000    0    0    11     0 9755 2428  0 92  8  0  0	

12  0      0 440792   4248 44213164    0    0   134     5 7883 3038  0 51 49  0  0	
 3  0      0 440360   4256 44158684    0    0   548   146 8450 3930  2 27 70  0  0	

 2  0      0 929236   4256 44248608    0    0 10466   845 22575 14196 20  5 74  0  0	
 4  0      0 859160   4256 44311828    0    0  7120    61 20890 12835 12  1 86  0  0	

 4  0      0 685308   4256 44369404    0    0  6110    24 20645 12545 13  1 85  0  0	
 4  0      0 695440   4256 44396304    0    0  5351  1208 19529 11781 11  1 88  0  0	

 4  0      0 628276   4256 44468116    0    0  9202     0 19875 12172  9  1 89  0  0	
 6  0      0 579716   4256 44503848    0    0  3799    22 19223 11772 10  1 88  0  0	

 3  1      0 502948   4256 44539784    0    0  3721  6700 20620 11939 13  1 85  0  0	
 4  0      0 414120   4256 44583456    0    0  3860   856 19801 12092 10  1 89  0  0	

 6  0      0 349240   4256 44642880    0    0  6122    48 19834 11933 11  2 87  0  0	
 3  0      0 400536   4256 44535872    0    0  6287     5 18945 11461 10  1 89  0  0	

 3  0      0 364256   4256 44592412    0    0  5487  2018 20145 12344 11  1 87  0  0	
 7  0      0 343732   4256 44598784    0    0  4209    24 19099 11482 10  1 88  0  0	

 6  0      0 339608   4236 44576768    0    0  6805   151 18821 11333  9  2 89  0  0	
 9  1      0 339364   4236 44556884    0    0  2597  4339 19205 11918 11  3 85  0  0	

24  0      0 341596   4236 44480368    0    0  6165  5309 19353 11562 11  4 84  1  0	
30  0      0 359044   4236 44416452    0    0  1364     6 12638 6138  5 28 67  0  0	

 4  0      0 436468   4224 44326500    0    0  3704  1264 11346 7545  4 27 68  0  0	
 3  1      0 459736   4224 44384788    0    0  6541     8 20159 12097 11  1 88  0  0	

 8  1      0 347812   4224 44462100    0    0 12292  2860 20851 12377  9  1 89  1  0	
 1  0      0 379752   4224 44402396    0    0  5849   147 20171 12253 11  1 88  0  0	

 4  0      0 453692   4216 44243480    0    0  6546   269 20689 13028 12  2 86  0  0	
 8  0      0 390160   4216 44259768    0    0  4243     0 20476 21238  6 16 78  0  0	

 6  0      0 344504   4216 44336264    0    0  7214     2 20919 12625 11  1 87  0  0	
 4  0      0 350128   4200 44324976    0    0 10726  2173 20417 12351 10  1 88  0  0	

 2  1      0 362300   4200 44282484    0    0  7148   714 22469 14468 12  2 86  0  0	
 3  0      0 366252   4184 44311680    0    0  7617   133 20487 12364  9  1 90  0  0	

 6  0      0 368904   4184 44248152    0    0  5162     6 22910 15221 14  7 80  0  0	
 2  0      0 383108   4184 44276780    0    0  5846  1120 21109 12563 11  1 88  0  0	

 7  0      0 338348   4184 44274472    0    0  9270     5 21243 12698 10  1 88  0  0	
24  0      0 339676   4184 44213036    0    0  6639    18 22976 12700 13 12 74  0  0	

12  0      0 371848   4184 44146500    0    0   657   133 18968 7445  5 53 43  0  0	
37  0      0 374516   4184 44076212    0    0    16     2 9156 4472  1 48 52  0  0	

16  0      0 398412   4184 43971060    0    0   127     0 9967 6018  0 48 52  0  0	
 4  0      0 417312   4184 44084392    0    0 17434  1072 23661 14268 16  6 78  1  0	

 4  0      0 407672   4184 44139896    0    0  5785     0 19779 11869 11  1 88  0  0	
 9  0      0 349544   4184 44051596    0    0  6899     8 20376 12774 10  3 88  0  0	

 5  0      0 424628   4184 44059628    0    0  9105   175 24546 15354 13 20 66  1  0	
 2  0      0 377164   4184 44070564    0    0  9363     3 21191 12608 11  2 87  0  0	

 5  0      0 353360   4184 44040804    0    0  6661     0 20931 12815 12  2 85  0  0	
 4  0      0 355144   4180 44034620    0    0  7061     8 21264 12379 11  3 86  0  0	

procs -----------memory---------- ---swap-- -----io---- --system-- -----cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st

21  0      0 358396   4180 43958420    0    0  7595  1749 23258 12299 10 27 63  0  0	
 6  1      0 437480   4160 43922152    0    0 17565    14 17059 14928  6 18 74  2  0	

 6  0      0 380304   4160 43993932    0    0 10120   168 21519 12798 11  2 87  0  0	
 8  0      0 337740   4160 44007432    0    0  6033   520 20872 12461 11  1 88  0  0	

13  0      0 349712   4132 43927784    0    0  6777     6 20919 12568 11  2 86  0  0	
 6  1      0 351180   4112 43899756    0    0  8640     0 22543 12519 11 10 78  0  0	

 6  0      0 356392   4112 43921532    0    0  6206    48 20383 12050 12  1 86  0  0	
 6  0      0 355552   4108 43863448    0    0  6106     3 21244 11817  9  9 82  0  0	

 3  0      0 364992   7312 43856824    0    0 11283   199 21296 12638 13  2 85  0  0	
 4  1      0 371968   7120 43818552    0    0  6715  1534 22322 13305 11  7 81  0  0	

debug2: channel 0: window 999365 sent adjust 49211
12  0      0 338540   7120 43822256    0    0  9142     3 21520 12194 13  5 82  0  0	

 8  0      0 386016   7112 43717136    0    0  2123     3 20465 11466  8 20 72  0  0	
 8  0      0 352388   7112 43715872    0    0 10366    51 25758 13879 16 19 65  0  0	

20  0      0 351472   7112 43701060    0    0 13091    10 23766 12832 11 11 77  1  0	
 2  0      0 386820   7112 43587520    0    0   482   210 17187 6773  3 69 28  0  0	

64  0      0 401956   7112 43473728    0    0     0     5 10796 9487  0 55 44  0  0

On Tue, Apr 2, 2013 at 12:56 AM, Vasilis Ventirozos <v.ventirozos@xxxxxxxxx> wrote:

Hello, i think that your system during the checkpoint pauses all clients in order to flush all data from controller's cache to the disks if i were you i'd try to tune my checkpoint parameters better, if that doesn't work, show us some vmstat output please 

Vasilis Ventirozos
---------- Forwarded message ----------
From: "Armand du Plessis" <adp@xxxxxxx>
Date: Apr 2, 2013 1:37 AM
Subject: [PERFORM] Problems with pg_locks explosion

To: "pgsql-performance" <pgsql-performance@xxxxxxxxxxxxxx>
Cc: 

[Apologies, I first sent this to the incorrect list, postgres-admin, in the event you receive it twice]

Hi there,

I'm hoping someone on the list can shed some light on an issue I'm having with our Postgresql cluster. I'm literally tearing out my hair and don't have a deep enough understanding of Postgres to find the problem. 

What's happening is I had severe disk/io issues on our original Postgres cluster (9.0.8) and switched to a new instance with a RAID-0 volume array. The machine's CPU usage would hover around 30% and our database would run lightning fast with pg_locks hovering between 100-200. 

Within a few seconds something would trigger a massive increase in pg_locks so that it suddenly shoots up to 4000-8000. At this point everything dies. Queries that usually take a few milliseconds takes minutes and everything is unresponsive until I restart postgres. 

The instance still idles at this point. The only clue I could find was that it usually starts a few minutes after the checkpoint entries appear in my logs. 

Any suggestions would really be appreciated. It's killing our business at the moment. I can supply more info if required but pasted what I thought would be useful below. Not sure what else to change in the settings. 

Kind regards,

Armand

It's on Amazon EC2 - 

* cc2.8xlarge instance type 
* 6 volumes in RAID-0 configuration. (1000 PIOPS) 

60.5 GiB of memory

88 EC2 Compute Units (2 x Intel Xeon E5-2670, eight-core)
3370 GB of instance storage

64-bit platform
I/O Performance: Very High (10 Gigabit Ethernet)

EBS-Optimized Available: No**
API name: cc2.8xlarge

postgresql.conf
fsync = off
full_page_writes = off

default_statistics_target = 100
maintenance_work_mem = 1GB
checkpoint_completion_target = 0.9

effective_cache_size = 48GB
work_mem = 64MB
wal_buffers = -1

checkpoint_segments = 128
shared_buffers = 32GB
max_connections = 800

effective_io_concurrency = 3 # Down from 6

# - Background Writer -

#bgwriter_delay = 200ms                 # 10-10000ms between rounds

#bgwriter_lru_maxpages = 100            # 0-1000 max buffers written/round
#bgwriter_lru_multiplier = 2.0          # 0-10.0 multipler on buffers scanned/round

$ free
             total       used       free     shared    buffers     cached

Mem:      61368192   60988180     380012          0        784   44167172
-/+ buffers/cache:   16820224   44547968

Swap:            0          0          0

$ top -c 

top - 21:55:51 up 12 days, 12:41,  4 users,  load average: 6.03, 16.10, 24.15
top - 21:55:54 up 12 days, 12:41,  4 users,  load average: 6.03, 15.94, 24.06

Tasks: 837 total,   6 running, 831 sleeping,   0 stopped,   0 zombie
Cpu(s): 15.7%us,  1.7%sy,  0.0%ni, 81.6%id,  0.3%wa,  0.0%hi,  0.6%si,  0.0%st

Mem:  61368192k total, 54820988k used,  6547204k free,     9032k buffer

[ec2-user@ip-10-155-231-112 ~]$ sudo iostat

Linux 3.2.39-6.88.amzn1.x86_64 () 	04/01/2013 	_x86_64_	(32 CPU)

avg-cpu:  %user   %nice %system %iowait  %steal   %idle
          21.00    0.00    1.10    0.26    0.00   77.63

Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
xvda              0.21         5.00         2.22    5411830    2401368

xvdk             98.32      1774.67       969.86 1919359965 1048932113
xvdj             98.28      1773.68       969.14 1918288697 1048156776

xvdi             98.29      1773.69       969.61 1918300250 1048662470
xvdh             98.24      1773.92       967.54 1918544618 1046419936

xvdg             98.27      1774.15       968.85 1918790636 1047842846
xvdf             98.32      1775.56       968.69 1920316435 1047668172

md127           733.85     10645.68      5813.70 11513598393 6287682313

What bugs me on this is the throughput percentage on the volumes in Cloudwatch is 100% on all volumes. 

The problems seem to overlap with checkpoints. 

2013-04-01 21:31:35.592 UTC,,,26877,,5159fa5f.68fd,1,,2013-04-01 21:21:35 UTC,,0,LOG,00000,"checkpoint starting: time",,,,,,,,,""

2013-04-01 21:40:35.033 UTC,,,26877,,5159fa5f.68fd,2,,2013-04-01 21:21:35 UTC,,0,LOG,00000,"checkpoint complete: wrote 100635 buffers (2.4%); 0 transaction log file(s) added, 0 removed, 1 recycled; write=539.439 s, sync=0.000 s, total=539.441 s; sync files=0, longest=0.000 s, average=0.000 s",,,,,,,,,""

2013-04-01 21:41:35.093 UTC,,,26877,,5159fa5f.68fd,3,,2013-04-01 21:21:35 UTC,,0,LOG,00000,"checkpoint starting: time",,,,,,,,,""