On Thu, Nov 25, 2021 at 8:58 AM James Sewell <james.sewell@xxxxxxxxxxxx> wrote: >> >> The hypothesis I'm thinking of is that incoming sessions are being blocked >> somewhere before they can acquire a ProcArray entry; if so, they'd not >> show up in either pg_stat_activity or pg_locks. What we have to look for >> then is evidence of somebody holding a strong lock on a shared relation. >> Try "select * from pg_locks where locktype = 'relation' and database = 0". >> > > That doesn't show anything when the issue is happening. > > Strace shows the startup processes looping like so: > > 13:51:15.510850 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=733559666}) = 0 > 13:51:15.511254 futex(0x7ee891405238, FUTEX_WAKE, 1) = 1 > 13:51:15.511422 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=734037545}) = 0 > 13:51:15.511690 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.520470 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=743136450}) = 0 > 13:51:15.520772 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=743376746}) = 0 > 13:51:15.521005 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.539309 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=761920081}) = 0 > 13:51:15.539508 futex(0x7ee891406bb8, FUTEX_WAKE, 1) = 1 > 13:51:15.539690 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=762298276}) = 0 > 13:51:15.539882 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.547311 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=769929341}) = 0 > 13:51:15.547757 futex(0x7ee89140a738, FUTEX_WAKE, 1) = 1 > 13:51:15.547931 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=770524120}) = 0 > 13:51:15.548089 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.551482 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable) > 13:51:15.551950 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.593631 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = -1 EAGAIN (Resource temporarily unavailable) > 13:51:15.593807 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.626167 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.627590 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=850204434}) = 0 > 13:51:15.627817 futex(0x7ee8914033b8, FUTEX_WAKE, 1) = 1 > 13:51:15.627987 open("pg_subtrans/A100", O_RDWR) = 16 > 13:51:15.628270 lseek(16, 131072, SEEK_SET) = 131072 > 13:51:15.628461 read(16, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192 > 13:51:15.628677 close(16) = 0 > 13:51:15.628911 open("pg_subtrans/A0E1", O_RDWR) = 16 > 13:51:15.629132 lseek(16, 122880, SEEK_SET) = 122880 > 13:51:15.629309 read(16, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 8192) = 8192 > 13:51:15.629503 close(16) = 0 > 13:51:15.629685 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=852272645}) = 0 > 13:51:15.629863 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.656694 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=879314944}) = 0 > 13:51:15.656923 futex(0x7ee8914125b8, FUTEX_WAKE, 1) = 1 > 13:51:15.657129 open("pg_subtrans/A0EE", O_RDWR) = 16 > 13:51:15.657385 lseek(16, 73728, SEEK_SET) = 73728 > 13:51:15.657638 read(16, "\0\0\0\0\0H\356\240\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5H\356\240\0\0\0\0"..., 8192) = 8192 > 13:51:15.657907 close(16) = 0 > 13:51:15.658141 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=880753952}) = 0 > 13:51:15.658346 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.689705 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=912331058}) = 0 > 13:51:15.689950 futex(0x7ee89140a0b8, FUTEX_WAKE, 1) = 1 > 13:51:15.690182 futex(0x7ee8914118b8, FUTEX_WAKE, 1) = 1 > 13:51:15.690420 open("pg_subtrans/A0EE", O_RDWR) = 16 > 13:51:15.690742 lseek(16, 73728, SEEK_SET) = 73728 > 13:51:15.691037 read(16, "\0\0\0\0\0H\356\240\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5H\356\240\0\0\0\0"..., 8192) = 8192 > 13:51:15.691303 close(16) = 0 > 13:51:15.691582 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=914202777}) = 0 > 13:51:15.691837 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.713319 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.716423 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=939067321}) = 0 > 13:51:15.716737 futex(0x7ee89140a0b8, FUTEX_WAKE, 1) = 1 > 13:51:15.717010 futex(0x7ee89140f3b8, FUTEX_WAKE, 1) = 1 > 13:51:15.717286 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=939899218}) = 0 > 13:51:15.717578 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.722270 futex(0x7ee891405938, FUTEX_WAIT, 0, NULL) = 0 > 13:51:15.746391 clock_gettime(CLOCK_MONOTONIC, {tv_sec=19439358, tv_nsec=969020657}) = 0 > 13:51:15.746664 futex(0x7ee89140fa38, FUTEX_WAKE, 1) = 1 > 13:51:15.746877 open("pg_subtrans/A0EE", O_RDWR) = 16 > > > While gdb shows similar to: > > 24 ^[[?1034h(gdb) bt > 23 #0 0x00007f18fb9deb3b in do_futex_wait.constprop.1 () from /lib64/libpthread.so.0 > 22 #1 0x00007f18fb9debcf in __new_sem_wait_slow.constprop.0 () from /lib64/libpthread.so.0 > 21 #2 0x00007f18fb9dec6b in sem_wait@@GLIBC_2.2.5 () from /lib64/libpthread.so.0 > 20 #3 0x000000000071c482 in PGSemaphoreLock () > 19 #4 0x000000000079078c in LWLockAcquire () > 18 #5 0x00000000004fa9ae in SimpleLruWaitIO.isra.1 () > 17 #6 0x00000000004fb2a4 in SimpleLruReadPage () > 16 #7 0x00000000004fbc07 in SubTransGetParent () > 15 #8 0x00000000004fbc57 in SubTransGetTopmostTransaction () > 14 #9 0x00000000008f65a7 in XidInMVCCSnapshot () > 13 #10 0x00000000008f662e in HeapTupleSatisfiesMVCC () > 12 #11 0x00000000004c436e in heapgetpage () > 11 #12 0x00000000004c58e4 in heap_getnext () > 10 #13 0x00000000008ab8b5 in ScanPgRelation () > 9 #14 0x00000000008ae124 in RelationBuildDesc () > 8 #15 0x00000000008b010e in load_critical_index () > 7 #16 0x00000000008b39ac in RelationCacheInitializePhase3 () > 6 #17 0x00000000008cbd56 in InitPostgres () > 5 #18 0x00000000007a1cfe in PostgresMain () > 4 #19 0x000000000048a9c0 in ServerLoop () > 3 #20 0x000000000072f11e in PostmasterMain () > 2 #21 0x000000000048b854 in main () > > Does that shed any light? Seems like some of the processes are taking a long time or stuck while reading/writing SLRU pages, and due to that while creating a new connection the backend process is not able to check the transaction status (from pg_xlog) of the pg_class tuple and gets stuck/taking a long time in a startup. -- Regards, Dilip Kumar EnterpriseDB: http://www.enterprisedb.com