Hi all,
I have a PostgreSQL 13.7 cluster with two nodes (master and slave), compiled in Docker (Alpine 3.15) with the following flags:
./configure --prefix=${PG_DIR} --exec-prefix=${PG_DIR} --enable-integer-datetimes --enable-thread-safety --disable-rpath --with-uuid=e2fs --with-gnu-ld --with-pgport=5432 --with-system-tzdata=/usr/share/zoneinfo --with-llvm --with-gssapi --with-ldap --with-icu --with-tcl --with-perl --with-python --with-pam --with-openssl --with-libxml --with-libxslt
and running with the following configuration:
listen_addresses = '*'
port = 5432
max_connections = 1000
unix_socket_directories = '/opt/pg/data'
superuser_reserved_connections = 3
shared_buffers = 6GB
temp_buffers = 32MB
max_prepared_transactions = 100
work_mem = 1146kB
maintenance_work_mem = 1792MB
max_stack_depth = 4MB
dynamic_shared_memory_type = posix
archive_command = '/opt/pg/bin/pgbackrest --stanza=aws-prdan archive-push %p'
archive_mode = on
max_wal_senders = 10
min_wal_size = 2GB
max_wal_size = 3GB
wal_level = logical
checkpoint_completion_target = 0.9
effective_cache_size = 18GB
log_destination = 'stderr'
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d.log'
log_truncate_on_rotation = on
log_rotation_age = 1d
log_rotation_size = 0
log_checkpoints = on
log_line_prefix = '%m [%p] - [%a - %u - %d] [%h] : %e'
log_min_duration_statement = 0
stats_temp_directory = '/opt/pg/pg_stat_tmp'
autovacuum = on
autovacuum_max_workers = 3
autovacuum_vacuum_threshold = 50
autovacuum_analyze_threshold = 50
datestyle = 'iso, dmy'
default_text_search_config = 'pg_catalog.simple'
jit = on
jit_above_cost = 100000
jit_debugging_support = off
jit_dump_bitcode = off
jit_expressions = on
jit_inline_above_cost = 500000
jit_optimize_above_cost = 500000
jit_profiling_support = off
jit_provider = llvmjit
jit_tuple_deforming = on
max_worker_processes = 8
max_parallel_workers_per_gather = 2
max_parallel_workers = 2
default_statistics_target = 100
synchronous_commit = off
random_page_cost = 1.1
effective_io_concurrency = 200
track_activity_query_size = 10000
pg_stat_statements.track = all
port = 5432
max_connections = 1000
unix_socket_directories = '/opt/pg/data'
superuser_reserved_connections = 3
shared_buffers = 6GB
temp_buffers = 32MB
max_prepared_transactions = 100
work_mem = 1146kB
maintenance_work_mem = 1792MB
max_stack_depth = 4MB
dynamic_shared_memory_type = posix
archive_command = '/opt/pg/bin/pgbackrest --stanza=aws-prdan archive-push %p'
archive_mode = on
max_wal_senders = 10
min_wal_size = 2GB
max_wal_size = 3GB
wal_level = logical
checkpoint_completion_target = 0.9
effective_cache_size = 18GB
log_destination = 'stderr'
logging_collector = on
log_directory = 'log'
log_filename = 'postgresql-%Y-%m-%d.log'
log_truncate_on_rotation = on
log_rotation_age = 1d
log_rotation_size = 0
log_checkpoints = on
log_line_prefix = '%m [%p] - [%a - %u - %d] [%h] : %e'
log_min_duration_statement = 0
stats_temp_directory = '/opt/pg/pg_stat_tmp'
autovacuum = on
autovacuum_max_workers = 3
autovacuum_vacuum_threshold = 50
autovacuum_analyze_threshold = 50
datestyle = 'iso, dmy'
default_text_search_config = 'pg_catalog.simple'
jit = on
jit_above_cost = 100000
jit_debugging_support = off
jit_dump_bitcode = off
jit_expressions = on
jit_inline_above_cost = 500000
jit_optimize_above_cost = 500000
jit_profiling_support = off
jit_provider = llvmjit
jit_tuple_deforming = on
max_worker_processes = 8
max_parallel_workers_per_gather = 2
max_parallel_workers = 2
default_statistics_target = 100
synchronous_commit = off
random_page_cost = 1.1
effective_io_concurrency = 200
track_activity_query_size = 10000
pg_stat_statements.track = all
Sometimes my cluster nodes hang and stop responding. The logs end without any error:
2022-07-02 00:42:51.755 P00 INFO: archive-push command begin 2.39: [pg_wal/00000001000055960000002C] --archive-async --compress-type=lz4 --exec-id=787394-196ba324 --log-level-console=info
--log-level-file=detail --pg1-path=/opt/pg/data --process-max=4 --repo1-cipher-pass=<redacted> --repo1-cipher-type=aes-256-cbc --repo1-path=/repo-path --repo1-s3-bucket=backup-postgresql
--repo1-s3-endpoint=s3.eu-central-1.amazonaws.com --repo1-s3-key=<redacted> --repo1-s3-key-secret=<redacted> --repo1-s3-region=eu-central-1 --repo1-type=s3 --stanza=aws-prdan
2022-07-02 00:42:51.755 P00 INFO: pushed WAL file '00000001000055960000002C' to the archive asynchronously
2022-07-02 00:42:51.755 P00 INFO: archive-push command end: completed successfully (1ms)
2022-07-02 07:54:34.333 GMT [15] - [ - - ] [] : 00000LOG: starting PostgreSQL 13.7 on x86_64-pc-linux-musl, compiled by gcc (Alpine 10.3.1_git20211027) 10.3.1 20211027, 64-bit
2022-07-02 07:54:34.335 GMT [15] - [ - - ] [] : 00000LOG: listening on IPv4 address "0.0.0.0", port 5432
--log-level-file=detail --pg1-path=/opt/pg/data --process-max=4 --repo1-cipher-pass=<redacted> --repo1-cipher-type=aes-256-cbc --repo1-path=/repo-path --repo1-s3-bucket=backup-postgresql
--repo1-s3-endpoint=s3.eu-central-1.amazonaws.com --repo1-s3-key=<redacted> --repo1-s3-key-secret=<redacted> --repo1-s3-region=eu-central-1 --repo1-type=s3 --stanza=aws-prdan
2022-07-02 00:42:51.755 P00 INFO: pushed WAL file '00000001000055960000002C' to the archive asynchronously
2022-07-02 00:42:51.755 P00 INFO: archive-push command end: completed successfully (1ms)
2022-07-02 07:54:34.333 GMT [15] - [ - - ] [] : 00000LOG: starting PostgreSQL 13.7 on x86_64-pc-linux-musl, compiled by gcc (Alpine 10.3.1_git20211027) 10.3.1 20211027, 64-bit
2022-07-02 07:54:34.335 GMT [15] - [ - - ] [] : 00000LOG: listening on IPv4 address "0.0.0.0", port 5432
I can log in to the database using psql, but I can't execute any command, and I can't stop the database with kill -INT, kill -QUIT, or docker stop. The database on the ZFS volume has already grown to 3.8 TB. In the end I restart the entire host and everything goes back to normal. The situation occurs on both the master and slave nodes.
Please tell me what I should do to trace and fix the problem.
Thanks for your attention.
Regards
BS