Andrej Vanek <andrej.vanek.sk@xxxxxxxxx> writes: > Hello, > > I tried to run pg_basebackup. Return value is 1. > > How to find out its reason? > (I suspect that some wal after backup is missing- but how to find out the real reason? How to fix it?) Well there are more than 80 cases of exit 1 in a couple .c files and most/all are preceeded unsurprisingly with an fprintf(stderr. Are you not getting something informative on your display and/or sending stderr to somewher else? $ echo $* ./src/bin/pg_basebackup/pg_basebackup.c ./src/backend/replication/basebackup.c $ grep -h -B 4 -i 'exit.*(1' $* | egrep -hi 'fprintf\(stderr|exit' fprintf(stderr, _("%s: directory name too long\n"), progname); exit(1); fprintf(stderr, _("%s: multiple \"=\" signs in tablespace mapping\n"), progname); exit(1); fprintf(stderr, exit(1); fprintf(stderr, _("%s: old directory is not an absolute path in tablespace mapping: %s\n"), exit(1); fprintf(stderr, _("%s: new directory is not an absolute path in tablespace mapping: %s\n"), exit(1); fprintf(stderr, _("%s: could not read from ready pipe: %s\n"), exit(1); fprintf(stderr, exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: could not create background process: %s\n"), disconnect_and_exit(1); fprintf(stderr, _("%s: could not create background thread: %s\n"), disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: could not access directory \"%s\": %s\n"), disconnect_and_exit(1); fprintf(stderr, exit(1); fprintf(stderr, exit(1); fprintf(stderr, _("%s: transfer rate must be greater than zero\n"), exit(1); fprintf(stderr, exit(1); fprintf(stderr, exit(1); fprintf(stderr, exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: could not write to file \"%s\": %s\n"), disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: could not create file \"%s\": %s\n"), disconnect_and_exit(1); fprintf(stderr, _("%s: could not get COPY data stream: %s"), disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: could not read COPY data: %s"), disconnect_and_exit(1); fprintf(stderr, _("%s: could not get COPY data stream: %s"), disconnect_and_exit(1); fprintf(stderr, _("%s: could not read COPY data: %s"), disconnect_and_exit(1); fprintf(stderr, _("%s: invalid tar block header size: %d\n"), disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: could not create file \"%s\": %s\n"), disconnect_and_exit(1); fprintf(stderr, _("%s: could not write to file \"%s\": %s\n"), disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: out of memory\n"), progname); exit(1); fprintf(stderr, _("%s: out of memory\n"), progname); disconnect_and_exit(1); fprintf(stderr, _("%s: out of memory\n"), progname); disconnect_and_exit(1); fprintf(stderr, _("%s: out of memory\n"), progname); disconnect_and_exit(1); fprintf(stderr, _("%s: could not create file \"%s\": %s\n"), progname, filename, strerror(errno)); disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); exit(1); fprintf(stderr, _("%s: incompatible server version %s\n"), disconnect_and_exit(1); disconnect_and_exit(1); disconnect_and_exit(1); fprintf(stderr, _("%s: could not send replication command \"%s\": %s"), disconnect_and_exit(1); fprintf(stderr, _("%s: could not initiate base backup: %s"), disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: could not get backup header: %s"), disconnect_and_exit(1); fprintf(stderr, _("%s: no data returned from server\n"), progname); disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: final receive failed: %s"), disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: could not wait for child process: %s\n"), disconnect_and_exit(1); fprintf(stderr, _("%s: child %d died, expected %d\n"), disconnect_and_exit(1); if (!WIFEXITED(status)) fprintf(stderr, _("%s: child process did not exit normally\n"), disconnect_and_exit(1); if (WEXITSTATUS(status) != 0) fprintf(stderr, _("%s: child process exited with error %d\n"), progname, WEXITSTATUS(status)); disconnect_and_exit(1); fprintf(stderr, disconnect_and_exit(1); fprintf(stderr, _("%s: could not wait for child thread: %s\n"), disconnect_and_exit(1); fprintf(stderr, _("%s: could not get child thread exit status: %s\n"), disconnect_and_exit(1); fprintf(stderr, _("%s: child thread exited with error %u\n"), disconnect_and_exit(1); fprintf(stderr, exit(1); fprintf(stderr, exit(1); fprintf(stderr, exit(1); fprintf(stderr, exit(1); fprintf(stderr, _("%s: invalid compression level \"%s\"\n"), exit(1); fprintf(stderr, _("%s: invalid checkpoint argument \"%s\", must be \"fast\" or \"spread\"\n"), exit(1); fprintf(stderr, _("%s: invalid status interval \"%s\"\n"), exit(1); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), exit(1); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), exit(1); fprintf(stderr, _("%s: no target directory specified\n"), progname); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), exit(1); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), exit(1); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), exit(1); fprintf(stderr, _("Try \"%s --help\" for more information.\n"), exit(1); fprintf(stderr, _("%s: transaction log directory location must be " fprintf(stderr, _("Try \"%s --help\" for more information.\n"), exit(1); fprintf(stderr, exit(1); fprintf(stderr, _("%s: could not create symbolic link \"%s\": %s\n"), exit(1); fprintf(stderr, _("%s: symlinks are not supported on this platform\n")); exit(1); > > thanks, Andrej > --------------details: > environment: CentOS 6.7, postgres 9.5.1 > ( PostgreSQL 9.5.1 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.4.7 20120313 (Red Hat 4.4.7-16), 64-bit) > > I tried 2 forms of pg_basebackup (-X fetch and -X stream). Both were issued from a script: > # su - postgres -c "/usr/pgsql-9.5/bin/pg_basebackup -h ${DB_MASTER_IP} -D ${GEO_STDBY_DATA} -U pgreplic -P -v -X fetch" 2>${LOG_FILE}.stderr  >> ${LOG_FILE} > # echo $? > 1       <--------------pg_basebackup failed! > # cat log.stderr > # cat /var/log/cluster/geo_repair.log.err > transaction log start point: 0/E3000028 on timeline 1 > WARNING:  skipping special file "./pg_hba.conf" > WARNING:  skipping special file "./pg_hba.conf.save" > transaction log end point: 0/E30000F8 > pg_basebackup: base backup completed       <------------------no reason for pg_basebackup failure! > # cp /tmp/pg_hba.conf /tmp/postgresql.conf /pg_data/ > # su - postgres -c "/usr/pgsql-9.5/bin/pg_ctl -D /pg_data/ start" > # tail /pg_data/pg_log/postgresql-Fri.log > `pg_xlog/0000000100000000000000E2' -> `../backups/arc/0000000100000000000000E2' > 2016-04-15 23:15:10 CEST:pgreplic@[unknown]:[10667] WARNING:  skipping special file "./pg_hba.conf" > 2016-04-15 23:15:10 CEST:pgreplic@[unknown]:[10667] WARNING:  skipping special file "./pg_hba.conf.save"     <---------------recorded in pg_log on master node and > copied by pg_basebackup (note time difference between two servers) > 2016-04-15 23:15:02 CEST:@:[23321] LOG:  database system was interrupted; last known up at 2016-04-15 23:15:10 CEST > 2016-04-15 23:15:02 CEST:postgres@postgres:[23329] FATAL:  the database system is starting up > 2016-04-15 23:15:03 CEST:@:[23321] LOG:  entering standby mode > 2016-04-15 23:15:03 CEST:@:[23321] LOG:  database system was not properly shut down; automatic recovery in progress <---------something missing from pg_basebackup > 2016-04-15 23:15:03 CEST:@:[23321] LOG:  redo starts at 0/E3000028 > 2016-04-15 23:15:03 CEST:@:[23321] LOG:  consistent recovery state reached at 0/E4000000 > 2016-04-15 23:15:03 CEST:@:[23295] LOG:  database system is ready to accept read only connections > 2016-04-15 23:15:03 CEST:@:[23356] LOG:  started streaming WAL from primary at 0/E4000000 on timeline 1 > -------second trial > # su - postgres -c "/usr/pgsql-9.5/bin/pg_basebackup -h ${DB_MASTER_IP} -D ${GEO_STDBY_DATA} -U pgreplic -P -v -X stream" > # echo $? > 1 > #  cat /var/log/cluster/geo_repair.log.err > transaction log start point: 0/E5000028 on timeline 1 > pg_basebackup: starting background WAL receiver > WARNING:  skipping special file "./pg_hba.conf" > WARNING:  skipping special file "./pg_hba.conf.save" > transaction log end point: 0/E50000F8 > pg_basebackup: waiting for background process to finish streaming ... > pg_basebackup: could not wait for child process: No child processes     <----what does this mean? I think it failed to start process to fetching wal logs created > during backup: but neither on master node neither on pg_basebackup output here is any information about reason.. (max_wal_senders on master is 10: I see no reason to > fail). > > postgres logs: > `pg_xlog/0000000100000000000000E4' -> `../backups/arc/0000000100000000000000E4' > 2016-04-15 23:35:09 CEST:pgreplic@[unknown]:[29035] WARNING:  skipping special file "./pg_hba.conf" > 2016-04-15 23:35:09 CEST:pgreplic@[unknown]:[29035] WARNING:  skipping special file "./pg_hba.conf.save" > 2016-04-15 23:35:01 CEST:@:[28926] LOG:  database system was interrupted; last known up at 2016-04-15 23:35:09 CEST > 2016-04-15 23:35:01 CEST:postgres@postgres:[28938] FATAL:  the database system is starting up > 2016-04-15 23:35:02 CEST:@:[28926] LOG:  entering standby mode > 2016-04-15 23:35:02 CEST:@:[28926] LOG:  database system was not properly shut down; automatic recovery in progress  <------------this means something missing from > pg_basebackup > 2016-04-15 23:35:02 CEST:@:[28926] LOG:  redo starts at 0/E5000028 > 2016-04-15 23:35:02 CEST:@:[28926] LOG:  consistent recovery state reached at 0/E6000000 > 2016-04-15 23:35:02 CEST:@:[28904] LOG:  database system is ready to accept read only connections > 2016-04-15 23:35:02 CEST:@:[28989] LOG:  started streaming WAL from primary at 0/E6000000 on timeline 1 > > postgres params on master node: > log_line_prefix = '%t:%u@%d:[%p] ' > logging_collector = on > wal_buffers = 16MB > max_wal_size = 200MB > log_temp_files = 1MB > max_connections = 170 > shared_buffers = 512MB > effective_cache_size = 1500MB > work_mem = 48MB > log_lock_waits = on > log_min_duration_statement = 10000 > shared_preload_libraries = 'pg_stat_statements' > include '/var/lib/pgsql/tmp/rep_mode.conf' # added by pgsql RA > wal_level = hot_standby > archive_mode = on > max_wal_senders = 10 > hot_standby = on > wal_keep_segments = 128 > archive_command = '/opt/postgres/dbconf/archive_command.sh %p %f' > wal_receiver_status_interval = 2 > max_standby_streaming_delay = -1 > max_standby_archive_delay = -1 > restart_after_crash = off > hot_standby_feedback = on > -- Jerry Sievers Postgres DBA/Development Consulting e: postgres.consulting@xxxxxxxxxxx p: 312.241.7800 -- Sent via pgsql-general mailing list (pgsql-general@xxxxxxxxxxxxxx) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-general