Hello,
> Are you not getting something informative on your display and/or sending
stderr to somewher else?- in case -X fetch no informative output
- in case -X stream following:
pg_basebackup: could not wait for child process: No child processes
Now after re-test directly from command-line it works. It fails just when launched from a script fired by crm_mon -d -E my-script
Regards, Andrej
2016-04-16 1:18 GMT+02:00 Jerry Sievers <gsievers19@xxxxxxxxxxx>:
Andrej Vanek <andrej.vanek.sk@xxxxxxxxx> writes:
> Hello,
>
> I tried to run pg_basebackup. Return value is 1.
>
> How to find out its reason?
> (I suspect that some wal after backup is missing- but how to find out the real reason? How to fix it?)
Well there are more than 80 cases of exit 1 in a couple .c files and
most/all are preceeded unsurprisingly with an fprintf(stderr.
Are you not getting something informative on your display and/or sending
stderr to somewher else?
$ echo $*
./src/bin/pg_basebackup/pg_basebackup.c ./src/backend/replication/basebackup.c
$ grep -h -B 4 -i 'exit.*(1' $* | egrep -hi 'fprintf\(stderr|exit'
fprintf(stderr, _("%s: directory name too long\n"), progname);
exit(1);
fprintf(stderr, _("%s: multiple \"=\" signs in tablespace mapping\n"), progname);
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr, _("%s: old directory is not an absolute path in tablespace mapping: %s\n"),
exit(1);
fprintf(stderr, _("%s: new directory is not an absolute path in tablespace mapping: %s\n"),
exit(1);
fprintf(stderr, _("%s: could not read from ready pipe: %s\n"),
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create background process: %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create background thread: %s\n"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not access directory \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr, _("%s: transfer rate must be greater than zero\n"),
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not write to file \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create file \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not get COPY data stream: %s"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not read COPY data: %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not get COPY data stream: %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not read COPY data: %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: invalid tar block header size: %d\n"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create file \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not write to file \"%s\": %s\n"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: out of memory\n"), progname);
exit(1);
fprintf(stderr, _("%s: out of memory\n"), progname);
disconnect_and_exit(1);
fprintf(stderr, _("%s: out of memory\n"), progname);
disconnect_and_exit(1);
fprintf(stderr, _("%s: out of memory\n"), progname);
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not create file \"%s\": %s\n"), progname, filename, strerror(errno));
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
exit(1);
fprintf(stderr, _("%s: incompatible server version %s\n"),
disconnect_and_exit(1);
disconnect_and_exit(1);
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not send replication command \"%s\": %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not initiate base backup: %s"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not get backup header: %s"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: no data returned from server\n"), progname);
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: final receive failed: %s"),
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not wait for child process: %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: child %d died, expected %d\n"),
disconnect_and_exit(1);
if (!WIFEXITED(status))
fprintf(stderr, _("%s: child process did not exit normally\n"),
disconnect_and_exit(1);
if (WEXITSTATUS(status) != 0)
fprintf(stderr, _("%s: child process exited with error %d\n"),
progname, WEXITSTATUS(status));
disconnect_and_exit(1);
fprintf(stderr,
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not wait for child thread: %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: could not get child thread exit status: %s\n"),
disconnect_and_exit(1);
fprintf(stderr, _("%s: child thread exited with error %u\n"),
disconnect_and_exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr, _("%s: invalid compression level \"%s\"\n"),
exit(1);
fprintf(stderr, _("%s: invalid checkpoint argument \"%s\", must be \"fast\" or \"spread\"\n"),
exit(1);
fprintf(stderr, _("%s: invalid status interval \"%s\"\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("%s: no target directory specified\n"), progname);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr, _("%s: transaction log directory location must be "
fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
exit(1);
fprintf(stderr,
exit(1);
fprintf(stderr, _("%s: could not create symbolic link \"%s\": %s\n"),
exit(1);
fprintf(stderr, _("%s: symlinks are not supported on this platform\n"));
exit(1);
--
>
> thanks, Andrej
> --------------details:
> environment: CentOS 6.7, postgres 9.5.1
> ( PostgreSQL 9.5.1 on x86_64-pc-linux-gnu, compiled by gcc (GCC) 4.4.7 20120313 (Red Hat 4.4.7-16), 64-bit)
>
> I tried 2 forms of pg_basebackup (-X fetch and -X stream). Both were issued from a script:
> # su - postgres -c "/usr/pgsql-9.5/bin/pg_basebackup -h ${DB_MASTER_IP} -D ${GEO_STDBY_DATA} -U pgreplic -P -v -X fetch" 2>${LOG_FILE}.stderr >> ${LOG_FILE}
> # echo $?
> 1 <--------------pg_basebackup failed!
> # cat log.stderr
> # cat /var/log/cluster/geo_repair.log.err
> transaction log start point: 0/E3000028 on timeline 1
> WARNING: skipping special file "./pg_hba.conf"
> WARNING: skipping special file "./pg_hba.conf.save"
> transaction log end point: 0/E30000F8
> pg_basebackup: base backup completed <------------------no reason for pg_basebackup failure!
> # cp /tmp/pg_hba.conf /tmp/postgresql.conf /pg_data/
> # su - postgres -c "/usr/pgsql-9.5/bin/pg_ctl -D /pg_data/ start"
> # tail /pg_data/pg_log/postgresql-Fri.log
> `pg_xlog/0000000100000000000000E2' -> `../backups/arc/0000000100000000000000E2'
> 2016-04-15 23:15:10 CEST:pgreplic@[unknown]:[10667] WARNING: skipping special file "./pg_hba.conf"
> 2016-04-15 23:15:10 CEST:pgreplic@[unknown]:[10667] WARNING: skipping special file "./pg_hba.conf.save" <---------------recorded in pg_log on master node and
> copied by pg_basebackup (note time difference between two servers)
> 2016-04-15 23:15:02 CEST:@:[23321] LOG: database system was interrupted; last known up at 2016-04-15 23:15:10 CEST
> 2016-04-15 23:15:02 CEST:postgres@postgres:[23329] FATAL: the database system is starting up
> 2016-04-15 23:15:03 CEST:@:[23321] LOG: entering standby mode
> 2016-04-15 23:15:03 CEST:@:[23321] LOG: database system was not properly shut down; automatic recovery in progress <---------something missing from pg_basebackup
> 2016-04-15 23:15:03 CEST:@:[23321] LOG: redo starts at 0/E3000028
> 2016-04-15 23:15:03 CEST:@:[23321] LOG: consistent recovery state reached at 0/E4000000
> 2016-04-15 23:15:03 CEST:@:[23295] LOG: database system is ready to accept read only connections
> 2016-04-15 23:15:03 CEST:@:[23356] LOG: started streaming WAL from primary at 0/E4000000 on timeline 1
> -------second trial
> # su - postgres -c "/usr/pgsql-9.5/bin/pg_basebackup -h ${DB_MASTER_IP} -D ${GEO_STDBY_DATA} -U pgreplic -P -v -X stream"
> # echo $?
> 1
> # cat /var/log/cluster/geo_repair.log.err
> transaction log start point: 0/E5000028 on timeline 1
> pg_basebackup: starting background WAL receiver
> WARNING: skipping special file "./pg_hba.conf"
> WARNING: skipping special file "./pg_hba.conf.save"
> transaction log end point: 0/E50000F8
> pg_basebackup: waiting for background process to finish streaming ...
> pg_basebackup: could not wait for child process: No child processes <----what does this mean? I think it failed to start process to fetching wal logs created
> during backup: but neither on master node neither on pg_basebackup output here is any information about reason.. (max_wal_senders on master is 10: I see no reason to
> fail).
>
> postgres logs:
> `pg_xlog/0000000100000000000000E4' -> `../backups/arc/0000000100000000000000E4'
> 2016-04-15 23:35:09 CEST:pgreplic@[unknown]:[29035] WARNING: skipping special file "./pg_hba.conf"
> 2016-04-15 23:35:09 CEST:pgreplic@[unknown]:[29035] WARNING: skipping special file "./pg_hba.conf.save"
> 2016-04-15 23:35:01 CEST:@:[28926] LOG: database system was interrupted; last known up at 2016-04-15 23:35:09 CEST
> 2016-04-15 23:35:01 CEST:postgres@postgres:[28938] FATAL: the database system is starting up
> 2016-04-15 23:35:02 CEST:@:[28926] LOG: entering standby mode
> 2016-04-15 23:35:02 CEST:@:[28926] LOG: database system was not properly shut down; automatic recovery in progress <------------this means something missing from
> pg_basebackup
> 2016-04-15 23:35:02 CEST:@:[28926] LOG: redo starts at 0/E5000028
> 2016-04-15 23:35:02 CEST:@:[28926] LOG: consistent recovery state reached at 0/E6000000
> 2016-04-15 23:35:02 CEST:@:[28904] LOG: database system is ready to accept read only connections
> 2016-04-15 23:35:02 CEST:@:[28989] LOG: started streaming WAL from primary at 0/E6000000 on timeline 1
>
> postgres params on master node:
> log_line_prefix = '%t:%u@%d:[%p] '
> logging_collector = on
> wal_buffers = 16MB
> max_wal_size = 200MB
> log_temp_files = 1MB
> max_connections = 170
> shared_buffers = 512MB
> effective_cache_size = 1500MB
> work_mem = 48MB
> log_lock_waits = on
> log_min_duration_statement = 10000
> shared_preload_libraries = 'pg_stat_statements'
> include '/var/lib/pgsql/tmp/rep_mode.conf' # added by pgsql RA
> wal_level = hot_standby
> archive_mode = on
> max_wal_senders = 10
> hot_standby = on
> wal_keep_segments = 128
> archive_command = '/opt/postgres/dbconf/archive_command.sh %p %f'
> wal_receiver_status_interval = 2
> max_standby_streaming_delay = -1
> max_standby_archive_delay = -1
> restart_after_crash = off
> hot_standby_feedback = on
>
Jerry Sievers
Postgres DBA/Development Consulting
e: postgres.consulting@xxxxxxxxxxx
p: 312.241.7800