I managed to get strace on both sides, however it doesn't make me wiser
- there is nothing obvious for me.
Additionally I see that replication works more or less for new messages,
but older are not processed.
I have several subfolders in my mailbox, some of them unreplicated. If I
change anything in the subfolder now - the folder is replicated, but
other subfolders remain not replicated - unless I change anything in them.
Below strace, maybe someone can find anything meaningful from it:
1: On replica
rename("/var/lib/imap/proc/6794.new", "/var/lib/imap/proc/6794") = 0
getpid() = 6794
openat(AT_FDCWD, "/var/lib/imap/proc/6794.new", O_RDWR|O_CREAT|O_TRUNC,
0666) = 12
fstat(12, {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
write(12, "imap\tifs.local.navi.pl [192.168."..., 53) = 53
close(12) = 0
rename("/var/lib/imap/proc/6794.new", "/var/lib/imap/proc/6794") = 0
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(1, [0], NULL, NULL, {tv_sec=1920, tv_nsec=0}, {[], 8}) = 1 (in
[0], left {tv_sec=1919, tv_nsec=999999238})
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(1, [0], NULL, NULL, {tv_sec=1920, tv_nsec=0}, {[], 8}) = 1 (in
[0], left {tv_sec=1919, tv_nsec=999999439})
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
read(0, "", 5) = 0
unlink("/var/lib/imap/proc/6794") = 0
openat(AT_FDCWD, "/dev/null", O_RDWR) = 12
shutdown(0, SHUT_RD) = -1 ENOTCONN (Transport
endpoint is not connected)
dup2(12, 0) = 0
shutdown(1, SHUT_RD) = -1 ENOTCONN (Transport
endpoint is not connected)
dup2(12, 1) = 1
shutdown(2, SHUT_RD) = -1 ENOTCONN (Transport
endpoint is not connected)
dup2(12, 2) = 2
close(12) = 0
close(11) = 0
getpid() = 6794
write(3, "\1\0\0\0\212\32\0\0", 8) = 8
rt_sigaction(SIGALRM, {sa_handler=0x7f57e06da710, sa_mask=[],
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO,
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGQUIT, {sa_handler=0x7f57e06da710, sa_mask=[],
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO,
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGINT, {sa_handler=0x7f57e06da710, sa_mask=[],
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO,
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGTERM, {sa_handler=0x7f57e06da710, sa_mask=[],
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO,
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGUSR2, {sa_handler=0x7f57e06da710, sa_mask=[],
sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO,
sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigaction(SIGHUP, {sa_handler=0x7f57e06da710, sa_mask=[],
sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f57dc71a960},
NULL, 8) = 0
alarm(83) = 0
fcntl(8, F_SETLKW, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0,
l_len=0}) = 0
stat("/usr/local/cyrus-3.0.13/libexec/imapd", {st_mode=S_IFREG|0755,
st_size=1181120, ...}) = 0
rt_sigaction(SIGHUP, {sa_handler=0x7f57e06da710, sa_mask=[],
sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(5, [4], NULL, NULL, NULL, {[], 8}) = ? ERESTARTNOHAND (To be
restarted if no handler)
--- SIGALRM {si_signo=SIGALRM, si_code=SI_KERNEL} ---
rt_sigreturn({mask=[INT QUIT ALRM TERM CHLD]}) = -1 EINTR (Interrupted
system call)
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigaction(SIGHUP, {sa_handler=0x7f57e06da710, sa_mask=[],
sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f57dc71a960},
NULL, 8) = 0
fcntl(8, F_SETLKW, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=0,
l_len=0}) = 0
close(5) = 0
munmap(0x7f57e14c1000, 16384) = 0
close(7) = 0
munmap(0x7f57e14bd000, 16384) = 0
close(6) = 0
unlink("/var/lib/imap/socket/idle.6794") = 0
close(9) = 0
munmap(0x7f57e14a9000, 16384) = 0
exit_group(0) = ?
+++ exited with 0 +++
2. On master:
nanosleep({1, 0}, 0x7ffe12318060) = 0
stat("/var/local/imapd_sync_stop", 0x7ffe12318280) = -1 ENOENT (No such
file or directory)
stat("/var/lib/imap/sync/log-run", 0x7ffe12318190) = -1 ENOENT (No such
file or directory)
stat("/var/lib/imap/sync/log", {st_mode=S_IFREG|0600, st_size=26, ...}) = 0
rename("/var/lib/imap/sync/log", "/var/lib/imap/sync/log-run") = 0
open("/var/lib/imap/sync/log-run", O_RDWR) = 8
fcntl(8, F_SETLKW, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0,
l_len=0}) = 0
fcntl(8, F_SETLKW, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=0,
l_len=0}) = 0
read(8, "MAILBOX navi.pl!user.olaf\n", 4096) = 26
read(8, "", 4096) = 0
write(6, "\27\3\3\0B\372\222\265gb\346\246$\211
\367e\241\212\30\364\345[GJ\204\225\n\255?\356\343"..., 71) = 71
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(7, [6], NULL, NULL, {0, 0}, {[], 8}) = 0 (Timeout)
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 103, MSG_NOSIGNAL,
NULL, 0) = 103
write(6,
"\27\3\3\0(\372\222\265gb\346\246%\216\222#\336\16t\320\236|bv\25\261)6/\242\217l"...,
45) = 45
sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 101, MSG_NOSIGNAL,
NULL, 0) = 101
sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 86, MSG_NOSIGNAL, NULL,
0) = 86
sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 113, MSG_NOSIGNAL,
NULL, 0) = 113
close(8) = 0
write(6,
"\27\3\3\0\"\372\222\265gb\346\246&\261B\342|\260\231\217\307[\16\\\335\211\324w\345\2544\""...,
39) = 39
shutdown(6, SHUT_RD) = 0
close(6) = 0
close(5) = 0
munmap(0x7fc6ed5bb000, 16384) = 0
close(3) = 0
munmap(0x7fc6ed5d0000, 49152) = 0
close(4) = 0
exit_group(0) = ?
+++ exited with 0 +++
On 2020-04-21 15:00, Michael Menge wrote:
Quoting Olaf Frączyk <olaf@xxxxxxx>:
Thank you for the telemetry hint :)
I don't use the syncserver - the replication is done via IMAP port on
the replica side. I have no idea how to have strace spawned by cyrus
master process. When I attach later to imapd using strace -p I'm
afraid some info already will be lost.
You can try "/usr/bin/strace <strace_option> <path_to_imapd>/imapd
<imapd_options>" as cmd
or you can use prefork=1 in the service to prefok one imap process and
connect
to this one before you start the sync_client.
The syncserver is marked as deprecated in the docs, so I went with
the more modern option. Maybe here is the problem ;)
i am using syncserver since cyrus-imapd 2.3. So I had no reason to
change it.
The funny thing is that from time to time the replication progresses
a little. I don't like non-repetitive behavior ;)
Thanks again for the hints.
On 2020-04-21 14:13, Michael Menge wrote:
Hi,
Quoting Olaf Frączyk <olaf@xxxxxxx>:
The current situation is:
1. Replica:
stopped and started the replica
no activity on replica - iotop and top show nothing
the only messages on replica is incoming connection from master
2. Master:
when I run sync_client -r I still get:
Apr 21 12:38:36 ifs sync_client[29518]: Reprocessing sync log file
/var/lib/imap/sync/log-run
Apr 21 12:43:27 ifs sync_client[29518]: IOERROR: zero length
response to MAILBOXES (idle for too long)
Apr 21 12:43:27 ifs sync_client[29518]: IOERROR: zero length
response to RESTART (idle for too long)
Apr 21 12:43:27 ifs sync_client[29518]: Error in do_sync(): bailing
out! Bad protocol
Apr 21 12:43:27 ifs sync_client[29518]: Processing sync log file
/var/lib/imap/sync/log-run failed: Bad protocol
3. There is 27 GB of about 45 GB replicated and there is no further
progress
4. How to find out why the replica doesn't respond?
You can enable telemetry logging on the replica by creating a folder
/var/lib/imap/log/<SYNC_AUTHNAME>
where <SYNC_AUTHNAME> is the value of the sync_authname. If you give
cyrus write permissions for this folder
it will create log-files for each process and what it received and
send from/to the sync_client with timestamps.
Also you can try to use strace on the syncserver process to figure
out which files it is accessing
On 2020-04-21 11:18, Olaf Frączyk wrote:
I also found out that when I see on master:
Apr 21 11:12:38 ifs sync_client[27996]: IOERROR: zero length
response to MAILBOXES (idle for too long)
Apr 21 11:12:38 ifs sync_client[27996]: IOERROR: zero length
response to RESTART (idle for too long)
Apr 21 11:12:38 ifs sync_client[27996]: Error in do_sync():
bailing out! Bad protocol
Apr 21 11:12:38 ifs sync_client[27996]: Processing sync log file
/var/lib/imap/sync/log-run failed: Bad protocol
I also get on the replica:
Apr 21 11:12:38 skink1 imap[5775]: Connection reset by peer,
closing connection
But I also see, that before it happens, there is no activity both
on the replica and on the master for some time.
So maybe the imap server process is not recovering correctly in
the longlock situation?
On 2020-04-21 11:07, Olaf Frączyk wrote:
Hi,
When I run sync_client -r on the master I see the following on
the replica:
Apr 21 10:56:15 skink1 imap[5775]: mailbox: longlock
navi.pl!user.olaf for 1.7 seconds
Apr 21 10:56:20 skink1 imap[5775]: mailbox: longlock
navi.pl!user.piotr for 2.0 seconds
Apr 21 10:56:23 skink1 imap[5775]: mailbox: longlock
navi.pl!user.olaf for 2.9 seconds
Apr 21 10:56:26 skink1 imap[5775]: mailbox: longlock
navi.pl!user.piotr for 3.0 seconds
The mailboxes have several GB in Inbox folder and several GB in
subfolders. The inbox folders have about 20,000 messages, the
subfolders upto 15,000
Could it cause problems?
the longlock is normaly not the problem. While on process has the lock
no other process can write to the mailbox, but on the replica there
normaly
is no other process that should access the mailbox
Maybe I should move the sync_client from START section to
SERVICES, it seems that it is not automatically restarted
I havend tried starting sync_client in the service section.
I start the sync_client via systemd.
On 2020-04-21 08:47, Michael Menge wrote:
Hi Olaf
Quoting Olaf Frączyk <olaf@xxxxxxx>:
Hi,
I upgraded to 3.0.13 but it didn't help.
This time it copied about 18GB
in the logs I still see:
1 - inefficient replication
2 - IOERROR: zero length response to MAILBOXES (idle for too long)
IOERROR: zero length response to RESTART (idle for too long)
Error in do_sync(): bailing out! Bad protocol
But I have no idea what can I do next and why it fails
Apr 21 02:24:46 ifs sync_client[12656]: IOERROR: zero length
response to MAILBOXES (idle for too long)
Apr 21 02:24:46 ifs sync_client[12656]: IOERROR: zero length
response to RESTART (idle for too long)
Apr 21 02:24:46 ifs sync_client[12656]: Error in do_sync():
bailing out! Bad protocol
do you see any errors on the syncserver side. The error look
like the
sync_client is waiting for a reply of the server.
--------------------------------------------------------------------------------
M.Menge Tel.: (49) 7071/29-70316
Universität Tübingen Fax.: (49) 7071/29-5912
Zentrum für Datenverarbeitung mail:
michael.menge@xxxxxxxxxxxxxxxxxxxx
Wächterstraße 76
72074 Tübingen
----
Cyrus Home Page: http://www.cyrusimap.org/
List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
To Unsubscribe:
https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus
----
Cyrus Home Page: http://www.cyrusimap.org/
List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
To Unsubscribe:
https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus
--------------------------------------------------------------------------------
M.Menge Tel.: (49) 7071/29-70316
Universität Tübingen Fax.: (49) 7071/29-5912
Zentrum für Datenverarbeitung mail:
michael.menge@xxxxxxxxxxxxxxxxxxxx
Wächterstraße 76
72074 Tübingen
----
Cyrus Home Page: http://www.cyrusimap.org/
List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
To Unsubscribe:
https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus
----
Cyrus Home Page: http://www.cyrusimap.org/
List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
To Unsubscribe:
https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus