Re: Replication failed 3.0.5 -> 3.0.13, now 3.0.13->3.0.13

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



I managed to get strace on both sides, however it doesn't make me wiser - there is nothing obvious for me.

Additionally I see that replication works more or less for new messages, but older are not processed.

I have several subfolders in my mailbox, some of them unreplicated. If I change anything in the subfolder now - the folder is replicated, but other subfolders remain not replicated - unless I change anything in them.

Below strace, maybe someone can find anything meaningful from it:

1: On replica

rename("/var/lib/imap/proc/6794.new", "/var/lib/imap/proc/6794") = 0
getpid()                                = 6794
openat(AT_FDCWD, "/var/lib/imap/proc/6794.new", O_RDWR|O_CREAT|O_TRUNC, 0666) = 12
fstat(12, {st_mode=S_IFREG|0600, st_size=0, ...}) = 0
write(12, "imap\tifs.local.navi.pl [192.168."..., 53) = 53
close(12)                               = 0
rename("/var/lib/imap/proc/6794.new", "/var/lib/imap/proc/6794") = 0
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(1, [0], NULL, NULL, {tv_sec=1920, tv_nsec=0}, {[], 8}) = 1 (in [0], left {tv_sec=1919, tv_nsec=999999238})
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(1, [0], NULL, NULL, {tv_sec=1920, tv_nsec=0}, {[], 8}) = 1 (in [0], left {tv_sec=1919, tv_nsec=999999439})
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
read(0, "", 5)                          = 0
unlink("/var/lib/imap/proc/6794")       = 0
openat(AT_FDCWD, "/dev/null", O_RDWR)   = 12
shutdown(0, SHUT_RD)                    = -1 ENOTCONN (Transport endpoint is not connected)
dup2(12, 0)                             = 0
shutdown(1, SHUT_RD)                    = -1 ENOTCONN (Transport endpoint is not connected)
dup2(12, 1)                             = 1
shutdown(2, SHUT_RD)                    = -1 ENOTCONN (Transport endpoint is not connected)
dup2(12, 2)                             = 2
close(12)                               = 0
close(11)                               = 0
getpid()                                = 6794
write(3, "\1\0\0\0\212\32\0\0", 8)      = 8
rt_sigaction(SIGALRM, {sa_handler=0x7f57e06da710, sa_mask=[], sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0 rt_sigaction(SIGQUIT, {sa_handler=0x7f57e06da710, sa_mask=[], sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0 rt_sigaction(SIGINT, {sa_handler=0x7f57e06da710, sa_mask=[], sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0 rt_sigaction(SIGTERM, {sa_handler=0x7f57e06da710, sa_mask=[], sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0 rt_sigaction(SIGUSR2, {sa_handler=0x7f57e06da710, sa_mask=[], sa_flags=SA_RESTORER|SA_RESETHAND|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0 rt_sigaction(SIGHUP, {sa_handler=0x7f57e06da710, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
alarm(83)                               = 0
fcntl(8, F_SETLKW, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0 stat("/usr/local/cyrus-3.0.13/libexec/imapd", {st_mode=S_IFREG|0755, st_size=1181120, ...}) = 0 rt_sigaction(SIGHUP, {sa_handler=0x7f57e06da710, sa_mask=[], sa_flags=SA_RESTORER|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(5, [4], NULL, NULL, NULL, {[], 8}) = ? ERESTARTNOHAND (To be restarted if no handler)
--- SIGALRM {si_signo=SIGALRM, si_code=SI_KERNEL} ---
rt_sigreturn({mask=[INT QUIT ALRM TERM CHLD]}) = -1 EINTR (Interrupted system call)
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigaction(SIGHUP, {sa_handler=0x7f57e06da710, sa_mask=[], sa_flags=SA_RESTORER|SA_RESTART|SA_SIGINFO, sa_restorer=0x7f57dc71a960}, NULL, 8) = 0 fcntl(8, F_SETLKW, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0
close(5)                                = 0
munmap(0x7f57e14c1000, 16384)           = 0
close(7)                                = 0
munmap(0x7f57e14bd000, 16384)           = 0
close(6)                                = 0
unlink("/var/lib/imap/socket/idle.6794") = 0
close(9)                                = 0
munmap(0x7f57e14a9000, 16384)           = 0
exit_group(0)                           = ?
+++ exited with 0 +++


2. On master:

nanosleep({1, 0}, 0x7ffe12318060)       = 0
stat("/var/local/imapd_sync_stop", 0x7ffe12318280) = -1 ENOENT (No such file or directory) stat("/var/lib/imap/sync/log-run", 0x7ffe12318190) = -1 ENOENT (No such file or directory)
stat("/var/lib/imap/sync/log", {st_mode=S_IFREG|0600, st_size=26, ...}) = 0
rename("/var/lib/imap/sync/log", "/var/lib/imap/sync/log-run") = 0
open("/var/lib/imap/sync/log-run", O_RDWR) = 8
fcntl(8, F_SETLKW, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0 fcntl(8, F_SETLKW, {l_type=F_UNLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = 0
read(8, "MAILBOX navi.pl!user.olaf\n", 4096) = 26
read(8, "", 4096)                       = 0
write(6, "\27\3\3\0B\372\222\265gb\346\246$\211 \367e\241\212\30\364\345[GJ\204\225\n\255?\356\343"..., 71) = 71
rt_sigprocmask(SIG_BLOCK, [INT QUIT ALRM TERM CHLD], [], 8) = 0
pselect6(7, [6], NULL, NULL, {0, 0}, {[], 8}) = 0 (Timeout)
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 103, MSG_NOSIGNAL, NULL, 0) = 103 write(6, "\27\3\3\0(\372\222\265gb\346\246%\216\222#\336\16t\320\236|bv\25\261)6/\242\217l"..., 45) = 45 sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 101, MSG_NOSIGNAL, NULL, 0) = 101 sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 86, MSG_NOSIGNAL, NULL, 0) = 86 sendto(7, "<179>Apr 21 14:56:46 sync_client"..., 113, MSG_NOSIGNAL, NULL, 0) = 113
close(8)                                = 0
write(6, "\27\3\3\0\"\372\222\265gb\346\246&\261B\342|\260\231\217\307[\16\\\335\211\324w\345\2544\""..., 39) = 39
shutdown(6, SHUT_RD)                    = 0
close(6)                                = 0
close(5)                                = 0
munmap(0x7fc6ed5bb000, 16384)           = 0
close(3)                                = 0
munmap(0x7fc6ed5d0000, 49152)           = 0
close(4)                                = 0
exit_group(0)                           = ?
+++ exited with 0 +++

On 2020-04-21 15:00, Michael Menge wrote:

Quoting Olaf Frączyk <olaf@xxxxxxx>:

Thank you for the telemetry hint :)

I don't use the syncserver - the replication is done via IMAP port on the replica side. I have no idea how to have strace spawned by cyrus master process. When I attach later to imapd using strace -p I'm afraid some info already will be lost.


You can try "/usr/bin/strace <strace_option> <path_to_imapd>/imapd <imapd_options>" as cmd or you can use prefork=1 in the service to prefok one imap process and connect
to this one before you start the sync_client.



The syncserver is marked as deprecated in the docs, so I went with the more modern option. Maybe here is the problem ;)


i am using syncserver since cyrus-imapd 2.3. So I had no reason to change it.


The funny thing is that from time to time the replication progresses a little. I don't like non-repetitive behavior ;)

Thanks again for the hints.

On 2020-04-21 14:13, Michael Menge wrote:
Hi,

Quoting Olaf Frączyk <olaf@xxxxxxx>:

The current situation is:

1. Replica:

stopped and started the replica

no activity on replica - iotop and top show nothing

the only messages on replica is incoming connection from master

2. Master:

when I run sync_client -r I still get:

Apr 21 12:38:36 ifs sync_client[29518]: Reprocessing sync log file /var/lib/imap/sync/log-run Apr 21 12:43:27 ifs sync_client[29518]: IOERROR: zero length response to MAILBOXES (idle for too long) Apr 21 12:43:27 ifs sync_client[29518]: IOERROR: zero length response to RESTART (idle for too long) Apr 21 12:43:27 ifs sync_client[29518]: Error in do_sync(): bailing out! Bad protocol Apr 21 12:43:27 ifs sync_client[29518]: Processing sync log file /var/lib/imap/sync/log-run failed: Bad protocol

3. There is 27 GB of about 45 GB replicated and there is no further progress

4. How to find out why the replica doesn't respond?


You can enable telemetry logging on the replica by creating a folder /var/lib/imap/log/<SYNC_AUTHNAME> where <SYNC_AUTHNAME> is the value of the sync_authname. If you give cyrus write permissions for this folder it will create log-files for each process and what it received and send from/to the sync_client with timestamps.

Also you can try to use strace on the syncserver process to figure out which files it is accessing


On 2020-04-21 11:18, Olaf Frączyk wrote:
I also found out that when I see on master:

Apr 21 11:12:38 ifs sync_client[27996]: IOERROR: zero length response to MAILBOXES (idle for too long) Apr 21 11:12:38 ifs sync_client[27996]: IOERROR: zero length response to RESTART (idle for too long) Apr 21 11:12:38 ifs sync_client[27996]: Error in do_sync(): bailing out! Bad protocol Apr 21 11:12:38 ifs sync_client[27996]: Processing sync log file /var/lib/imap/sync/log-run failed: Bad protocol

I also get on the replica:

Apr 21 11:12:38 skink1 imap[5775]: Connection reset by peer, closing connection

But I also see, that before it happens, there is no activity both on the replica and on the master for some time.

So maybe the imap server process is not recovering correctly in the longlock situation?

On 2020-04-21 11:07, Olaf Frączyk wrote:
Hi,

When I run sync_client -r on the master I see the following on the replica:

Apr 21 10:56:15 skink1 imap[5775]: mailbox: longlock navi.pl!user.olaf for 1.7 seconds Apr 21 10:56:20 skink1 imap[5775]: mailbox: longlock navi.pl!user.piotr for 2.0 seconds Apr 21 10:56:23 skink1 imap[5775]: mailbox: longlock navi.pl!user.olaf for 2.9 seconds Apr 21 10:56:26 skink1 imap[5775]: mailbox: longlock navi.pl!user.piotr for 3.0 seconds

The mailboxes have several GB in Inbox folder and several GB in subfolders. The inbox folders have about 20,000 messages, the subfolders upto 15,000

Could it cause problems?

the longlock is normaly not the problem. While on process has the lock
no other process can write to the mailbox, but on the replica there normaly
is no other process that should access the mailbox


Maybe I should move the sync_client from START section to SERVICES, it seems that it is not automatically restarted

I havend tried starting sync_client in the service section.
I start the sync_client via systemd.

On 2020-04-21 08:47, Michael Menge wrote:
Hi Olaf


Quoting Olaf Frączyk <olaf@xxxxxxx>:

Hi,

I upgraded to 3.0.13 but it didn't help.

This time it copied about 18GB

in the logs I still see:

1 - inefficient replication

2 - IOERROR: zero length response to MAILBOXES (idle for too long)
IOERROR: zero length response to RESTART (idle for too long)
Error in do_sync(): bailing out! Bad protocol

But I have no idea what can I do next and why it fails

Apr 21 02:24:46 ifs sync_client[12656]: IOERROR: zero length response to MAILBOXES (idle for too long) Apr 21 02:24:46 ifs sync_client[12656]: IOERROR: zero length response to RESTART (idle for too long) Apr 21 02:24:46 ifs sync_client[12656]: Error in do_sync(): bailing out! Bad protocol

do you see any errors on the syncserver side. The error look like the
sync_client is waiting for a reply of the server.


-------------------------------------------------------------------------------- M.Menge                                Tel.: (49) 7071/29-70316
Universität Tübingen                   Fax.: (49) 7071/29-5912
Zentrum für Datenverarbeitung          mail: michael.menge@xxxxxxxxxxxxxxxxxxxx
Wächterstraße 76
72074 Tübingen

----
Cyrus Home Page: http://www.cyrusimap.org/
List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
To Unsubscribe:
https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus
----
Cyrus Home Page: http://www.cyrusimap.org/
List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
To Unsubscribe:
https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus



--------------------------------------------------------------------------------
M.Menge                                Tel.: (49) 7071/29-70316
Universität Tübingen                   Fax.: (49) 7071/29-5912
Zentrum für Datenverarbeitung          mail: michael.menge@xxxxxxxxxxxxxxxxxxxx
Wächterstraße 76
72074 Tübingen

----
Cyrus Home Page: http://www.cyrusimap.org/
List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
To Unsubscribe:
https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus
----
Cyrus Home Page: http://www.cyrusimap.org/
List Archives/Info: http://lists.andrew.cmu.edu/pipermail/info-cyrus/
To Unsubscribe:
https://lists.andrew.cmu.edu/mailman/listinfo/info-cyrus




[Index of Archives]     [Cyrus SASL]     [Squirrel Mail]     [Asterisk PBX]     [Video For Linux]     [Photo]     [Yosemite News]     [gtk]     [KDE]     [Gimp on Windows]     [Steve's Art]

  Powered by Linux