Hi, openais crashed while a script was executing the "openais-cfgtool -r" command every 10 minutes. The cluster uses RRP, and one ring was disconnected at that time. I dumped the following information from the core-dump file and collected the logs. Core was generated by `aisexec'. Program terminated with signal 11, Segmentation fault. #0 0x00007f3735ccfca9 in memcpy () from /lib64/libc.so.6 Missing separate debuginfos, use: debuginfo-install cman-2.0.115-49.el6.x86_64 glibc-2.12-1.7.el6.x86_64 libgcc-4.4.4-13.el6.x86_64 (gdb) info threads 10 Thread 3970 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 9 Thread 3968 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 8 Thread 3498 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 7 Thread 3445 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 6 Thread 3443 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 5 Thread 3440 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 4 Thread 3418 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 3 Thread 3400 0x00007f3735d23e33 in poll () from /lib64/libc.so.6 2 Thread 3403 0x00007f3735ccc010 in __strlen_sse2 () from /lib64/libc.so.6 * 1 Thread 3399 0x00007f3735ccfca9 in memcpy () from /lib64/libc.so.6 (gdb) bt #0 0x00007f3735ccfca9 in memcpy () from /lib64/libc.so.6 #1 0x0000000000424563 in openais_response_send (conn=0xb775c0, msg=0x7fff7c753590, mlen=24) at ipc.c:981 #2 0x00007f372f315e40 in message_handler_req_exec_cfg_ringreenable (message=0x7fff7c753630, nodeid=4) at cfg.c:271 #3 0x000000000041f121 in deliver_fn (nodeid=4, iovec=0x7fff7c7536d0, iov_len=1, endian_conversion_required=0) at main.c:407 #4 0x00000000004188fc in app_deliver_fn (nodeid=4, iovec=0x7fff7c7536c0, iov_len=1, endian_conversion_required=0) at totempg.c:462 #5 0x0000000000418f60 in totempg_deliver_fn (nodeid=4, iovec=0x7f37349842c0, iov_len=1, endian_conversion_required=0) at totempg.c:606 #6 0x0000000000417a49 in totemmrp_deliver_fn (nodeid=4, iovec=0x7f37349842c0, iov_len=3, 
endian_conversion_required=0) at totemmrp.c:82 #7 0x0000000000415645 in messages_deliver_to_app (instance=0x7f37367b8010, skip=0, end_point=5) at totemsrp.c:3592 #8 0x0000000000415208 in message_handler_orf_token (instance=0x7f37367b8010, msg=0xb63a28, msg_len=70, endian_conversion_needed=0) at totemsrp.c:3464 #9 0x0000000000417860 in main_deliver_fn (context=0x7f37367b8010, msg=0xb63a28, msg_len=70) at totemsrp.c:4208 #10 0x000000000040c74c in active_token_recv (instance=0xb1a200, iface_no=0, context=0x7f37367b8010, msg=0xb63a28, msg_len=70, token_seq=19) at totemrrp.c:1174 #11 0x000000000040cc3b in rrp_deliver_fn (context=0xb1a2f0, msg=0xb63a28, msg_len=70) at totemrrp.c:1331 #12 0x000000000040919f in net_deliver_fn (handle=0, fd=5, revents=1, data="" at totemnet.c:708 #13 0x0000000000406e03 in poll_run (handle=0) at aispoll.c:402 #14 0x000000000041fb32 in main (argc=1, argv=0x7fff7c7571c8) at main.c:629 (gdb) thread 2 [Switching to thread 2 (Thread 3403)]#0 0x00007f3735ccc010 in __strlen_sse2 () from /lib64/libc.so.6 (gdb) bt #0 0x00007f3735ccc010 in __strlen_sse2 () from /lib64/libc.so.6 #1 0x00007f372d4f9d45 in ?? () from /lib64/libgcc_s.so.1 #2 0x00007f372d4fa8bc in ?? () from /lib64/libgcc_s.so.1 #3 0x00007f3735d6a8a6 in dl_iterate_phdr () from /lib64/libc.so.6 #4 0x00007f372d4fb1d7 in _Unwind_Find_FDE () from /lib64/libgcc_s.so.1 #5 0x00007f372d4f85a3 in ?? () from /lib64/libgcc_s.so.1 #6 0x00007f372d4f8ee9 in ?? 
() from /lib64/libgcc_s.so.1 #7 0x00007f372d4f9226 in _Unwind_ForcedUnwind () from /lib64/libgcc_s.so.1 #8 0x00007f3735fd8c60 in __pthread_unwind () from /lib64/libpthread.so.0 #9 0x00007f3735fd0b1a in sigcancel_handler () from /lib64/libpthread.so.0 #10 <signal handler called> #11 0x00007f3735fd643c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #12 0x000000000041df79 in worker_thread (thread_data_in=0xb17df8) at wthread.c:73 #13 0x00007f3735fd27e1 in start_thread () from /lib64/libpthread.so.0 #14 0x00007f3735d2d53d in clone () from /lib64/libc.so.6 (gdb) thread 3 [Switching to thread 3 (Thread 3400)]#0 0x00007f3735d23e33 in poll () from /lib64/libc.so.6 (gdb) bt #0 0x00007f3735d23e33 in poll () from /lib64/libc.so.6 #1 0x00000000004255ac in prioritized_timer_thread (data="" at timer.c:125 #2 0x00007f3735fd27e1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f3735d2d53d in clone () from /lib64/libc.so.6 (gdb) thread 4 [Switching to thread 4 (Thread 3418)]#0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 (gdb) bt #0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 #1 0x000000000042327a in pthread_ipc_consumer (conn=0xb758c0) at ipc.c:414 #2 0x00007f3735fd27e1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f3735d2d53d in clone () from /lib64/libc.so.6 (gdb) thread 5 [Switching to thread 5 (Thread 3440)]#0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 (gdb) bt #0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 #1 0x000000000042327a in pthread_ipc_consumer (conn=0xb759c0) at ipc.c:414 #2 0x00007f3735fd27e1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f3735d2d53d in clone () from /lib64/libc.so.6 (gdb) thread 6 [Switching to thread 6 (Thread 3443)]#0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 (gdb) byt Undefined command: "byt". Try "help". (gdb) by Undefined command: "by". Try "help". 
(gdb) bt #0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 [Switching to thread 7 (Thread 3445)]#0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 (gdb) bt #0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 #1 0x000000000042327a in pthread_ipc_consumer (conn=0xb79100) at ipc.c:414 #2 0x00007f3735fd27e1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f3735d2d53d in clone () from /lib64/libc.so.6 (gdb) thread 8 [Switching to thread 8 (Thread 3498)]#0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 (gdb) bt #0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 #1 0x000000000042327a in pthread_ipc_consumer (conn=0xb78ff0) at ipc.c:414 #2 0x00007f3735fd27e1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f3735d2d53d in clone () from /lib64/libc.so.6 (gdb) thread 9 [Switching to thread 9 (Thread 3968)]#0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 (gdb) bt #0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 #1 0x000000000042327a in pthread_ipc_consumer (conn=0xb79f00) at ipc.c:414 #2 0x00007f3735fd27e1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f3735d2d53d in clone () from /lib64/libc.so.6 (gdb) thread 10 [Switching to thread 10 (Thread 3970)]#0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 (gdb) bt #0 0x00007f3735d2fb67 in semop () from /lib64/libc.so.6 #1 0x000000000042327a in pthread_ipc_consumer (conn=0xb77270) at ipc.c:414 #2 0x00007f3735fd27e1 in start_thread () from /lib64/libpthread.so.0 #3 0x00007f3735d2d53d in clone () from /lib64/libc.so.6 The following is the log: 2012-06-11T16:07:12.719555+08:00 h56 openais[3399]: [TOTEM] entering GATHER state from 0. 2012-06-11T16:07:12.720465+08:00 h56 openais[3399]: [TOTEM] Creating commit token because I am the rep. 2012-06-11T16:07:12.720490+08:00 h56 openais[3399]: [TOTEM] Storing new sequence id for ring 32c38 2012-06-11T16:07:12.720734+08:00 h56 openais[3399]: [TOTEM] entering COMMIT state. 
2012-06-11T16:07:12.724990+08:00 h56 openais[3399]: [TOTEM] entering RECOVERY state. 2012-06-11T16:07:12.726192+08:00 h56 openais[3399]: [TOTEM] position [0] member 192.168.1.56: 2012-06-11T16:07:12.726222+08:00 h56 openais[3399]: [TOTEM] previous ring seq 207924 rep 192.168.1.54 2012-06-11T16:07:12.726227+08:00 h56 openais[3399]: [TOTEM] aru e1 high delivered e1 received flag 1 2012-06-11T16:07:12.726231+08:00 h56 openais[3399]: [TOTEM] position [1] member 192.168.1.58: 2012-06-11T16:07:12.726235+08:00 h56 openais[3399]: [TOTEM] previous ring seq 207924 rep 192.168.1.54 2012-06-11T16:07:12.726247+08:00 h56 openais[3399]: [TOTEM] aru e1 high delivered e1 received flag 1 2012-06-11T16:07:12.726251+08:00 h56 openais[3399]: [TOTEM] position [2] member 192.168.1.60: 2012-06-11T16:07:12.726255+08:00 h56 openais[3399]: [TOTEM] previous ring seq 207924 rep 192.168.1.54 2012-06-11T16:07:12.726260+08:00 h56 openais[3399]: [TOTEM] aru e1 high delivered e1 received flag 1 2012-06-11T16:07:12.726264+08:00 h56 openais[3399]: [TOTEM] position [3] member 192.168.1.68: 2012-06-11T16:07:12.726267+08:00 h56 openais[3399]: [TOTEM] previous ring seq 207924 rep 192.168.1.54 2012-06-11T16:07:12.726271+08:00 h56 openais[3399]: [TOTEM] aru e1 high delivered e1 received flag 1 2012-06-11T16:07:12.726275+08:00 h56 openais[3399]: [TOTEM] Did not need to originate any messages in recovery. 
2012-06-11T16:07:12.734362+08:00 h56 openais[3399]: [TOTEM] Sending initial ORF token 2012-06-11T16:07:13.239944+08:00 h56 openais[3399]: [TOTEM] Sending initial ORF token 2012-06-11T16:07:14.758583+08:00 h56 openais[3399]: [CLM ] CLM CONFIGURATION CHANGE 2012-06-11T16:07:14.758605+08:00 h56 openais[3399]: [CLM ] New Configuration: 2012-06-11T16:07:14.758609+08:00 h56 openais[3399]: [CLM ] #011r(0) ip(192.168.1.56) r(1) ip(11.168.1.56) 2012-06-11T16:07:14.758612+08:00 h56 openais[3399]: [CLM ] #011r(0) ip(192.168.1.58) r(1) ip(11.168.1.58) 2012-06-11T16:07:14.758616+08:00 h56 openais[3399]: [CLM ] #011r(0) ip(192.168.1.60) r(1) ip(11.168.1.60) 2012-06-11T16:07:14.758620+08:00 h56 openais[3399]: [CLM ] #011r(0) ip(192.168.1.68) r(1) ip(11.168.1.68) 2012-06-11T16:07:14.758623+08:00 h56 openais[3399]: [CLM ] Members Left: 2012-06-11T16:07:14.758626+08:00 h56 openais[3399]: [CLM ] #011r(0) ip(192.168.1.54) r(1) ip(11.168.1.54) 2012-06-11T16:07:14.758630+08:00 h56 openais[3399]: [CLM ] Members Joined: 2012-06-11T16:07:14.758633+08:00 h56 openais[3399]: [CLM ] CLM CONFIGURATION CHANGE 2012-06-11T16:07:14.758637+08:00 h56 openais[3399]: [CLM ] New Configuration: 2012-06-11T16:07:14.758806+08:00 h56 openais[3399]: [CLM ] #011r(0) ip(192.168.1.56) r(1) ip(11.168.1.56) 2012-06-11T16:07:14.758823+08:00 h56 openais[3399]: [CLM ] #011r(0) ip(192.168.1.58) r(1) ip(11.168.1.58) 2012-06-11T16:07:14.758827+08:00 h56 openais[3399]: [CLM ] #011r(0) ip(192.168.1.60) r(1) ip(11.168.1.60) 2012-06-11T16:07:14.758830+08:00 h56 openais[3399]: [CLM ] #011r(0) ip(192.168.1.68) r(1) ip(11.168.1.68) 2012-06-11T16:07:14.758834+08:00 h56 openais[3399]: [CLM ] Members Left: 2012-06-11T16:07:14.758837+08:00 h56 openais[3399]: [CLM ] Members Joined: 2012-06-11T16:07:14.758840+08:00 h56 openais[3399]: [SYNC ] This node is within the primary component and will provide service. 
2012-06-11T16:07:14.758840+08:00 h56 openais[3399]: [SYNC ] This node is within the primary component and will provide service. 2012-06-11T16:07:14.759505+08:00 h56 openais[3399]: [TOTEM] entering OPERATIONAL state. 2012-06-11T16:07:14.759780+08:00 h56 gfs_controld[3434]: cluster node 5 removed 2012-06-11T16:07:14.759791+08:00 h56 gfs_controld[3434]: node_history_cluster_remove no nodeid 5 2012-06-11T16:07:14.759944+08:00 h56 clurgmgrd[3477]: <info> State change: 192.168.1.54 DOWN 2012-06-11T16:07:15.343206+08:00 h56 fenced[3422]: cluster is down, exiting 2012-06-11T16:07:15.343845+08:00 h56 dlm_controld[3428]: cluster is down, exiting 2012-06-11T16:07:15.343863+08:00 h56 gfs_controld[3434]: cluster is down, exiting 2012-06-11T16:07:15.343867+08:00 h56 gfs_controld[3434]: groupd_dispatch error -1 errno 112 2012-06-11T16:07:15.343870+08:00 h56 gfs_controld[3434]: cpg_dispatch error 2 2012-06-11T16:07:15.344059+08:00 h56 ccsd[3391]: Attempt to close an unopened CCS descriptor (250). 2012-06-11T16:07:15.344069+08:00 h56 ccsd[3391]: Error while processing disconnect: Invalid request descriptor 2012-06-11T16:07:15.344513+08:00 h56 clurgmgrd[3477]: <warning> #67: Shutting down uncleanly |
_______________________________________________ discuss mailing list discuss@xxxxxxxxxxxx http://lists.corosync.org/mailman/listinfo/discuss