Thanks Sage. Even after I stopped mon.1 (19.193.207.131), the problem still persisted, those two left mons kept electing. I checked the network, clock and they are all good. One problem I noticed is the %CPU of the monitor being elected as leader, it is always around 100%, no matter we have 3 monitors in or 2 monitors in, the other monitor's %CPU are less than 10%. I used *pstack* checking the busy one and have the following result, looks like thread #25 make it busy. Meanwhile, I kept seeing the logs like '2014-01-24 11:03:18.959087 7f87eff4b700 0 mon.osd153@1(leader) e1 handle_command mon_command({"prefix": "osd crush create-or-move", "args": ["root=default", "host=osd84"], "id": 725, "weight": 3.6299999999999999} v 0) v1' from within the leader's log, though all OSDs were stopped at the time, is it like event replay? bash-4.1$ sudo pstack 1159 Thread 30 (Thread 0x7f87f7619700 (LWP 1160)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007b7acb in ceph::log::Log::entry() () #2 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #3 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 29 (Thread 0x7f87f64bd700 (LWP 1161)): #0 0x00000037a980d811 in sem_timedwait () from /lib64/libpthread.so.0 #1 0x000000000070de60 in CephContextServiceThread::entry() () #2 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #3 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 28 (Thread 0x7f87f5abc700 (LWP 1162)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x0000000000705d5a in AdminSocket::entry() () #2 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #3 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 27 (Thread 0x7f87f134d700 (LWP 1163)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x0000000000731d31 in SimpleMessenger::reaper_entry() () #2 0x000000000073475d in SimpleMessenger::ReaperThread::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 26 (Thread 0x7f87f094c700 (LWP 1164)): #0 0x00000037a980e054 in __lll_lock_wait () from /lib64/libpthread.so.0 #1 0x00000037a9810e1d in _L_cond_lock_886 () from /lib64/libpthread.so.0 #2 0x00000037a9810cf7 in __pthread_mutex_cond_lock () from /lib64/libpthread.so.0 #3 0x00000037a980b875 in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #4 0x000000000072715e in SafeTimer::timer_thread() () #5 0x0000000000728fbd in SafeTimerThread::entry() () #6 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #7 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 25 (Thread 0x7f87eff4b700 (LWP 1165)): #0 0x0000000000707e8e in crush_hash32_3 () #1 0x0000000000734c14 in crush_choose () #2 0x000000000073518f in crush_do_rule () #3 0x000000000068a8a3 in CrushWrapper::do_rule(int, int, std::vector<int, std::allocator<int> >&, int, std::vector<unsigned int, std::allocator<unsigned int> > const&) const () #4 0x0000000000720dff in OSDMap::_pg_to_osds(pg_pool_t const&, pg_t, std::vector<int, std::allocator<int> >&) const () #5 0x0000000000720ff1 in OSDMap::pg_to_raw_up(pg_t, std::vector<int, std::allocator<int> >&) const () #6 0x0000000000590ee6 in OSDMonitor::remove_redundant_pg_temp() () #7 0x00000000005adfb5 in OSDMonitor::create_pending() () #8 0x000000000058cca9 in PaxosService::_active() () #9 0x000000000056063a in Context::complete(int) () #10 0x000000000056424d in finish_contexts(CephContext*, std::list<Context*, std::allocator<Context*> >&, int) () #11 0x000000000058461e in Paxos::handle_last(MMonPaxos*) () #12 0x000000000058579b in Paxos::dispatch(PaxosServiceMessage*) () #13 0x000000000055c9ca in Monitor::_ms_dispatch(Message*) () #14 0x0000000000578502 in Monitor::ms_dispatch(Message*) () #15 0x00000000007bc552 in DispatchQueue::entry() () #16 0x000000000073268d in DispatchQueue::DispatchThread::entry() () #17 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #18 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 24 (Thread 0x7f87ef54a700 (LWP 1166)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007db3f9 in Accepter::entry() () #2 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #3 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 23 (Thread 0x7f87eea48700 (LWP 1168)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 22 (Thread 0x7f87ee947700 (LWP 1169)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x0000000000646726 in SignalHandler::entry() () #2 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #3 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 21 (Thread 0x7f87eeb49700 (LWP 1173)): #0 0x00000037a980b7bb in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007c7147 in Pipe::fault(bool) () #2 0x00000000007c97a2 in Pipe::connect() () #3 0x00000000007ccf63 in Pipe::writer() () #4 0x00000000007da52d in Pipe::Writer::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 20 (Thread 0x7f87edc33700 (LWP 1195)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 19 (Thread 0x7f87edb32700 (LWP 1196)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 18 (Thread 0x7f87eda31700 (LWP 1197)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 17 (Thread 0x7f87ed930700 (LWP 1198)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 16 (Thread 0x7f87edd34700 (LWP 1205)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 15 (Thread 0x7f87ede35700 (LWP 1211)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 14 (Thread 0x7f87ed82f700 (LWP 1212)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 13 (Thread 0x7f87ed72e700 (LWP 1213)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 12 (Thread 0x7f87ed62d700 (LWP 1214)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 11 (Thread 0x7f87ed52c700 (LWP 1215)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 10 (Thread 0x7f87ed42b700 (LWP 1216)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 9 (Thread 0x7f87ed32a700 (LWP 1217)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 8 (Thread 0x7f87ed229700 (LWP 1218)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 7 (Thread 0x7f87ecf26700 (LWP 1221)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 6 (Thread 0x7f87ece25700 (LWP 1222)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 5 (Thread 0x7f87ecd24700 (LWP 1223)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 4 (Thread 0x7f87ecc23700 (LWP 1224)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 3 (Thread 0x7f87ecb22700 (LWP 1225)): #0 0x00000037a94df253 in poll () from /lib64/libc.so.6 #1 0x00000000007c08f9 in Pipe::tcp_read_wait() () #2 0x00000000007c8755 in Pipe::tcp_read(char*, int) () #3 0x00000000007d757f in Pipe::reader() () #4 0x00000000007da54d in Pipe::Reader::entry() () #5 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #6 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 2 (Thread 0x7f87eca21700 (LWP 1226)): #0 0x00000037a980b43c in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0 #1 0x00000000007ccc89 in Pipe::writer() () #2 0x00000000007da52d in Pipe::Writer::entry() () #3 0x00000037a9807851 in start_thread () from /lib64/libpthread.so.0 #4 0x00000037a94e890d in clone () from /lib64/libc.so.6 Thread 1 (Thread 0x7f87f761a7a0 (LWP 1159)): #0 0x00000037a98080ad in pthread_join () from /lib64/libpthread.so.0 #1 0x0000000000735352 in Thread::join(void**) () #2 0x000000000072f8ba in SimpleMessenger::wait() () #3 0x00000000005202fa in main () On Jan 23, 2014, at 10:32 PM, Sage Weil <sage@xxxxxxxxxxx> wrote: On Thu, 23 Jan 2014, Guang wrote:Hi Joao, |
_______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com