Hi,

My OSD processes keep crashing randomly and I don't know why. It seems to happen when the cluster is trying to re-balance; under normal usage I haven't noticed any crash like this. We're running Ceph 0.61.7 on an up-to-date Ubuntu 12.04 (all packages, including the kernel, are current). Does anyone have an idea? The backtrace and the dump of recent events are below, followed by a couple of commands I could run to gather more detail if that would help.

TRACE: ceph version 0.61.7 (8f010aff684e820ecc837c25ac77c7a05d7191ff)
 1: /usr/bin/ceph-osd() [0x79219a]
 2: (()+0xfcb0) [0x7fd692da1cb0]
 3: (gsignal()+0x35) [0x7fd69155a425]
 4: (abort()+0x17b) [0x7fd69155db8b]
 5: (__gnu_cxx::__verbose_terminate_handler()+0x11d) [0x7fd691eac69d]
 6: (()+0xb5846) [0x7fd691eaa846]
 7: (()+0xb5873) [0x7fd691eaa873]
 8: (()+0xb596e) [0x7fd691eaa96e]
 9: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1df) [0x84303f]
 10: (PG::RecoveryState::Recovered::Recovered(boost::statechart::state<PG::RecoveryState::Recovered, PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::my_context)+0x38f) [0x6d932f]
 11: (boost::statechart::state<PG::RecoveryState::Recovered, PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::shallow_construct(boost::intrusive_ptr<PG::RecoveryState::Active> const&, boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>&)+0x5c) [0x6f270c]
 12: (PG::RecoveryState::Recovering::react(PG::AllReplicasRecovered const&)+0xb4) [0x6d9454]
 13: (boost::statechart::simple_state<PG::RecoveryState::Recovering, PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0xda) [0x6f296a]
 14: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::send_event(boost::statechart::event_base const&)+0x5b) [0x6e320b]
 15: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x11) [0x6e34e1]
 16: (PG::handle_peering_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, PG::RecoveryCtx*)+0x347) [0x69aaf7]
 17: (OSD::process_peering_events(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x2f5) [0x632fc5]
 18: (OSD::PeeringWQ::_process(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x12) [0x66e2d2]
 19: (ThreadPool::worker(ThreadPool::WorkThread*)+0x4e6) [0x838476]
 20: (ThreadPool::WorkThread::entry()+0x10) [0x83a2a0]
 21: (()+0x7e9a) [0x7fd692d99e9a]
 22: (clone()+0x6d) [0x7fd691617ccd]
 NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
--- begin dump of recent events ---
    -3> 2013-08-12 15:58:15.561005 7fd683d78700  1 -- 10.136.48.18:6814/21240 <== osd.56 10.136.48.14:0/17437 44 ==== osd_ping(ping e8959 stamp 2013-08-12 15:58:15.556022) v2 ==== 47+0+0 (355096560 0 0) 0xc4e81c0 con 0x12fbeb00
    -2> 2013-08-12 15:58:15.561038 7fd683d78700  1 -- 10.136.48.18:6814/21240 --> 10.136.48.14:0/17437 -- osd_ping(ping_reply e8959 stamp 2013-08-12 15:58:15.556022) v2 -- ?+0 0x1683ec40 con 0x12fbeb00
    -1> 2013-08-12 15:58:15.568600 7fd67e56d700  1 -- 10.136.48.18:6813/21240 --> osd.44 10.136.48.15:6820/25671 -- osd_sub_op(osd.20.0:1293 25.328 699ac328/rbd_data.ae2732ae8944a.0000000000240828/head//25 [push] v 8424'11 snapset=0=[]:[] snapc=0=[]) v7 -- ?+0 0x2df0f400
     0> 2013-08-12 15:58:15.581608 7fd681d74700 -1 *** Caught signal (Aborted) **
 in thread 7fd681d74700

 ceph version 0.61.7 (8f010aff684e820ecc837c25ac77c7a05d7191ff)
 1: /usr/bin/ceph-osd() [0x79219a]
 2: (()+0xfcb0) [0x7fd692da1cb0]
 3: (gsignal()+0x35) [0x7fd69155a425]
 4: (abort()+0x17b) [0x7fd69155db8b]
 5: (__gnu_cxx::__verbose_terminate_handler()+0x11d) [0x7fd691eac69d]
 6: (()+0xb5846) [0x7fd691eaa846]
 7: (()+0xb5873) [0x7fd691eaa873]
 8: (()+0xb596e) [0x7fd691eaa96e]
 9: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1df) [0x84303f]
 10: (PG::RecoveryState::Recovered::Recovered(boost::statechart::state<PG::RecoveryState::Recovered, PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::my_context)+0x38f) [0x6d932f]
 11: (boost::statechart::state<PG::RecoveryState::Recovered, PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::shallow_construct(boost::intrusive_ptr<PG::RecoveryState::Active> const&, boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>&)+0x5c) [0x6f270c]
 12: (PG::RecoveryState::Recovering::react(PG::AllReplicasRecovered const&)+0xb4) [0x6d9454]
 13: (boost::statechart::simple_state<PG::RecoveryState::Recovering, PG::RecoveryState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0xda) [0x6f296a]
 14: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::send_event(boost::statechart::event_base const&)+0x5b) [0x6e320b]
 15: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x11) [0x6e34e1]
 16: (PG::handle_peering_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, PG::RecoveryCtx*)+0x347) [0x69aaf7]
 17: (OSD::process_peering_events(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x2f5) [0x632fc5]
 18: (OSD::PeeringWQ::_process(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x12) [0x66e2d2]
 19: (ThreadPool::worker(ThreadPool::WorkThread*)+0x4e6) [0x838476]
 20: (ThreadPool::WorkThread::entry()+0x10) [0x83a2a0]
 21: (()+0x7e9a) [0x7fd692d99e9a]
 22: (clone()+0x6d) [0x7fd691617ccd]
 NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
--- logging levels ---
   0/ 5 none
   0/ 1 lockdep
   0/ 1 context
   1/ 1 crush
   1/ 5 mds
   1/ 5 mds_balancer
   1/ 5 mds_locker
   1/ 5 mds_log
   1/ 5 mds_log_expire
   1/ 5 mds_migrator
   0/ 1 buffer
   0/ 1 timer
   0/ 1 filer
   0/ 1 striper
   0/ 1 objecter
   0/ 5 rados
   0/ 5 rbd
   0/ 5 journaler
   0/ 5 objectcacher
   0/ 5 client
   0/ 5 osd
   0/ 5 optracker
   0/ 5 objclass
   1/ 3 filestore
   1/ 3 journal
   0/ 5 ms
   1/ 5 mon
   0/10 monc
   0/ 5 paxos
   0/ 5 tp
   1/ 5 auth
   1/ 5 crypto
   1/ 1 finisher
   1/ 5 heartbeatmap
   1/ 5 perfcounter
   1/ 5 rgw
   1/ 5 hadoop
   1/ 5 javaclient
   1/ 5 asok
   1/ 1 throttle
  -2/-2 (syslog threshold)
  -1/-1 (stderr threshold)
  max_recent 10000
  max_new 1000
  log_file /var/log/ceph/ceph-osd.20.log
--- end dump of recent events ---
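In case more detail would help: as the NOTE in the trace says, an objdump of the binary is needed to interpret the addresses, and I can also raise the debug levels shown above before the next re-balance. I haven't done either yet; the output path and debug values below are just what I assume would be reasonable:

  # dump the ceph-osd binary so the addresses in the trace can be resolved
  # (the output file name is only an example)
  objdump -rdS /usr/bin/ceph-osd > /tmp/ceph-osd.objdump

  # temporarily bump logging on the crashing OSD (osd.20, per the log_file above);
  # I assume injectargs with these debug options is the right way on 0.61
  ceph tell osd.20 injectargs '--debug-osd 20 --debug-ms 1'

I can attach the resulting higher-level logs and the objdump output if that is useful.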