Hi All, I upgraded from emperor to firefly. Initial upgrade went smoothly and all placement groups were active+clean. Next I executed 'ceph osd crush tunables optimal' to upgrade CRUSH mapping. Now I keep having OSDs go down or have requests blocked for long periods of time. I start back up the down OSDs and recovery eventually stops, but with 100s of "incomplete" and "down+incomplete" pgs remaining. The ceph web page says "If you see this state [incomplete], report a bug, and try to start any failed OSDs that may contain the needed information." Well, all the OSDs are up, though some have blocked requests. Also, the logs of the OSDs which go down have this message: 2014-11-02 21:46:33.615829 7ffcf0421700 0 -- 192.168.164.192:6810/31314 >> 192.168.164.186:6804/20934 pipe(0x2faa0280 sd=261 :6810 s=2 pgs=919 cs=25 l=0 c=0x2ed022c0).fault with nothing to send, going to standby 2014-11-02 21:49:11.440142 7ffce4cf3700 0 -- 192.168.164.192:6810/31314 >> 192.168.164.186:6804/20934 pipe(0xe512a00 sd=249 :6810 s=0 pgs=0 cs=0 l=0 c=0x2a308b00).accept connect_seq 26 vs existing 25 state standby 2014-11-02 21:51:20.085676 7ffcf6e3e700 -1 osd/PG.cc: In function 'PG::RecoveryState::Crashed::Crashed(boost::statechart::state<PG::RecoveryState::Crashed, PG::RecoveryState::RecoveryMachine>::my_context)' thread 7ffcf6e3e700 time 2014-11-02 21:51:20.052242 osd/PG.cc: 5424: FAILED assert(0 == "we got a bad state machine event") ceph version 0.80.7 (6c0127fcb58008793d3c8b62d925bc91963672a3) 1: (PG::RecoveryState::Crashed::Crashed(boost::statechart::state<PG::RecoveryState::Crashed, PG::RecoveryState::RecoveryMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::my_context)+0x12f) [0x87c6ef] 2: /usr/bin/ceph-osd() [0x8aeae9] 3: (boost::statechart::detail::reaction_result 
boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>::local_react_impl_non_empty::local_react_impl<boost::mpl::list2<boost::statechart::custom_reaction<PG::IntervalFlush>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed, boost::statechart::detail::no_context<boost::statechart::event_base>, &boost::statechart::detail::no_context<boost::statechart::event_base>::no_function> >, boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0> >(boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>&, boost::statechart::event_base const&, void const*)+0xbf) [0x8dd3ff] 4: (boost::statechart::detail::reaction_result boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>::local_react_impl_non_empty::local_react_impl<boost::mpl::list3<boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::custom_reaction<PG::IntervalFlush>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed, boost::statechart::detail::no_context<boost::statechart::event_base>, &boost::statechart::detail::no_context<boost::statechart::event_base>::no_function> >, boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0> >(boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>&, boost::statechart::event_base const&, void const*)+0x57) [0x8dd4e7] 5: (boost::statechart::detail::reaction_result 
boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>::local_react_impl_non_empty::local_react_impl<boost::mpl::list5<boost::statechart::custom_reaction<PG::AdvMap>, boost::statechart::custom_reaction<PG::NullEvt>, boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::custom_reaction<PG::IntervalFlush>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed, boost::statechart::detail::no_context<boost::statechart::event_base>, &boost::statechart::detail::no_context<boost::statechart::event_base>::no_function> >, boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0> >(boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>&, boost::statechart::event_base const&, void const*)+0x57) [0x8dd637] 6: (boost::statechart::detail::reaction_result boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>::local_react_impl_non_empty::local_react_impl<boost::mpl::list<boost::statechart::custom_reaction<PG::QueryState>, boost::statechart::custom_reaction<PG::AdvMap>, boost::statechart::custom_reaction<PG::NullEvt>, boost::statechart::custom_reaction<PG::FlushedEvt>, boost::statechart::custom_reaction<PG::IntervalFlush>, boost::statechart::transition<boost::statechart::event_base, PG::RecoveryState::Crashed, boost::statechart::detail::no_context<boost::statechart::event_base>, &boost::statechart::detail::no_context<boost::statechart::event_base>::no_function>, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, 
boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0> >(boost::statechart::simple_state<PG::RecoveryState::Started, PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Start, (boost::statechart::history_mode)0>&, boost::statechart::event_base const&, void const*)+0x57) [0x8dd6e7] 7: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::send_event(boost::statechart::event_base const&)+0x5b) [0x8bcc1b] 8: (boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, PG::RecoveryState::Initial, std::allocator<void>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x19) [0x8bcca9] 9: (PG::RecoveryState::handle_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, PG::RecoveryCtx*)+0x31) [0x8bcd41] 10: (PG::handle_peering_event(std::tr1::shared_ptr<PG::CephPeeringEvt>, PG::RecoveryCtx*)+0x368) [0x872a08] 11: (OSD::process_peering_events(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x40c) [0x77619c] 12: (OSD::PeeringWQ::_process(std::list<PG*, std::allocator<PG*> > const&, ThreadPool::TPHandle&)+0x14) [0x7d31e4] 13: (ThreadPool::worker(ThreadPool::WorkThread*)+0x68a) [0xb8173a] 14: (ThreadPool::WorkThread::entry()+0x10) [0xb82980] 15: (()+0x6b50) [0x7ffd10f98b50] 16: (clone()+0x6d) [0x7ffd0fbbc7bd] NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this. --- begin dump of recent events --- Any ideas? Thanks, Chad. _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com