Hi, It worked out :-) Once again many thanks ! On Fri, May 4, 2012 at 10:35 PM, Samuel Just <sam.just@xxxxxxxxxxxxx> wrote: > I have pushed another patch to wip-snap-workaround to bypass that > crash. Let me know about any further problems! > -Sam > > On Thu, May 3, 2012 at 4:29 PM, Tomasz Paszkowski <ss7pro@xxxxxxxxx> wrote: >> Hi, >> >> Now it dies with stack trace as below. If i understand correctly I >> need to implement the same trick with intersection before subtract as >> you have made ? >> >> >> >> 0x00007ffff5ebb445 in raise () from /lib/x86_64-linux-gnu/libc.so.6 >> (gdb) bt >> #0 0x00007ffff5ebb445 in raise () from /lib/x86_64-linux-gnu/libc.so.6 >> #1 0x00007ffff5ebebab in abort () from /lib/x86_64-linux-gnu/libc.so.6 >> #2 0x00007ffff680969d in __gnu_cxx::__verbose_terminate_handler() () >> from /usr/lib/x86_64-linux-gnu/libstdc++.so.6 >> #3 0x00007ffff6807846 in ?? () from /usr/lib/x86_64-linux-gnu/libstdc++.so.6 >> #4 0x00007ffff6807873 in std::terminate() () >> from /usr/lib/x86_64-linux-gnu/libstdc++.so.6 >> #5 0x00007ffff680796e in __cxa_throw () >> from /usr/lib/x86_64-linux-gnu/libstdc++.so.6 >> #6 0x0000000000695eb0 in ceph::__ceph_assert_fail ( >> assertion=0x80b1cd "_size >= 0", file=0x80a202 "./include/interval_set.h", >> line=382, >> func=0x81c020 "void interval_set<T>::erase(T, T) [with T = snapid_t]") >> at common/assert.cc:75 >> #7 0x00000000005ebe03 in erase (len=..., start=..., this=0x1a14d30) >> at ./include/interval_set.h:382 >> #8 interval_set<snapid_t>::subtract (this=0x1a14d30, a=...) >> at ./include/interval_set.h:404 >> #9 0x0000000000730530 in PG::activate (this=0x1a14800, t=..., tfin=..., >> query_map=..., activator_map=0x7fffea4527a0) at osd/PG.cc:1239 >> #10 0x00000000007324a2 in PG::RecoveryState::Active::Active (this=0x3048280, >> ctx=...) 
at osd/PG.cc:4016 >> #11 0x000000000074799c in >> boost::statechart::state<PG::RecoveryState::Active, PG---Type <return> >> to continue, or q <return> to quit--- >> ::RecoveryState::Primary, boost::mpl::list<mpl_::na, mpl_::na, >> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, >> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, >> mpl_::na, mpl_::na, mpl_::na, mpl_::na>, >> (boost::statechart::history_mode)0>::shallow_construct (pContext=..., >> outermostContextBase=...) at /usr/include/boost/statechart/state.hpp:89 >> #12 0x0000000000747bd2 in deep_construct (outermostContextBase=..., >> pContext=...) at /usr/include/boost/statechart/state.hpp:79 >> #13 construct (outermostContextBase=..., pContext=...) >> at /usr/include/boost/statechart/detail/constructor.hpp:93 >> #14 boost::statechart::simple_state<PG::RecoveryState::Peering, >> PG::RecoveryState::Primary, PG::RecoveryState::GetInfo, >> (boost::statechart::history_mode)0>::transit_impl<PG::RecoveryState::Active, >> PG::RecoveryState::RecoveryMachine, >> boost::statechart::detail::no_transition_function> (this=0x3048280, >> transitionAction=...) at /usr/include/boost/statechart/simple_state.hpp:798 >> #15 0x0000000000747e52 in transit<PG::RecoveryState::Active> (this=0x3048280) >> at /usr/include/boost/statechart/simple_state.hpp:314 >> #16 react_without_action (stt=...) >> at /usr/include/boost/statechart/transition.hpp:38 >> #17 react (stt=...) 
>> at /usr/include/boost/statechart/detail/reaction_dispatcher.hpp:47 >> #18 react (stt=..., evt=..., eventType=<optimized out>) >> at /usr/include/boost/statechart/detail/reaction_dispatcher.hpp:92 >> #19 react (stt=..., evt=..., eventType=<optimized out>) >> ---Type <return> to continue, or q <return> to quit--- >> at /usr/include/boost/statechart/detail/reaction_dispatcher.hpp:109 >> #20 react<PG::RecoveryState::Peering, boost::statechart::event_base, >> void const*> (stt=..., evt=..., eventType=<optimized out>) >> at /usr/include/boost/statechart/transition.hpp:59 >> #21 local_react_impl<boost::mpl::list2<boost::statechart::transition<PG::RecoveryState::Activate, >> PG::RecoveryState::Active>, >> boost::statechart::custom_reaction<PG::RecoveryState::AdvMap> >, >> boost::statechart::simple_state<PG::RecoveryState::Peering, >> PG::RecoveryState::Primary, PG::RecoveryState::GetInfo, >> (boost::statechart::history_mode)0> > (eventType=0xb17c20, evt=..., >> stt=...) >> at /usr/include/boost/statechart/simple_state.hpp:816 >> #22 local_react<boost::mpl::list2<boost::statechart::transition<PG::RecoveryState::Activate, >> PG::RecoveryState::Active>, >> boost::statechart::custom_reaction<PG::RecoveryState::AdvMap> > > >> (this=0x3048280, eventType=0xb17c20, evt=...) >> at /usr/include/boost/statechart/simple_state.hpp:851 >> #23 local_react_impl<boost::mpl::list<boost::statechart::custom_reaction<PG::RecoveryState::QueryState>, >> boost::statechart::transition<PG::RecoveryState::Activate, >> PG::RecoveryState::Active>, >> boost::statechart::custom_reaction<PG::RecoveryState::AdvMap> >, >> boost::statechart::simple_state<PG::RecoveryState::Peering, >> PG::RecoveryState::Primary, PG::RecoveryState::GetInfo, >> (boost::statechart::history_mode)0> > (eventType=0xb17c20, evt=..., >> stt=...) 
>> at /usr/include/boost/statechart/simple_state.hpp:820 >> #24 local_react<boost::mpl::list<boost::statechart::custom_reaction<PG::RecoveryState::QueryState>, >> boost::statechart::transition<PG::RecoveryState::Activate, P---Type >> <return> to continue, or q <return> to quit--- >> G::RecoveryState::Active>, >> boost::statechart::custom_reaction<PG::RecoveryState::AdvMap> > > >> (eventType=0xb17c20, evt=..., this=0x3048280) >> at /usr/include/boost/statechart/simple_state.hpp:851 >> #25 boost::statechart::simple_state<PG::RecoveryState::Peering, >> PG::RecoveryState::Primary, PG::RecoveryState::GetInfo, >> (boost::statechart::history_mode)0>::react_impl (this=0x3048280, >> evt=..., eventType=0xb17c20) >> at /usr/include/boost/statechart/simple_state.hpp:489 >> >> #26 0x0000000000745c9b in react_impl (eventType=0xb17c20, evt=..., >> this=0x3578640) at /usr/include/boost/statechart/simple_state.hpp:498 >> #27 boost::statechart::simple_state<PG::RecoveryState::WaitUpThru, >> PG::RecoveryState::Peering, boost::mpl::list<mpl_::na, mpl_::na, >> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, >> mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, >> mpl_::na, mpl_::na, mpl_::na, mpl_::na>, >> (boost::statechart::history_mode)0>::react_impl (this=0x3578640, >> evt=..., >> eventType=0xb17c20) at /usr/include/boost/statechart/simple_state.hpp:482 >> #28 0x000000000057be6b in operator() (this=<synthetic pointer>) >> at /usr/include/boost/statechart/state_machine.hpp:87 >> #29 operator()<boost::statechart::detail::send_function<boost::statechart::detail::state_base<std::allocator<void>, >> boost::statechart::detail::rtti_policy>, >> boost::statechart::event_base, const void*>, >> boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, >> PG::RecoveryState::Initial>::exception_event_handler> (action=..., >> this=<optimized out>) >> at /usr/include/boost/statechart/null_exception_translator.hpp:33 >> ---Type <return> to 
continue, or q <return> to quit--- >> #30 send_event (evt=..., this=0x1a15600) >> at /usr/include/boost/statechart/state_machine.hpp:885 >> #31 boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, >> PG::RecoveryState::Initial, std::allocator<void>, >> boost::statechart::null_exception_translator>::process_queued_events >> (this=0x1a15600) >> at /usr/include/boost/statechart/state_machine.hpp:910 >> #32 0x000000000073e57e in >> boost::statechart::state_machine<PG::RecoveryState::RecoveryMachine, >> PG::RecoveryState::Initial, std::allocator<void>, >> boost::statechart::null_exception_translator>::process_event >> (this=0x1a15600, evt=...) >> at /usr/include/boost/statechart/state_machine.hpp:280 >> #33 0x0000000000716fc5 in PG::RecoveryState::handle_activate_map ( >> this=0x1a15600, rctx=0x7fffea4527d0) at osd/PG.cc:4919 >> #34 0x00000000005bb044 in handle_activate_map (rctx=0x7fffea4527d0, >> this=0x1a14800) at ./osd/PG.h:1413 >> #35 OSD::activate_map (this=0xbc3000, t=..., tfin=...) at osd/OSD.cc:3615 >> #36 0x00000000005d3c9e in OSD::handle_osd_map (this=0xbc3000, m=0x3009e00) >> at osd/OSD.cc:3286 >> #37 0x00000000005d4c6b in OSD::_dispatch (this=0xbc3000, m=0x3009e00) >> at osd/OSD.cc:2780 >> #38 0x00000000005d5275 in OSD::ms_dispatch (this=0xbc3000, m=0x3009e00) >> at osd/OSD.cc:2605 >> #39 0x000000000067a90b in ms_deliver_dispatch (m=0x3009e00, this=0xba8680) >> at msg/Messenger.h:178 >> ---Type <return> to continue, or q <return> to quit--- >> #40 SimpleMessenger::dispatch_entry (this=0xba8680) >> at msg/SimpleMessenger.cc:363 >> #41 0x0000000000648f0d in SimpleMessenger::DispatchThread::entry ( >> this=<optimized out>) at msg/SimpleMessenger.h:560 >> #42 0x00007ffff79c2e9a in start_thread () >> from /lib/x86_64-linux-gnu/libpthread.so.0 >> #43 0x00007ffff5f774bd in clone () from /lib/x86_64-linux-gnu/libc.so.6 >> #44 0x0000000000000000 in ?? 
() >> >> >> On Mon, Apr 30, 2012 at 6:53 PM, Samuel Just <sam.just@xxxxxxxxxxxxx> wrote: >>> I apologise for the delay in getting back to you. I just pushed a >>> branch called wip-snap-workaround based on v0.45. It should at least >>> avoid the crash you saw. Let me know if you hit further trouble. >>> -Sam >>> >>> On Thu, Apr 26, 2012 at 8:12 AM, Tomasz Paszkowski <ss7pro@xxxxxxxxx> wrote: >>>> Hi, >>>> >>>> Does anyone have any idea how to fix this? Can I just correct the conflicting >>>> data in the osdmaps? >>>> >>>> >>>> On Wed, Apr 25, 2012 at 3:42 PM, Tomasz Paszkowski <ss7pro@xxxxxxxxx> wrote: >>>>> After removing a pool snapshot I was trying to make a self-managed >>>>> snapshot, and after reading the source, this was the root cause of this >>>>> problem. >>>>> >>>>> >>>>> On Wed, Apr 25, 2012 at 1:24 PM, Tomasz Paszkowski <ss7pro@xxxxxxxxx> wrote: >>>>>> After upgrading to v0.45, the stack trace is as follows: >>>>>> >>>>>> Program received signal SIGABRT, Aborted. >>>>>> [Switching to Thread 0x7fffeac55700 (LWP 11011)] >>>>>> 0x00007ffff5ebb445 in raise () from /lib/x86_64-linux-gnu/libc.so.6 >>>>>> (gdb) bt >>>>>> #0 0x00007ffff5ebb445 in raise () from /lib/x86_64-linux-gnu/libc.so.6 >>>>>> #1 0x00007ffff5ebebab in abort () from /lib/x86_64-linux-gnu/libc.so.6 >>>>>> #2 0x00007ffff680969d in __gnu_cxx::__verbose_terminate_handler() () >>>>>> from /usr/lib/x86_64-linux-gnu/libstdc++.so.6 >>>>>> #3 0x00007ffff6807846 in ?? 
() from /usr/lib/x86_64-linux-gnu/libstdc++.so.6 >>>>>> #4 0x00007ffff6807873 in std::terminate() () >>>>>> from /usr/lib/x86_64-linux-gnu/libstdc++.so.6 >>>>>> #5 0x00007ffff680796e in __cxa_throw () >>>>>> from /usr/lib/x86_64-linux-gnu/libstdc++.so.6 >>>>>> #6 0x0000000000695ec0 in ceph::__ceph_assert_fail ( >>>>>> assertion=0x80b1ed "_size >= 0", file=0x80a222 "./include/interval_set.h", >>>>>> line=382, >>>>>> func=0x81bf60 "void interval_set<T>::erase(T, T) [with T = snapid_t]") >>>>>> at common/assert.cc:75 >>>>>> #7 0x00000000005d1359 in erase (len=..., start=..., this=0xbe5738) >>>>>> at ./include/interval_set.h:382 >>>>>> #8 subtract (a=..., this=0xbe5738) at ./include/interval_set.h:404 >>>>>> #9 OSD::advance_map (this=0xbca000, t=..., tfin=<optimized out>) >>>>>> at osd/OSD.cc:3475 >>>>>> #10 0x00000000005d33bf in OSD::handle_osd_map (this=0xbca000, m=0x2a4c800) >>>>>> at osd/OSD.cc:3272 >>>>>> #11 0x00000000005d4c9b in OSD::_dispatch (this=0xbca000, m=0x2a4c800) >>>>>> at osd/OSD.cc:2780 >>>>>> ---Type <return> to continue, or q <return> to quit--- >>>>>> #12 0x00000000005d52a5 in OSD::ms_dispatch (this=0xbca000, m=0x2a4c800) >>>>>> at osd/OSD.cc:2605 >>>>>> #13 0x000000000067a91b in ms_deliver_dispatch (m=0x2a4c800, this=0xba8680) >>>>>> at msg/Messenger.h:178 >>>>>> #14 SimpleMessenger::dispatch_entry (this=0xba8680) >>>>>> at msg/SimpleMessenger.cc:363 >>>>>> #15 0x0000000000648f1d in SimpleMessenger::DispatchThread::entry ( >>>>>> this=<optimized out>) at msg/SimpleMessenger.h:560 >>>>>> #16 0x00007ffff79c2e9a in start_thread () >>>>>> from /lib/x86_64-linux-gnu/libpthread.so.0 >>>>>> #17 0x00007ffff5f774bd in clone () from /lib/x86_64-linux-gnu/libc.so.6 >>>>>> #18 0x0000000000000000 in ?? 
() >>>>>> >>>>>> >>>>>> On Wed, Apr 25, 2012 at 12:11 PM, Tomasz Paszkowski <ss7pro@xxxxxxxxx> wrote: >>>>>>> osd dump is like this: >>>>>>> >>>>>>> pool 0 'data' rep size 2 crush_ruleset 0 object_hash rjenkins pg_num >>>>>>> 768 pgp_num 768 lpg_num 2 lpgp_num 2 last_change 1 owner 0 >>>>>>> crash_replay_interval 45 >>>>>>> pool 1 'metadata' rep size 2 crush_ruleset 1 object_hash rjenkins >>>>>>> pg_num 768 pgp_num 768 lpg_num 2 lpgp_num 2 last_change 1 owner 0 >>>>>>> pool 2 'rbd' rep size 2 crush_ruleset 2 object_hash rjenkins pg_num >>>>>>> 768 pgp_num 768 lpg_num 2 lpgp_num 2 last_change 1 owner 0 >>>>>>> pool 9 'nova' rep size 2 crush_ruleset 0 object_hash rjenkins pg_num >>>>>>> 2568 pgp_num 2568 lpg_num 0 lpgp_num 0 last_change 1435 owner >>>>>>> 18446744073709551615 >>>>>>> removed_snaps [1~1] >>>>>>> pool 10 'glance' rep size 2 crush_ruleset 0 object_hash rjenkins >>>>>>> pg_num 2568 pgp_num 2568 lpg_num 0 lpgp_num 0 last_change 132 owner >>>>>>> 18446744073709551615 >>>>>>> >>>>>>> >>>>>>> On Wed, Apr 25, 2012 at 11:04 AM, Tomasz Paszkowski <ss7pro@xxxxxxxxx> wrote: >>>>>>>> Hi, >>>>>>>> >>>>>>>> After making and removing snapshot from one of the pools, all of the >>>>>>>> osd in cluster are dying with log like below: >>>>>>>> >>>>>>>> >>>>>>>> 2012-04-25 11:01:00.938313 7f66694b9700 osd.1 1434 removing old >>>>>>>> osdmap epoch 966 >>>>>>>> 2012-04-25 11:01:00.938330 7f66694b9700 osd.1 1434 removing old >>>>>>>> osdmap epoch 967 >>>>>>>> 2012-04-25 11:01:00.938348 7f66694b9700 osd.1 1434 advance to epoch >>>>>>>> 1435 (<= newest 1470) >>>>>>>> 2012-04-25 11:01:00.939437 7f66694b9700 osd.1 1435 advance_map epoch >>>>>>>> 1435 1325 pgs >>>>>>>> 2012-04-25 11:01:00.939455 7f66694b9700 osd.1 1435 pool 0 removed >>>>>>>> snaps [], unchanged (snap_epoch = 0) >>>>>>>> 2012-04-25 11:01:00.939469 7f66694b9700 osd.1 1435 pool 1 removed >>>>>>>> snaps [], unchanged (snap_epoch = 0) >>>>>>>> 2012-04-25 11:01:00.939482 7f66694b9700 osd.1 1435 pool 2 removed >>>>>>>> 
snaps [], unchanged (snap_epoch = 0) >>>>>>>> ./include/interval_set.h: In function 'void interval_set<T>::erase(T, >>>>>>>> T) [with T = snapid_t]' thread 7f66694b9700 time 2012-04-25 >>>>>>>> 11:01:00.939509 >>>>>>>> ./include/interval_set.h: 382: FAILED assert(_size >= 0) >>>>>>>> ceph version 0.44.1 (commit:c89b7f22c8599eb974e75a2f7a5f855358199dee) >>>>>>>> 1: (OSD::advance_map(ObjectStore::Transaction&, C_Contexts*)+0x2971) [0x5cfb51] >>>>>>>> 2: (OSD::handle_osd_map(MOSDMap*)+0x193c) [0x5d162c] >>>>>>>> 3: (OSD::_dispatch(Message*)+0x2eb) [0x5d34fb] >>>>>>>> 4: (OSD::ms_dispatch(Message*)+0x129) [0x5d3a59] >>>>>>>> 5: (SimpleMessenger::dispatch_entry()+0x78b) [0x67513b] >>>>>>>> 6: (SimpleMessenger::DispatchThread::entry()+0xd) [0x52124d] >>>>>>>> 7: (()+0x7e9a) [0x7f6676226e9a] >>>>>>>> 8: (clone()+0x6d) [0x7f66747db4bd] >>>>>>>> ceph version 0.44.1 (commit:c89b7f22c8599eb974e75a2f7a5f855358199dee) >>>>>>>> 1: (OSD::advance_map(ObjectStore::Transaction&, C_Contexts*)+0x2971) [0x5cfb51] >>>>>>>> 2: (OSD::handle_osd_map(MOSDMap*)+0x193c) [0x5d162c] >>>>>>>> 3: (OSD::_dispatch(Message*)+0x2eb) [0x5d34fb] >>>>>>>> 4: (OSD::ms_dispatch(Message*)+0x129) [0x5d3a59] >>>>>>>> 5: (SimpleMessenger::dispatch_entry()+0x78b) [0x67513b] >>>>>>>> 6: (SimpleMessenger::DispatchThread::entry()+0xd) [0x52124d] >>>>>>>> 7: (()+0x7e9a) [0x7f6676226e9a] >>>>>>>> 8: (clone()+0x6d) [0x7f66747db4bd] >>>>>>>> *** Caught signal (Aborted) ** >>>>>>>> in thread 7f66694b9700 >>>>>>>> ceph version 0.44.1 (commit:c89b7f22c8599eb974e75a2f7a5f855358199dee) >>>>>>>> 1: /usr/bin/ceph-osd() [0x6fa0c6] >>>>>>>> 2: (()+0xfcb0) [0x7f667622ecb0] >>>>>>>> 3: (gsignal()+0x35) [0x7f667471f445] >>>>>>>> 4: (abort()+0x17b) [0x7f6674722bab] >>>>>>>> 5: (__gnu_cxx::__verbose_terminate_handler()+0x11d) [0x7f667506d69d] >>>>>>>> 6: (()+0xb5846) [0x7f667506b846] >>>>>>>> 7: (()+0xb5873) [0x7f667506b873] >>>>>>>> 8: (()+0xb596e) [0x7f667506b96e] >>>>>>>> 9: (ceph::__ceph_assert_fail(char const*, char const*, 
int, char >>>>>>>> const*)+0x200) [0x68f420] >>>>>>>> 10: (OSD::advance_map(ObjectStore::Transaction&, C_Contexts*)+0x2971) >>>>>>>> [0x5cfb51] >>>>>>>> 11: (OSD::handle_osd_map(MOSDMap*)+0x193c) [0x5d162c] >>>>>>>> 12: (OSD::_dispatch(Message*)+0x2eb) [0x5d34fb] >>>>>>>> 13: (OSD::ms_dispatch(Message*)+0x129) [0x5d3a59] >>>>>>>> 14: (SimpleMessenger::dispatch_entry()+0x78b) [0x67513b] >>>>>>>> 15: (SimpleMessenger::DispatchThread::entry()+0xd) [0x52124d] >>>>>>>> 16: (()+0x7e9a) [0x7f6676226e9a] >>>>>>>> 17: (clone()+0x6d) [0x7f66747db4bd] >>>>>>>> >>>>>>>> >>>>>>>> -- >>>>>>>> Tomasz Paszkowski >>>>>>>> SS7, Asterisk, SAN, Datacenter, Cloud Computing >>>>>>>> +48500166299 >>>>>>> >>>>>>> >>>>>>> >>>>>>> -- >>>>>>> Tomasz Paszkowski >>>>>>> SS7, Asterisk, SAN, Datacenter, Cloud Computing >>>>>>> +48500166299 >>>>>> >>>>>> >>>>>> >>>>>> -- >>>>>> Tomasz Paszkowski >>>>>> SS7, Asterisk, SAN, Datacenter, Cloud Computing >>>>>> +48500166299 >>>>> >>>>> >>>>> >>>>> -- >>>>> Tomasz Paszkowski >>>>> SS7, Asterisk, SAN, Datacenter, Cloud Computing >>>>> +48500166299 >>>> >>>> >>>> >>>> -- >>>> Tomasz Paszkowski >>>> SS7, Asterisk, SAN, Datacenter, Cloud Computing >>>> +48500166299 >>>> -- >>>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in >>>> the body of a message to majordomo@xxxxxxxxxxxxxxx >>>> More majordomo info at http://vger.kernel.org/majordomo-info.html >> >> >> >> -- >> Tomasz Paszkowski >> SS7, Asterisk, SAN, Datacenter, Cloud Computing >> +48500166299 >> -- >> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in >> the body of a message to majordomo@xxxxxxxxxxxxxxx >> More majordomo info at http://vger.kernel.org/majordomo-info.html -- Tomasz Paszkowski SS7, Asterisk, SAN, Datacenter, Cloud Computing +48500166299 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html