Hi,

We added 2 new JBODs (each with 60 disks, ~1 PB) and wanted to extend our 3 PB cluster to 5 PB. We tried several approaches:

- Add the JBODs with weight 0 and reweight them slowly in 0.01 steps - this turned out to be very slow (a rough sketch of the loop is at the end of this mail, after the crash report).
- Change their weight straight to the maximum, at which point the cluster started to recover/rebalance (with ~20% of objects misplaced).

All went smoothly for a few hours, but now some random OSDs crash with error messages similar to the one below [1].

Is this known? Or am I pushing it too much? What might be the reason for this crash?

Let me know if you need more details.

Thanks

[1]

{
    "assert_condition": "abort",
    "assert_file": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.13/rpm/el8/BUILD/ceph-15.2.13/src/osd/PeeringState.cc",
    "assert_func": "PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine>::my_context)",
    "assert_line": 4243,
    "assert_msg": "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.13/rpm/el8/BUILD/ceph-15.2.13/src/osd/PeeringState.cc: In function 'PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine>::my_context)' thread 7f7cd0dec700 time 2021-07-13T15:50:44.086598-0700\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.13/rpm/el8/BUILD/ceph-15.2.13/src/osd/PeeringState.cc: 4243: ceph_abort_msg(\"we got a bad state machine event\")\n",
    "assert_thread_name": "tp_osd_tp",
    "backtrace": [
        "(()+0x12b20) [0x7f7cf1474b20]",
        "(gsignal()+0x10f) [0x7f7cf00db7ff]",
        "(abort()+0x127) [0x7f7cf00c5c35]",
        "(ceph::__ceph_abort(char const*, int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x1b6) [0x55981fd083e9]",
        "(PeeringState::Crashed::Crashed(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::my_context)+0xc4) [0x559820082714]",
        "(boost::statechart::state<PeeringState::Crashed, PeeringState::PeeringMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::deep_construct(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>* const&, boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>&)+0x3a) [0x5598200b812a]",
        "(boost::statechart::simple_state<PeeringState::Primary, PeeringState::Started, PeeringState::Peering, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x21a) [0x5598200b8eda]",
        "(boost::statechart::simple_state<PeeringState::Backfilling, PeeringState::Active, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0xc0) [0x5598200b2ac0]",
        "(boost::statechart::state_machine<PeeringState::PeeringMachine, PeeringState::Initial, std::allocator<boost::statechart::none>, boost::statechart::null_exception_translator>::process_event(boost::statechart::event_base const&)+0x5b) [0x55981feac2ab]",
        "(PG::do_peering_event(std::shared_ptr<PGPeeringEvent>, PeeringCtx&)+0x2d1) [0x55981fe9e8a1]",
        "(OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0x29c) [0x55981fe15c7c]",
        "(ceph::osd::scheduler::PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x56) [0x559820047906]",
        "(OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x12ef) [0x55981fe0892f]",
        "(ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x559820448f84]",
        "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55982044bbe4]",
        "(()+0x814a) [0x7f7cf146a14a]",
        "(clone()+0x43) [0x7f7cf01a0f23]"
    ],
    "ceph_version": "15.2.13",
    "crash_id": "2021-07-13T22:50:44.104753Z_40759660-ecd8-420d-8c00-bac2c1c46760",
    "entity_name": "osd.288",
    "os_id": "centos",
    "os_name": "CentOS Linux",
    "os_version": "8",
    "os_version_id": "8",
    "process_name": "ceph-osd",
    "stack_sig": "c31b62beceef1c1fcd1b8ea9db822deef8e9680a5c65bbb76a4c8ef8b2a0f3a1",
    "timestamp": "2021-07-13T22:50:44.104753Z",
    "utsname_hostname": "data-16-1.tier2",
    "utsname_machine": "x86_64",
    "utsname_release": "5.11.16-1.el8.elrepo.x86_64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Mon Apr 19 19:16:48 EDT 2021"
}
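
For reference, the gradual reweight we attempted was essentially a loop like the one below. This is a minimal sketch only: the OSD id range, the target CRUSH weight, and the health check are illustrative placeholders, not our exact script.

#!/bin/bash
# Sketch of the gradual approach: raise the CRUSH weight of the new
# OSDs in 0.01 increments and let the cluster settle between steps.
TARGET=16.4      # placeholder: full CRUSH weight of one new disk
STEP=0.01
for w in $(seq $STEP $STEP $TARGET); do
    for id in $(seq 240 359); do    # placeholder ids for the 120 new OSDs
        ceph osd crush reweight "osd.$id" "$w"
    done
    # wait for rebalancing from this step to finish before the next one
    until ceph health | grep -q HEALTH_OK; do
        sleep 60
    done
done

With a full weight in the double digits this means well over a thousand reweight rounds, each followed by a rebalance wait, which is why it felt impractically slow and we switched to setting the full weight in one go.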