The same OSD crashed today:

     0> 2022-10-24T06:30:00.875+0000 7f0bbf3bc700 -1 *** Caught signal (Segmentation fault) **
 in thread 7f0bbf3bc700 thread_name:bstore_kv_final

 ceph version 16.2.10 (45fa1a083152e41a408d15505f594ec5f1b4fe17) pacific (stable)
 1: /lib64/libpthread.so.0(+0x12ce0) [0x7f0bcee65ce0]
 2: (BlueStore::Onode::put()+0x29f) [0x562c2ea293df]
 3: (std::_Rb_tree<boost::intrusive_ptr<BlueStore::Onode>, boost::intrusive_ptr<BlueStore::Onode>, std::_Identity<boost::intrusive_ptr<BlueStore::Onode> >, std::less<boost::intrusive_ptr<BlueStore::Onode> >, std::allocator<boost::intrusive_ptr<BlueStore::Onode> > >::_M_erase(std::_Rb_tree_node<boost::intrusive_ptr<BlueStore::Onode> >*)+0x31) [0x562c2eade4d1]
 4: (BlueStore::TransContext::~TransContext()+0x12f) [0x562c2eade7ff]
 5: (BlueStore::_txc_finish(BlueStore::TransContext*)+0x23e) [0x562c2ea8969e]
 6: (BlueStore::_txc_state_proc(BlueStore::TransContext*)+0x257) [0x562c2ea95d87]
 7: (BlueStore::_kv_finalize_thread()+0x54e) [0x562c2eaafdae]
 8: (BlueStore::KVFinalizeThread::entry()+0x11) [0x562c2eae3d71]
 9: /lib64/libpthread.so.0(+0x81ca) [0x7f0bcee5b1ca]
 10: clone()
 NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.

Tyler Stachecki <stachecki.tyler@xxxxxxxxx> wrote on Mon, Oct 24, 2022 at 11:24:

> On Sun, Oct 23, 2022 at 11:04 PM can zhu <zhucan.k8s@xxxxxxxxx> wrote:
> >
> > crash info:
> >
> > {
> >     "backtrace": [
> >         "/lib64/libpthread.so.0(+0x12ce0) [0x7f82e87cece0]",
> >         "(BlueStore::Onode::put()+0x1a3) [0x55bd21a422e3]",
> >         "(std::_Hashtable<ghobject_t, std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> >, mempool::pool_allocator<(mempool::pool_index_t)4, std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> > >, std::__detail::_Select1st, std::equal_to<ghobject_t>, std::hash<ghobject_t>, std::__detail::_Mod_range_hashing, std::__detail::_Default_ranged_hash, std::__detail::_Prime_rehash_policy, std::__detail::_Hashtable_traits<true, false, true> >::_M_erase(unsigned long, std::__detail::_Hash_node_base*, std::__detail::_Hash_node<std::pair<ghobject_t const, boost::intrusive_ptr<BlueStore::Onode> >, true>*)+0x68) [0x55bd21af7bc8]",
> >         "(BlueStore::OnodeSpace::_remove(ghobject_t const&)+0x29b) [0x55bd21a420eb]",
> >         "(LruOnodeCacheShard::_trim_to(unsigned long)+0xeb) [0x55bd21afce6b]",
> >         "(BlueStore::OnodeSpace::add(ghobject_t const&, boost::intrusive_ptr<BlueStore::Onode>&)+0x49d) [0x55bd21a42dfd]",
> >         "(BlueStore::Collection::get_onode(ghobject_t const&, bool, bool)+0x46a) [0x55bd21a86e1a]",
> >         "(BlueStore::_txc_add_transaction(BlueStore::TransContext*, ceph::os::Transaction*)+0x1124) [0x55bd21aab2e4]",
> >         "(BlueStore::queue_transactions(boost::intrusive_ptr<ObjectStore::CollectionImpl>&, std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<TrackedOp>, ThreadPool::TPHandle*)+0x316) [0x55bd21ac76d6]",
> >         "(non-virtual thunk to PrimaryLogPG::queue_transactions(std::vector<ceph::os::Transaction, std::allocator<ceph::os::Transaction> >&, boost::intrusive_ptr<OpRequest>)+0x58) [0x55bd2170b878]",
> >         "(ReplicatedBackend::do_repop(boost::intrusive_ptr<OpRequest>)+0xeb0) [0x55bd218fdff0]",
> >         "(ReplicatedBackend::_handle_message(boost::intrusive_ptr<OpRequest>)+0x267) [0x55bd2190e357]",
> >         "(PGBackend::handle_message(boost::intrusive_ptr<OpRequest>)+0x52) [0x55bd2173ed52]",
"(PrimaryLogPG::do_request(boost::intrusive_ptr<OpRequest>&, > > ThreadPool::TPHandle&)+0x5de) [0x55bd216e268e]", > > "(OSD::dequeue_op(boost::intrusive_ptr<PG>, > > boost::intrusive_ptr<OpRequest>, ThreadPool::TPHandle&)+0x309) > > [0x55bd21569fc9]", > > "(ceph::osd::scheduler::PGOpItem::run(OSD*, OSDShard*, > > boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x68) > [0x55bd217c8e78]", > > "(OSD::ShardedOpWQ::_process(unsigned int, > > ceph::heartbeat_handle_d*)+0xc28) [0x55bd215874c8]", > > "(ShardedThreadPool::shardedthreadpool_worker(unsigned > int)+0x5c4) > > [0x55bd21c042a4]", > > "(ShardedThreadPool::WorkThreadSharded::entry()+0x14) > > [0x55bd21c07184]", > > "/lib64/libpthread.so.0(+0x81ca) [0x7f82e87c41ca]", > > "clone()" > > ], > > "ceph_version": "16.2.10", > > "crash_id": > > "2022-10-21T22:15:37.801458Z_ded9e80e-6c08-4502-8887-0438cdcd4a1c", > > "entity_name": "osd.21", > > "os_id": "centos", > > "os_name": "CentOS Stream", > > "os_version": "8", > > "os_version_id": "8", > > "process_name": "ceph-osd", > > "stack_sig": > > "a43cb3d3dcfcda8bffe97d30a1cb9c244ba20d595a2c8759a5cc8274781b0020", > > "timestamp": "2022-10-21T22:15:37.801458Z", > > "utsname_hostname": "node07", > > "utsname_machine": "x86_64", > > "utsname_release": "3.10.0-1160.45.1.el7.x86_64", > > "utsname_sysname": "Linux", > > "utsname_version": "#1 SMP Wed Oct 13 17:20:51 UTC 2021" > > } > > > > message info: > > [root@node07 ~]# dmesg -T | grep osd > > [Sat Oct 22 06:18:46 2022] tp_osd_tp[23817]: segfault at 55bd00000001 ip > > 00007f82e931cd6a sp 00007f82c6592f08 error 4 in > > libtcmalloc.so.4.5.3[7f82e92e1000+4d000] > > _______________________________________________ > > ceph-users mailing list -- ceph-users@xxxxxxx > > To unsubscribe send an email to ceph-users-leave@xxxxxxx > > There is currently a known race condition in onode reference counting > that affects all versions of Ceph [1][2]. Your backtrace is different > from everything else I've seen so far, though the top function being > Onode::put under a trimming operation is very suspicious. If you can > reproduce this and follow the tracker/report back after the fix, it > may be helpful to the community. > > Cheers, > Tyler > > [1] https://tracker.ceph.com/issues/57895 > [2] https://tracker.ceph.com/issues/56382 > _______________________________________________ ceph-users mailing list -- ceph-users@xxxxxxx To unsubscribe send an email to ceph-users-leave@xxxxxxx