Hi,
I had an OSD crash yesterday with 15.2.7.
It seems similar:
```
ceph crash info 2020-12-13T02:37:57.475315Z_63f91999-ca9c-49a5-b381-5fad9780dbbb
{
    "backtrace": [
        "(()+0x12730) [0x7f6bccbb5730]",
        "(std::_Rb_tree<boost::intrusive_ptr<AsyncConnection>, boost::intrusive_ptr<AsyncConnection>, std::_Identity<boost::intrusive_ptr<AsyncConnection> >, std::less<boost::intrusive_ptr<AsyncConnection> >, std::allocator<boost::intrusive_ptr<AsyncConnection> > >::find(boost::intrusive_ptr<AsyncConnection> const&) const+0x24) [0x559442799394]",
        "(AsyncConnection::_stop()+0xa7) [0x5594427939d7]",
        "(ProtocolV2::stop()+0x8b) [0x5594427bb41b]",
        "(ProtocolV2::_fault()+0x6b) [0x5594427bb59b]",
        "(ProtocolV2::handle_read_frame_preamble_main(std::unique_ptr<ceph::buffer::v15_2_0::ptr_node, ceph::buffer::v15_2_0::ptr_node::disposer>&&, int)+0x328) [0x5594427d15e8]",
        "(ProtocolV2::run_continuation(Ct<ProtocolV2>&)+0x34) [0x5594427bc114]",
        "(AsyncConnection::process()+0x79c) [0x55944279682c]",
        "(EventCenter::process_events(unsigned int, std::chrono::duration<unsigned long, std::ratio<1l, 1000000000l> >*)+0xa2d) [0x5594425fa91d]",
        "(()+0x11f41cb) [0x5594426001cb]",
        "(()+0xbbb2f) [0x7f6bcca7ab2f]",
        "(()+0x7fa3) [0x7f6bccbaafa3]",
        "(clone()+0x3f) [0x7f6bcc7584cf]"
    ],
    "ceph_version": "15.2.7",
    "crash_id": "2020-12-13T02:37:57.475315Z_63f91999-ca9c-49a5-b381-5fad9780dbbb",
    "entity_name": "osd.57",
    "os_id": "10",
    "os_name": "Debian GNU/Linux 10 (buster)",
    "os_version": "10 (buster)",
    "os_version_id": "10",
    "process_name": "ceph-osd",
    "stack_sig": "897fe7f6bf2184fafd5b8a29905a147cb66850db318f6e874292a278aeb615bb",
    "timestamp": "2020-12-13T02:37:57.475315Z",
    "utsname_hostname": "ceph5-9",
    "utsname_machine": "x86_64",
    "utsname_release": "4.19.0-11-amd64",
    "utsname_sysname": "Linux",
    "utsname_version": "#1 SMP Debian 4.19.146-1 (2020-09-17)"
}
```
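For anyone who finds this thread later: the report above comes from the mgr crash module. A minimal sketch of how these can be pulled up (assuming the always-on crash module that ships with Octopus; the crash ID below is the one from my dump):

```
# list crashes that have not been acknowledged yet
ceph crash ls-new

# show the full report for a single crash
ceph crash info 2020-12-13T02:37:57.475315Z_63f91999-ca9c-49a5-b381-5fad9780dbbb

# once reviewed, archive it so it no longer raises the RECENT_CRASH health warning
ceph crash archive 2020-12-13T02:37:57.475315Z_63f91999-ca9c-49a5-b381-5fad9780dbbb
```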
On 02/12/2020 20:43, Ivan Kurnosov wrote:
Hi Team,
last night I caught the following segfault.
Nothing else looks suspicious (but I'm quite a newbie at Ceph management,
so perhaps I just don't know where to look).
I could not find any similar segfault reported by anybody else via a web search.
Is this a known problem that has been fixed in later versions?
The cluster has been running for quite a while now (several months) and
this is the first time it has happened.
```
debug 0> 2020-12-02T10:31:09.295+0000 7f3e61943700 -1 *** Caught signal (Segmentation fault) **
in thread 7f3e61943700 thread_name:msgr-worker-1
ceph version 15.2.4 (7447c15c6ff58d7fce91843b705a268a1917325c) octopus (stable)
1: (()+0x12dd0) [0x7f3e65933dd0]
2: (std::_Rb_tree<boost::intrusive_ptr<AsyncConnection>, boost::intrusive_ptr<AsyncConnection>, std::_Identity<boost::intrusive_ptr<AsyncConnection> >, std::less<boost::intrusive_ptr<AsyncConnection> >, std::allocator<boost::intrusive_ptr<AsyncConnection> > >::find(boost::intrusive_ptr<AsyncConnection> const&) const+0x2c) [0x55cee407ca7c]
3: (AsyncConnection::_stop()+0xab) [0x55cee40767eb]
4: (ProtocolV2::stop()+0x8f) [0x55cee40a189f]
5: (ProtocolV2::_fault()+0x133) [0x55cee40a1b03]
6: (ProtocolV2::handle_read_frame_preamble_main(std::unique_ptr<ceph::buffer::v15_2_0::ptr_node, ceph::buffer::v15_2_0::ptr_node::disposer>&&, int)+0x551) [0x55cee40a63d1]
7: (ProtocolV2::run_continuation(Ct<ProtocolV2>&)+0x3c) [0x55cee40a273c]
8: (AsyncConnection::process()+0x8a9) [0x55cee4079ab9]
9: (EventCenter::process_events(unsigned int, std::chrono::duration<unsigned long, std::ratio<1l, 1000000000l> >*)+0xcb7) [0x55cee3eceb67]
10: (()+0xdb914c) [0x55cee3ed414c]
11: (()+0xc2b73) [0x7f3e64f83b73]
12: (()+0x82de) [0x7f3e659292de]
13: (clone()+0x43) [0x7f3e64660e83]
NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
--- logging levels ---
0/ 5 none
0/ 1 lockdep
0/ 1 context
1/ 1 crush
1/ 5 mds
1/ 5 mds_balancer
1/ 5 mds_locker
1/ 5 mds_log
1/ 5 mds_log_expire
1/ 5 mds_migrator
0/ 1 buffer
0/ 1 timer
0/ 1 filer
0/ 1 striper
0/ 1 objecter
0/ 5 rados
0/ 5 rbd
0/ 5 rbd_mirror
0/ 5 rbd_replay
0/ 5 rbd_rwl
0/ 5 journaler
0/ 5 objectcacher
0/ 5 immutable_obj_cache
0/ 5 client
1/ 5 osd
0/ 5 optracker
0/ 5 objclass
1/ 3 filestore
1/ 3 journal
0/ 0 ms
1/ 5 mon
0/10 monc
1/ 5 paxos
0/ 5 tp
1/ 5 auth
1/ 5 crypto
1/ 1 finisher
1/ 1 reserver
1/ 5 heartbeatmap
1/ 5 perfcounter
1/ 5 rgw
1/ 5 rgw_sync
1/10 civetweb
1/ 5 javaclient
1/ 5 asok
1/ 1 throttle
0/ 0 refs
1/ 5 compressor
1/ 5 bluestore
1/ 5 bluefs
1/ 3 bdev
1/ 5 kstore
4/ 5 rocksdb
4/ 5 leveldb
4/ 5 memdb
1/ 5 fuse
1/ 5 mgr
1/ 5 mgrc
1/ 5 dpdk
1/ 5 eventtrace
1/ 5 prioritycache
0/ 5 test
-2/-2 (syslog threshold)
99/99 (stderr threshold)
--- pthread ID / name mapping for recent threads ---
7f3e3d88b700 / osd_srv_heartbt
7f3e41893700 / tp_osd_tp
7f3e4589b700 / tp_osd_tp
7f3e4f8af700 / rocksdb:dump_st
7f3e52a8c700 / safe_timer
7f3e53a8e700 / ms_dispatch
7f3e566b7700 / bstore_mempool
7f3e5d0ca700 / safe_timer
7f3e61943700 / msgr-worker-1
7f3e62144700 / msgr-worker-0
max_recent 10000
max_new 1000
log_file
/var/lib/ceph/crash/2020-12-02T10:31:09.301492Z_84e8430f-30fd-469f-8e22-c2e1ccc675da/log
--- end dump of recent events ---
reraise_fatal: default handler for signal 11 didn't terminate the process?
```
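One note on the dump above: `ms` logging is at 0/0, so there is nothing from the messenger around the fault. If it recurs, a possible next step (just a sketch, not something I have confirmed captures this particular race; `osd.<id>` is a placeholder for the affected daemon) is to raise debug_ms temporarily and revert it afterwards, since it is very verbose:

```
# bump messenger logging on one OSD at runtime
ceph tell osd.<id> config set debug_ms 1/5

# or persist it for all OSDs via the monitor config database
ceph config set osd debug_ms 1/5

# revert once the extra logs have been captured
ceph config rm osd debug_ms
```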