Hello,
One of our ceph-mgr daemons keeps dying.
Everything worked fine for a long time, but now it happens every two
weeks or so.
We have two clusters. Both use the same ceph version 14.2.8. Each
cluster hosts three ceph-mgrs.
Only one ceph-mgr is dying, and it is always the same one, on the same
machine in one of the two clusters.
A web search turns up this tracker ticket:
https://tracker.ceph.com/issues/24995
However, that ticket is about Ceph 12, whereas we are running 14.2.8.
I have not found any hardware issues so far, and maybe a simple reboot
would help, but the log shows the following regarding prometheus:
*** Caught signal (Segmentation fault) **
in thread 7fb3f752f700 thread_name:prometheus
ceph version 14.2.8 (2d095e947a02261ce61424021bb43bd3022d35cb) nautilus
(stable)
1: (()+0x12890) [0x7fb4205d1890]
2:
(ceph::buffer::v14_2_0::ptr_node::cloner::operator()(ceph::buffer::v14_2_0::ptr_node
const&)+0x40) [0x7fb42158ca60]
3: (()+0xdd89c) [0x55fc4b5a889c]
4: (ActivePyModules::get_python(std::__cxx11::basic_string<char,
std::char_traits<char>, std::allocator<char> > const&)+0x1cb0)
[0x55fc4b60e3e0]
5: (()+0x1524ab) [0x55fc4b61d4ab]
[...]
The crash log shows:
{
"os_version_id": "18.04",
"utsname_release": "4.15.0-76-generic",
"os_name": "Ubuntu",
"entity_name": "mgr.xxxx",
"timestamp": "xxxx",
"process_name": "ceph-mgr",
"utsname_machine": "x86_64",
"utsname_sysname": "Linux",
"os_version": "18.04.4 LTS (Bionic Beaver)",
"os_id": "ubuntu",
"utsname_version": "#86-Ubuntu SMP Fri Jan 17 17:24:28 UTC 2020",
"backtrace": [
"(()+0x12890) [0x7fb4205d1890]",
"(ceph::buffer::v14_2_0::ptr_node::cloner::operator()(ceph::buffer::v14_2_0::ptr_node
const&)+0x40) [0x7fb42158ca60]",
"(()+0xdd89c) [0x55fc4b5a889c]",
"(ActivePyModules::get_python(std::__cxx11::basic_string<char,
std::char_traits<char>, std::allocator<char> > const&)+0x1cb0)
[0x55fc4b60e3e0]",
"(()+0x1524ab) [0x55fc4b61d4ab]",
"(PyEval_EvalFrameEx()+0x8010) [0x7fb420af0770]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(PyEval_EvalFrameEx()+0x5bf6) [0x7fb420aee356]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a71d) [0x7fb420bc871d]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x5314) [0x7fb420aeda74]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a71d) [0x7fb420bc871d]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a1eec) [0x7fb420befeec]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x12030a) [0x7fb420b6e30a]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x5314) [0x7fb420aeda74]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a639) [0x7fb420bc8639]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a1eec) [0x7fb420befeec]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a2704) [0x7fb420bf0704]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x41e1) [0x7fb420aec941]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a639) [0x7fb420bc8639]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a1eec) [0x7fb420befeec]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x11f862) [0x7fb420b6d862]",
"(()+0x1240ca) [0x7fb420b720ca]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x41e1) [0x7fb420aec941]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a71d) [0x7fb420bc871d]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a1eec) [0x7fb420befeec]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x12030a) [0x7fb420b6e30a]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x5314) [0x7fb420aeda74]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(PyEval_EvalFrameEx()+0x5bf6) [0x7fb420aee356]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a639) [0x7fb420bc8639]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a1eec) [0x7fb420befeec]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x11f862) [0x7fb420b6d862]",
"(()+0x1240ca) [0x7fb420b720ca]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x41e1) [0x7fb420aec941]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a639) [0x7fb420bc8639]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a1eec) [0x7fb420befeec]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x12030a) [0x7fb420b6e30a]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x41e1) [0x7fb420aec941]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a639) [0x7fb420bc8639]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a1eec) [0x7fb420befeec]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x12030a) [0x7fb420b6e30a]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x41e1) [0x7fb420aec941]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a639) [0x7fb420bc8639]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a1eec) [0x7fb420befeec]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x12030a) [0x7fb420b6e30a]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x41e1) [0x7fb420aec941]",
"(PyEval_EvalCodeEx()+0x7d8) [0x7fb420c2d908]",
"(()+0x17a639) [0x7fb420bc8639]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x1a1eec) [0x7fb420befeec]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(()+0x12030a) [0x7fb420b6e30a]",
"(PyObject_Call()+0x43) [0x7fb420bc2903]",
"(PyEval_EvalFrameEx()+0x41e1) [0x7fb420aec941]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]",
"(PyEval_EvalFrameEx()+0x8b5b) [0x7fb420af12bb]"
],
"utsname_hostname": "xxxx",
"crash_id": "xxxx",
"archived": "xxxx",
"ceph_version": "14.2.8"
}
Best regards,
Malte
_______________________________________________
ceph-users mailing list -- ceph-users@xxxxxxx
To unsubscribe send an email to ceph-users-leave@xxxxxxx