Hi Sebastian, thanks for your suggestion. I tried turning off Prometheus as the post suggested, but the mgr still crashes every few days. I then looked into the mgr container and found lots of defunct ssh processes appearing every few minutes, so it seems related to remote-exec issues. I tried turning on mgr debug and looked through all possible logs on all Ceph nodes, but still found no obvious indication. UID PID PPID C STIME TTY TIME CMD ceph 1 0 1 09:41 ? 00:00:25 /usr/bin/ceph-mgr -n mgr.sds01-cp.cwcxek -f --setuser ceph --setgroup ceph --default-log-to-file=false --default-log-to-stderr=true --default-log-stderr-prefix=debug ceph 91 1 0 09:41 ? 00:00:00 [ssh] <defunct> ceph 94 1 0 09:41 ? 00:00:00 [ssh] <defunct> ceph 98 1 0 09:41 ? 00:00:00 [ssh] <defunct> ceph 99 1 0 09:41 ? 00:00:00 [ssh] <defunct> ceph 100 1 0 09:41 ? 00:00:00 [ssh] <defunct> ceph 101 1 0 09:41 ? 00:00:00 [ssh] <defunct> ceph 102 1 0 09:41 ? 00:00:00 [ssh] <defunct> ceph 164 1 0 09:51 ? 00:00:00 [ssh] <defunct> ceph 165 1 0 09:51 ? 00:00:00 [ssh] <defunct> ceph 166 1 0 09:51 ? 00:00:00 [ssh] <defunct> ceph 167 1 0 09:51 ? 00:00:00 [ssh] <defunct> ceph 168 1 0 09:51 ? 00:00:00 [ssh] <defunct> ceph 169 1 0 09:51 ? 00:00:00 [ssh] <defunct> ceph 170 1 0 09:51 ? 00:00:00 [ssh] <defunct> ceph 178 1 0 10:01 ? 00:00:00 [ssh] <defunct> ceph 179 1 0 10:01 ? 00:00:00 [ssh] <defunct> ceph 180 1 0 10:01 ? 00:00:00 [ssh] <defunct> ceph 181 1 0 10:01 ? 00:00:00 [ssh] <defunct> ceph 182 1 0 10:01 ? 00:00:00 [ssh] <defunct> ceph 183 1 0 10:01 ? 00:00:00 [ssh] <defunct> ceph 184 1 0 10:01 ? 00:00:00 [ssh] <defunct> ceph 192 1 0 10:11 ? 00:00:00 [ssh] <defunct> ceph 193 1 0 10:11 ? 00:00:00 [ssh] <defunct> ceph 194 1 0 10:11 ? 00:00:00 [ssh] <defunct> ceph 195 1 0 10:11 ? 00:00:00 [ssh] <defunct> ceph 196 1 0 10:11 ? 00:00:00 [ssh] <defunct> ceph 197 1 0 10:11 ? 00:00:00 [ssh] <defunct> ceph 198 1 0 10:11 ? 00:00:00 [ssh] <defunct> ceph 223 1 0 10:21 ? 
00:00:00 ssh -C -F /tmp/cephadm-conf-ugty_66i -i /tmp/cephadm-identity-mp8d481u cephadm@192.168.88.18 sudo python3 -c "import sys;exec(eval(sys.stdin.readline()))" ceph 224 1 0 10:21 ? 00:00:00 ssh -C -F /tmp/cephadm-conf-ugty_66i -i /tmp/cephadm-identity-mp8d481u cephadm@192.168.88.17 sudo python3 -c "import sys;exec(eval(sys.stdin.readline()))" ceph 225 1 0 10:21 ? 00:00:00 ssh -C -F /tmp/cephadm-conf-ugty_66i -i /tmp/cephadm-identity-mp8d481u cephadm@192.168.88.20 sudo python3 -c "import sys;exec(eval(sys.stdin.readline()))" ceph 226 1 0 10:21 ? 00:00:00 ssh -C -F /tmp/cephadm-conf-ugty_66i -i /tmp/cephadm-identity-mp8d481u cephadm@192.168.99.16 sudo python3 -c "import sys;exec(eval(sys.stdin.readline()))" ceph 227 1 0 10:21 ? 00:00:00 ssh -C -F /tmp/cephadm-conf-ugty_66i -i /tmp/cephadm-identity-mp8d481u cephadm@192.168.88.19 sudo python3 -c "import sys;exec(eval(sys.stdin.readline()))" ceph 228 1 0 10:21 ? 00:00:00 ssh -C -F /tmp/cephadm-conf-ugty_66i -i /tmp/cephadm-identity-mp8d481u cephadm@192.168.88.15 sudo python3 -c "import sys;exec(eval(sys.stdin.readline()))" ceph 229 1 0 10:21 ? 00:00:00 ssh -C -F /tmp/cephadm-conf-ugty_66i -i /tmp/cephadm-identity-mp8d481u cephadm@192.168.88.14 sudo python3 -c "import sys;exec(eval(sys.stdin.readline()))" root 237 0 0 10:21 pts/0 00:00:00 bash root 253 237 0 10:22 pts/0 00:00:00 ps -eaf On Thu, Feb 11, 2021 at 6:00 PM Sebastian Luna Valero < sebastian.luna.valero@xxxxxxxxx> wrote: > Hi, > > The following thread on this emailing list might be relevant: > > https://lists.ceph.io/hyperkitty/list/ceph-users@xxxxxxx/thread/IAJRTIMFALJTZD3KYBHT4G7GEL6EHRR5/#IAJRTIMFALJTZD3KYBHT4G7GEL6EHRR5 > > Best regards, > Sebastian > > > El jue, 11 feb 2021 a las 10:32, levin ng (<levindecaro@xxxxxxxxx>) > escribió: > > > Hi all, > > > > I’d recently deployed ceph 15.2.8 with 3(mon,mgr,rgw,mds) and 4 (osd) > > total 7 host, however I encountered mgr crash a few times a week, the > > crashing mgr can be any one of 3. 
I couldn’t identify the problem behind > > and here is the crash info, appreciate anyone if you have suggestions > that > > I could narrow it down. > > > > Thank you very much. > > > > { > > "assert_condition": "ret == 0", > > "assert_file": > > > > > "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.8/rpm/el8/BUILD/ceph-15.2.8/src/common/Thread.cc", > > "assert_func": "void Thread::create(const char*, size_t)", > > "assert_line": 157, > > "assert_msg": > > > > > "/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.8/rpm/el8/BUILD/ceph-15.2.8/src/common/Thread.cc: > > In function 'void Thread::create(const char*, size_t)' thread > 7f833addc700 > > time > > > > > 2021-02-10T20:00:32.980508+0000\n/home/jenkins-build/build/workspace/ceph-build/ARCH/x86_64/AVAILABLE_ARCH/x86_64/AVAILABLE_DIST/centos8/DIST/centos8/MACHINE_SIZE/gigantic/release/15.2.8/rpm/el8/BUILD/ceph-15.2.8/src/common/Thread.cc: > > 157: FAILED ceph_assert(ret == 0)\n", > > "assert_thread_name": "mgr-fin", > > "backtrace": [ > > "(()+0x12b20) [0x7f835a51cb20]", > > "(gsignal()+0x10f) [0x7f8358f6d7ff]", > > "(abort()+0x127) [0x7f8358f57c35]", > > "(ceph::__ceph_assert_fail(char const*, char const*, int, char > > const*)+0x1a9) [0x7f835c07b735]", > > "(()+0x27a8fe) [0x7f835c07b8fe]", > > "(()+0x34cef6) [0x7f835c14def6]", > > "(DispatchQueue::start()+0x3a) [0x7f835c29697a]", > > "(AsyncMessenger::ready()+0xcd) [0x7f835c3340cd]", > > "(Messenger::add_dispatcher_head(Dispatcher*)+0x68) > > [0x7f835c3f8478]", > > "(MonClient::get_monmap_and_config()+0xbb) [0x7f835c3f66ab]", > > "(ceph_mount_info::init()+0x4d) [0x7f834298435d]", > > "(()+0x3680f) [0x7f8342cd280f]", > > "(()+0x19d421) [0x7f835ba5c421]", > > "(_PyEval_EvalFrameDefault()+0x498) [0x7f835ba5ce08]", > > "(()+0x179c78) [0x7f835ba38c78]", > > 
"(()+0x19d1c7) [0x7f835ba5c1c7]", > > "(_PyEval_EvalFrameDefault()+0x498) [0x7f835ba5ce08]", > > "(()+0x179c78) [0x7f835ba38c78]", > > "(()+0x19d1c7) [0x7f835ba5c1c7]", > > "(_PyEval_EvalFrameDefault()+0x498) [0x7f835ba5ce08]", > > "(()+0x1221d4) [0x7f835b9e11d4]", > > "(()+0x122c55) [0x7f835b9e1c55]", > > "(()+0x19cf27) [0x7f835ba5bf27]", > > "(_PyEval_EvalFrameDefault()+0x498) [0x7f835ba5ce08]", > > "(_PyFunction_FastCallDict()+0x122) [0x7f835b9b9ec2]", > > "(_PyObject_FastCallDict()+0x70e) [0x7f835b9bac9e]", > > "(()+0x10dc70) [0x7f835b9ccc70]", > > "(_PyObject_FastCallDict()+0x6ec) [0x7f835b9bac7c]", > > "(PyObject_CallFunctionObjArgs()+0xe8) [0x7f835b9dbd48]", > > "(_PyEval_EvalFrameDefault()+0x2588) [0x7f835ba5eef8]", > > "(()+0xf99b4) [0x7f835b9b89b4]", > > "(()+0x179e60) [0x7f835ba38e60]", > > "(()+0x19d1c7) [0x7f835ba5c1c7]", > > "(_PyEval_EvalFrameDefault()+0x10d5) [0x7f835ba5da45]", > > "(()+0x179c78) [0x7f835ba38c78]", > > "(()+0x19d1c7) [0x7f835ba5c1c7]", > > "(_PyEval_EvalFrameDefault()+0x498) [0x7f835ba5ce08]", > > "(()+0xfa326) [0x7f835b9b9326]", > > "(()+0x179e60) [0x7f835ba38e60]", > > "(()+0x19d1c7) [0x7f835ba5c1c7]", > > "(_PyEval_EvalFrameDefault()+0x498) [0x7f835ba5ce08]", > > "(()+0x179c78) [0x7f835ba38c78]", > > "(()+0x19d1c7) [0x7f835ba5c1c7]", > > "(_PyEval_EvalFrameDefault()+0x498) [0x7f835ba5ce08]", > > "(_PyFunction_FastCallDict()+0x122) [0x7f835b9b9ec2]", > > "(_PyObject_FastCallDict()+0x70e) [0x7f835b9bac9e]", > > "(()+0x10dc70) [0x7f835b9ccc70]", > > "(PyObject_Call()+0x4b) [0x7f835b9c1acb]", > > "(PyObject_CallMethod()+0x10b) [0x7f835ba5ac6b]", > > "(ActivePyModule::handle_command(ModuleCommand const&, MgrSession > > const&, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, > > std::allocator<char> >, boost::variant<std::__cxx11::basic_string<char, > > std::char_traits<char>, std::allocator<char> >, bool, long, double, > > std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, > > std::allocator<char> 
>, std::allocator<std::__cxx11::basic_string<char, > > std::char_traits<char>, std::allocator<char> > > >, std::vector<long, > > std::allocator<long> >, std::vector<double, std::allocator<double> > >, > > std::less<void>, > std::allocator<std::pair<std::__cxx11::basic_string<char, > > std::char_traits<char>, std::allocator<char> > const, > > boost::variant<std::__cxx11::basic_string<char, std::char_traits<char>, > > std::allocator<char> >, bool, long, double, > > std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, > > std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, > > std::char_traits<char>, std::allocator<char> > > >, std::vector<long, > > std::allocator<long> >, std::vector<double, std::allocator<double> > > > > > > > > const&, ceph::buffer::v15_2_0::list const&, > > std::__cxx11::basic_stringstream<char, std::char_traits<char>, > > std::allocator<char> >*, std::__cxx11::basic_stringstream<char, > > std::char_traits<char>, std::allocator<char> >*)+0x222) > [0x55bc0b8a0cb2]", > > "(()+0x1b0fdd) [0x55bc0b8f5fdd]", > > "(Context::complete(int)+0xd) [0x55bc0b8b0bdd]", > > "(Finisher::finisher_thread_entry()+0x1a5) [0x7f835c10b465]", > > "(()+0x814a) [0x7f835a51214a]", > > "(clone()+0x43) [0x7f8359032f23]" > > ], > > "ceph_version": "15.2.8", > > "crash_id": > > "2021-02-10T20:00:32.989661Z_201fd5fb-6e0a-4b50-8a95-fdf9ed9aeb81", > > "entity_name": "mgr.sds01-cp.cwcxek", > > "os_id": "centos", > > "os_name": "CentOS Linux", > > "os_version": "8", > > "os_version_id": "8", > > "process_name": "ceph-mgr", > > "stack_sig": > > "e1c15d685283e7598b128a37a328ba86ec433dfef97597ac9453b5d52608feda", > > "timestamp": "2021-02-10T20:00:32.989661Z", > > "utsname_hostname": "sds01-cp", > > "utsname_machine": "x86_64", > > "utsname_release": "4.18.0-240.10.1.el8_3.x86_64", > > "utsname_sysname": "Linux", > > "utsname_version": "#1 SMP Wed Dec 16 03:30:52 EST 2020" > > } > > _______________________________________________ > > ceph-users 
mailing list -- ceph-users@xxxxxxx > > To unsubscribe send an email to ceph-users-leave@xxxxxxx > > > _______________________________________________ > ceph-users mailing list -- ceph-users@xxxxxxx > To unsubscribe send an email to ceph-users-leave@xxxxxxx > _______________________________________________ ceph-users mailing list -- ceph-users@xxxxxxx To unsubscribe send an email to ceph-users-leave@xxxxxxx