We are updating our Ceph cluster from Octopus (v15.2.17) to Quincy (v17.2.6). We used ceph-deploy to update the ceph packages on all hosts and then restarted the services one by one (mon -> mgr -> osd -> rgw), roughly as shown below.
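(For reference, the per-host restart used the standard systemd unit templates ceph-mon@ / ceph-mgr@ / ceph-osd@ / ceph-radosgw@. The instance names below are only placeholders for illustration; the actual unit names on each host can be listed with `systemctl list-units 'ceph-*'`.)

# systemctl restart ceph-mon@$(hostname -s)           <- on the MON hosts
# systemctl restart ceph-mgr@$(hostname -s)           <- on the MGR hosts
# systemctl restart ceph-osd@<id>                     <- for each OSD id on the host
# systemctl restart ceph-radosgw@rgw.$(hostname -s)   <- on the RGW host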
During the restart on the first node, all OSDs failed to come back "up" and got stuck in the "booting" state:

# ceph daemon osd.3 status
{
    "cluster_fsid": "f95b201c-4cd6-4c36-a54e-7f2b68608b8f",
    "osd_fsid": "b0141718-a2ac-4a26-808b-17b6741b789e",
    "whoami": 3,
    "state": "booting",
    "oldest_map": 4437792,
    "newest_map": 4441114,
    "num_pgs": 29
}

When we then ran "ceph osd require-osd-release quincy", the monitor service crashed. Note that the release gate is still reported as nautilus:

# ceph report | jq '.osdmap.require_osd_release'
"nautilus"

The monitor crashed with the following errors:

    -2> 2023-07-25T12:10:20.977+0600 7f245a84f700  5 mon.ceph-ph-mon1-dc3@0(leader).paxos(paxos updating c 81819224..81819937) is_readable = 1 - now=2023-07-25T12:10:20.981801+0600 lease_expire=2023-07-25T12:10:25.959818+0600 has v0 lc 81819937
    -1> 2023-07-25T12:10:20.997+0600 7f245a84f700 -1 /build/ceph-17.2.6/src/mon/OSDMonitor.cc: In function 'bool OSDMonitor::prepare_command_impl(MonOpRequestRef, const cmdmap_t&)' thread 7f245a84f700 time 2023-07-25T12:10:20.981991+0600
/build/ceph-17.2.6/src/mon/OSDMonitor.cc: 11631: FAILED ceph_assert(osdmap.require_osd_release >= ceph_release_t::octopus)

 ceph version 17.2.6 (d7ff0d10654d2280e08f1ab989c7cdf3064446a5) quincy (stable)
 1: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x14f) [0x7f24629d3878]
 2: /usr/lib/ceph/libceph-common.so.2(+0x27da8a) [0x7f24629d3a8a]
 3: (OSDMonitor::prepare_command_impl(boost::intrusive_ptr<MonOpRequest>, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, boost::variant<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, bool, long, double, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::vector<long, std::allocator<long> >, std::vector<double, std::allocator<double> > >, std::less<void>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, boost::variant<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, bool, long, double, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::vector<long, std::allocator<long> >, std::vector<double, std::allocator<double> > > > > > const&)+0xcb13) [0x5569f209a823]
 4: (OSDMonitor::prepare_command(boost::intrusive_ptr<MonOpRequest>)+0x45f) [0x5569f20ab89f]
 5: (OSDMonitor::prepare_update(boost::intrusive_ptr<MonOpRequest>)+0x162) [0x5569f20baa42]
 6: (PaxosService::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x716) [0x5569f201fd86]
 7: (PaxosService::C_RetryMessage::_finish(int)+0x6c) [0x5569f1f4f93c]
 8: (C_MonOp::finish(int)+0x4b) [0x5569f1ebbb3b]
 9: (Context::complete(int)+0xd) [0x5569f1ebaa0d]
 10: (void finish_contexts<std::__cxx11::list<Context*, std::allocator<Context*> > >(ceph::common::CephContext*, std::__cxx11::list<Context*, std::allocator<Context*> >&, int)+0xb0) [0x5569f1ef11e0]
 11: (Paxos::finish_round()+0xb1) [0x5569f2015a61]
 12: (Paxos::handle_last(boost::intrusive_ptr<MonOpRequest>)+0x11e3) [0x5569f20172a3]
 13: (Paxos::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x49f) [0x5569f2019f7f]
 14: (Monitor::dispatch_op(boost::intrusive_ptr<MonOpRequest>)+0x14f4) [0x5569f1eb7f34]
 15: (Monitor::_ms_dispatch(Message*)+0xa68) [0x5569f1eb8bd8]
 16: (Dispatcher::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x5d) [0x5569f1ef2c4d]
 17: (Messenger::ms_deliver_dispatch(boost::intrusive_ptr<Message> const&)+0x460) [0x7f2462c71da0]
 18: (DispatchQueue::entry()+0x58f) [0x7f2462c6f63f]
 19: (DispatchQueue::DispatchThread::entry()+0x11) [0x7f2462d40b61]
 20: /lib/x86_64-linux-gnu/libpthread.so.0(+0x9609) [0x7f24624f4609]
 21: clone()

     0> 2023-07-25T12:10:21.009+0600 7f245a84f700 -1 *** Caught signal (Aborted) **
 in thread 7f245a84f700 thread_name:ms_dispatch

 ceph version 17.2.6 (d7ff0d10654d2280e08f1ab989c7cdf3064446a5) quincy (stable)
 1: /lib/x86_64-linux-gnu/libpthread.so.0(+0x153c0) [0x7f24625003c0]
 2: gsignal()
 3: abort()
 4: (ceph::__ceph_assert_fail(char const*, char const*, int, char const*)+0x1b7) [0x7f24629d38e0]
 5: /usr/lib/ceph/libceph-common.so.2(+0x27da8a) [0x7f24629d3a8a]
 6: (OSDMonitor::prepare_command_impl(boost::intrusive_ptr<MonOpRequest>, std::map<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, boost::variant<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, bool, long, double, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::vector<long, std::allocator<long> >, std::vector<double, std::allocator<double> > >, std::less<void>, std::allocator<std::pair<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const, boost::variant<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, bool, long, double, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > >, std::vector<long, std::allocator<long> >, std::vector<double, std::allocator<double> > > > > > const&)+0xcb13) [0x5569f209a823]
 7: (OSDMonitor::prepare_command(boost::intrusive_ptr<MonOpRequest>)+0x45f) [0x5569f20ab89f]
 8: (OSDMonitor::prepare_update(boost::intrusive_ptr<MonOpRequest>)+0x162) [0x5569f20baa42]
 9: (PaxosService::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x716) [0x5569f201fd86]
 10: (PaxosService::C_RetryMessage::_finish(int)+0x6c) [0x5569f1f4f93c]
 11: (C_MonOp::finish(int)+0x4b) [0x5569f1ebbb3b]
 12: (Context::complete(int)+0xd) [0x5569f1ebaa0d]
 13: (void finish_contexts<std::__cxx11::list<Context*, std::allocator<Context*> > >(ceph::common::CephContext*, std::__cxx11::list<Context*, std::allocator<Context*> >&, int)+0xb0) [0x5569f1ef11e0]
 14: (Paxos::finish_round()+0xb1) [0x5569f2015a61]
 15: (Paxos::handle_last(boost::intrusive_ptr<MonOpRequest>)+0x11e3) [0x5569f20172a3]
 16: (Paxos::dispatch(boost::intrusive_ptr<MonOpRequest>)+0x49f) [0x5569f2019f7f]
 17: (Monitor::dispatch_op(boost::intrusive_ptr<MonOpRequest>)+0x14f4) [0x5569f1eb7f34]
 18: (Monitor::_ms_dispatch(Message*)+0xa68) [0x5569f1eb8bd8]
 19: (Dispatcher::ms_dispatch2(boost::intrusive_ptr<Message> const&)+0x5d) [0x5569f1ef2c4d]
 20: (Messenger::ms_deliver_dispatch(boost::intrusive_ptr<Message> const&)+0x460) [0x7f2462c71da0]
 21: (DispatchQueue::entry()+0x58f) [0x7f2462c6f63f]
 22: (DispatchQueue::DispatchThread::entry()+0x11) [0x7f2462d40b61]
 23: /lib/x86_64-linux-gnu/libpthread.so.0(+0x9609) [0x7f24624f4609]
 24: clone()
 NOTE: a copy of the executable, or `objdump -rdS <executable>` is needed to interpret this.
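(For context: the assert above is checking that require_osd_release is already at least octopus, which matches the "require_osd_release < octopus" health warning further below. The gate and the release each running daemon reports can also be double-checked with the stock commands below; nothing in them is specific to our setup.)

# ceph osd dump | grep require_osd_release    <- should show the same "nautilus" value as the ceph report output above
# ceph versions                               <- summary of the release each running mon/mgr/osd daemon reports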
Now the cluster looks as follows:

# ceph -s
  cluster:
    id:     f95b201c-4cd6-4c36-a54e-7f2b68608b8f
    health: HEALTH_WARN
            noout flag(s) set
            12 osds down
            1 host (12 osds) down
            all OSDs are running octopus or later but require_osd_release < octopus
            Degraded data redundancy: 5731463/11463184 objects degraded (49.999%), 240 pgs degraded, 315 pgs undersized

  services:
    mon: 3 daemons, quorum ceph-ph-mon1-dc3,hw-ceph-ph3-dc3,hw-ceph-ph4-dc3 (age 68m)
    mgr: ceph-ph-mon1-dc3.alahd.kz.test.bash.kz(active, since 45m)
    osd: 25 osds: 13 up (since 26h), 25 in (since 21h); 6 remapped pgs
         flags noout
    rgw: 1 daemon active (1 hosts, 1 zones)

  data:
    pools:   8 pools, 321 pgs
    objects: 5.73M objects, 4.4 TiB
    usage:   8.8 TiB used, 32 TiB / 40 TiB avail
    pgs:     5731463/11463184 objects degraded (49.999%)
             129/11463184 objects misplaced (0.001%)
             240 active+undersized+degraded
             75  active+undersized
             6   active+clean+remapped

# ceph osd df tree
ID  CLASS  WEIGHT    REWEIGHT  SIZE     RAW USE  DATA     OMAP      META     AVAIL    %USE   VAR   PGS  STATUS  TYPE NAME
-1         42.23981         -   22 TiB  4.7 TiB  4.6 TiB   1.2 GiB   22 GiB   17 TiB      0     0    -          root default
-2         20.19991         -      0 B      0 B      0 B       0 B      0 B      0 B      0     0    -          host hw-ceph-ph3
 0    hdd   1.81999   1.00000      0 B      0 B      0 B       0 B      0 B      0 B      0     0    0    down      osd.0
 1    hdd   1.81999   1.00000  1.8 TiB  401 GiB  396 GiB    76 MiB  1.2 GiB  1.4 TiB  21.47  0.98    0    down      osd.1
 2    hdd   1.81999   1.00000  1.8 TiB  365 GiB  362 GiB    45 KiB  1.1 GiB  1.5 TiB  19.59  0.90    0    down      osd.2
 3    hdd   1.81999   1.00000  1.8 TiB  584 GiB  580 GiB   825 KiB  1.7 GiB  1.3 TiB  31.29  1.43    0    down      osd.3
 4    hdd   1.81999   1.00000  1.8 TiB  365 GiB  362 GiB   621 KiB  1.1 GiB  1.5 TiB  19.57  0.90    0    down      osd.4
 5    hdd   1.81999   1.00000  1.8 TiB  583 GiB  579 GiB    31 KiB  1.7 GiB  1.3 TiB  31.25  1.43    0    down      osd.5
 6    hdd   1.81999   1.00000  1.8 TiB  365 GiB  362 GiB    43 KiB  1.1 GiB  1.5 TiB  19.55  0.90    0    down      osd.6
 7    hdd   1.81999   1.00000  1.8 TiB  365 GiB  362 GiB    12 KiB  1.1 GiB  1.5 TiB  19.58  0.90    0    down      osd.7
 8    hdd   1.81999   1.00000  1.8 TiB  365 GiB  362 GiB    27 KiB  1.1 GiB  1.5 TiB  19.58  0.90    0    down      osd.8
 9    hdd   1.81999   1.00000  1.8 TiB  547 GiB  543 GiB    47 KiB  1.6 GiB  1.3 TiB  29.32  1.34    0    down      osd.9
10    hdd   1.81999   1.00000  1.8 TiB  330 GiB  327 GiB    68 KiB  987 MiB  1.5 TiB  17.67  0.81    0    down      osd.10
12    ssd   0.17999   1.00000  186 GiB  1.2 GiB   67 MiB   1.1 GiB   86 MiB  185 GiB   0.65  0.03    0    down      osd.12
-3         22.03990         -   22 TiB  4.7 TiB  4.6 TiB   1.2 GiB   22 GiB   17 TiB  21.11  0.97    -          host hw-ceph-ph4
11    hdd   1.81999   1.00000  1.8 TiB  550 GiB  546 GiB  1012 KiB  2.3 GiB  1.3 TiB  29.48  1.35   34      up      osd.11
13    hdd   1.81999   1.00000  1.8 TiB  332 GiB  329 GiB   2.8 MiB  1.5 GiB  1.5 TiB  17.78  0.82   22      up      osd.13
14    hdd   1.81999   1.00000  1.8 TiB  550 GiB  547 GiB   4.4 MiB  2.1 GiB  1.3 TiB  29.50  1.35   25      up      osd.14
15    hdd   1.81999   1.00000  1.8 TiB  259 GiB  256 GiB   1.6 MiB  1.2 GiB  1.6 TiB  13.86  0.64   20      up      osd.15
16    hdd   1.81999   1.00000  1.8 TiB  477 GiB  474 GiB   2.7 MiB  1.9 GiB  1.4 TiB  25.58  1.17   23      up      osd.16
17    hdd   1.81999   1.00000  1.8 TiB  403 GiB  400 GiB   1.7 MiB  1.7 GiB  1.4 TiB  21.63  0.99   32      up      osd.17
18    hdd   1.81999   1.00000  1.8 TiB  294 GiB  291 GiB    79 MiB  1.4 GiB  1.5 TiB  15.79  0.72   25      up      osd.18
19    hdd   1.81999   1.00000  1.8 TiB  294 GiB  291 GiB   1.6 MiB  1.4 GiB  1.5 TiB  15.76  0.72   27      up      osd.19
20    hdd   1.81999   1.00000  1.8 TiB  477 GiB  474 GiB   2.3 MiB  1.9 GiB  1.4 TiB  25.60  1.17   23      up      osd.20
21    hdd   1.81999   1.00000  1.8 TiB  473 GiB  470 GiB    23 MiB  2.0 GiB  1.4 TiB  25.36  1.16   24      up      osd.21
22    hdd   1.81999   1.00000  1.8 TiB  404 GiB  401 GiB   3.6 MiB  2.3 GiB  1.4 TiB  21.69  0.99   22      up      osd.22
23    hdd   1.81999   1.00000  1.8 TiB  258 GiB  255 GiB   2.7 MiB  1.3 GiB  1.6 TiB  13.84  0.63   18      up      osd.23
24    ssd   0.20000   1.00000  238 GiB  2.2 GiB   67 MiB   1.1 GiB  1.0 GiB  236 GiB   0.91  0.04   32      up      osd.24
                        TOTAL   40 TiB  8.8 TiB  8.8 TiB   2.3 GiB   34 GiB   32 TiB  21.81
MIN/MAX VAR: 0/1.43  STDDEV: 8.97

We checked the network connectivity between the hosts, and everything is fine there. We have not restarted the OSD services on the second node yet, because we suspect that doing so might take the entire cluster down. Please advise how we can resolve this.