Hi Fulvio,

You can check (offline) which PGs are on an OSD with the list-pgs op, e.g.

    ceph-objectstore-tool --data-path /var/lib/ceph/osd/cephpa1-158/ --op list-pgs

The EC PGs have a naming convention like 85.25s1, etc., for the various
k/m EC shards. (A rough, untested sketch that chains list-pgs with
export/import is appended at the bottom of this mail, below your quoted
message.)

-- dan

On Mon, Mar 28, 2022 at 2:29 PM Fulvio Galeazzi <fulvio.galeazzi@xxxxxxx> wrote:
>
> Hallo,
>     all of a sudden, 3 of my OSDs failed, showing similar messages in the log:
>
> .....
>     -5> 2022-03-28 14:19:02.451 7fc20fe99700 5 osd.145 pg_epoch: 616454 pg[70.2c6s1( empty local-lis/les=612106/612107 n=0 ec=148456/148456 lis/c 612106/612106 les/c/f 612107/612107/0 612106/612106/612101) [168,145,102,96,112,124,128,134,56,34]p168(0) r=1 lpr=616429 crt=0'0 unknown mbc={}] enter Started
>     -4> 2022-03-28 14:19:02.451 7fc20fe99700 5 osd.145 pg_epoch: 616454 pg[70.2c6s1( empty local-lis/les=612106/612107 n=0 ec=148456/148456 lis/c 612106/612106 les/c/f 612107/612107/0 612106/612106/612101) [168,145,102,96,112,124,128,134,56,34]p168(0) r=1 lpr=616429 crt=0'0 unknown mbc={}] enter Start
>     -3> 2022-03-28 14:19:02.451 7fc20fe99700 1 osd.145 pg_epoch: 616454 pg[70.2c6s1( empty local-lis/les=612106/612107 n=0 ec=148456/148456 lis/c 612106/612106 les/c/f 612107/612107/0 612106/612106/612101) [168,145,102,96,112,124,128,134,56,34]p168(0) r=1 lpr=616429 crt=0'0 unknown mbc={}] state<Start>: transitioning to Stray
>     -2> 2022-03-28 14:19:02.451 7fc20fe99700 5 osd.145 pg_epoch: 616454 pg[70.2c6s1( empty local-lis/les=612106/612107 n=0 ec=148456/148456 lis/c 612106/612106 les/c/f 612107/612107/0 612106/612106/612101) [168,145,102,96,112,124,128,134,56,34]p168(0) r=1 lpr=616429 crt=0'0 unknown mbc={}] exit Start 0.000008 0 0.000000
>     -1> 2022-03-28 14:19:02.451 7fc20fe99700 5 osd.145 pg_epoch: 616454 pg[70.2c6s1( empty local-lis/les=612106/612107 n=0 ec=148456/148456 lis/c 612106/612106 les/c/f 612107/612107/0 612106/612106/612101) [168,145,102,96,112,124,128,134,56,34]p168(0) r=1 lpr=616429 crt=0'0 unknown mbc={}] enter Started/Stray
>      0> 2022-03-28 14:19:02.451 7fc20f698700 -1 *** Caught signal (Aborted) **
>  in thread 7fc20f698700 thread_name:tp_osd_tp
>
>  ceph version 14.2.22 (ca74598065096e6fcbd8433c8779a2be0c889351) nautilus (stable)
>  1: (()+0x12ce0) [0x7fc2327dcce0]
>  2: (gsignal()+0x10f) [0x7fc231452a4f]
>  3: (abort()+0x127) [0x7fc231425db5]
>  4: (ceph::__ceph_abort(char const*, int, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0x1b4) [0x55b8139cb671]
>  5: (PG::check_past_interval_bounds() const+0xc16) [0x55b813b586f6]
>  6: (PG::RecoveryState::Reset::react(PG::AdvMap const&)+0x3e8) [0x55b813b963d8]
>  7: (boost::statechart::simple_state<PG::RecoveryState::Reset, PG::RecoveryState::RecoveryMachine, boost::mpl::list<mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na, mpl_::na>, (boost::statechart::history_mode)0>::react_impl(boost::statechart::event_base const&, void const*)+0x7d) [0x55b813bdd32d]
>  8: (PG::handle_advance_map(std::shared_ptr<OSDMap const>, std::shared_ptr<OSDMap const>, std::vector<int, std::allocator<int> >&, int, std::vector<int, std::allocator<int> >&, int, PG::RecoveryCtx*)+0x39d) [0x55b813b7b5fd]
>  9: (OSD::advance_pg(unsigned int, PG*, ThreadPool::TPHandle&, PG::RecoveryCtx*)+0x2e9) [0x55b813ad14e9]
>  10: (OSD::dequeue_peering_evt(OSDShard*, PG*, std::shared_ptr<PGPeeringEvent>, ThreadPool::TPHandle&)+0xaa) [0x55b813ae345a]
>  11: (PGPeeringItem::run(OSD*, OSDShard*, boost::intrusive_ptr<PG>&, ThreadPool::TPHandle&)+0x55) [0x55b813d66c15]
>  12: (OSD::ShardedOpWQ::_process(unsigned int, ceph::heartbeat_handle_d*)+0x1366) [0x55b813adff46]
>  13: (ShardedThreadPool::shardedthreadpool_worker(unsigned int)+0x5c4) [0x55b8140dc944]
>  14: (ShardedThreadPool::WorkThreadSharded::entry()+0x14) [0x55b8140df514]
>  15: (()+0x81cf) [0x7fc2327d21cf]
>  16: (clone()+0x43) [0x7fc23143dd83]
>
> Trying to "activate --all", rebooting the server, and such, did not help.
>
> I am now stuck with one PG (85.25) down; find below the output from "query".
>
> The PG belongs to a 3+2 erasure-coded pool.
> As the devices corresponding to the 3 down OSDs are properly mounted, is
> there a way to get PG.ID=85.25 from the devices and copy it elsewhere?
> Actually, I tried to find 85.25 in the 3 down OSDs with the command:
>
>   ~]# ceph-objectstore-tool --data-path /var/lib/ceph/osd/cephpa1-158/ --no-mon-config --pgid 85.25 --op export --file /tmp/pg_85-25
>   PG '85.25' not found
>
> which puzzled me... is there a way to search such a PG.ID over the
> whole cluster?
>
> Thanks for your help!
>
>                 Fulvio
>
> ====================
>
> ~]# ceph --cluster cephpa1 health detail | grep down
> .....
> PG_AVAILABILITY Reduced data availability: 1 pg inactive, 1 pg down
>     pg 85.25 is down+remapped, acting [2147483647,2147483647,96,2147483647,2147483647]
>
> ~]# ceph --cluster cephpa1 pg 85.25 query
> {
>     "state": "down+remapped",
>     "snap_trimq": "[]",
>     "snap_trimq_len": 0,
>     "epoch": 617667,
>     "up": [
>         2147483647,
>         2147483647,
>         2147483647,
>         2147483647,
>         2147483647
>     ],
>     "acting": [
>         2147483647,
>         2147483647,
>         96,
>         2147483647,
>         2147483647
>     ],
>     "info": {
>         "pgid": "85.25s2",
>         "last_update": "606021'521273",
>         "last_complete": "606021'521273",
>         "log_tail": "605873'518175",
>         "last_user_version": 521273,
>         "last_backfill": "MAX",
>         "last_backfill_bitwise": 1,
>         "purged_snaps": [],
>         "history": {
>             "epoch_created": 478130,
>             "epoch_pool_created": 478130,
>             "last_epoch_started": 611605,
>             "last_interval_started": 611604,
>             "last_epoch_clean": 595152,
>             "last_interval_clean": 595151,
>             "last_epoch_split": 0,
>             "last_epoch_marked_full": 0,
>             "same_up_since": 617661,
>             "same_interval_since": 617661,
>             "same_primary_since": 616519,
>             "last_scrub": "597862'517979",
>             "last_scrub_stamp": "2022-03-15 05:08:42.561720",
>             "last_deep_scrub": "597454'511650",
>             "last_deep_scrub_stamp": "2022-03-09 18:05:34.205121",
>             "last_clean_scrub_stamp": "2022-03-15 05:08:42.561720"
>         },
>         "stats": {
>             "version": "606021'521273",
>             "reported_seq": "1733225",
>             "reported_epoch": "617667",
>             "state": "down+remapped",
>             "last_fresh": "2022-03-28 14:17:48.920542",
>             "last_change": "2022-03-28 14:15:42.025193",
>             "last_active": "2022-03-25 15:24:42.557667",
>             "last_peered": "2022-03-25 15:24:25.912029",
>             "last_clean": "2022-03-15 01:50:02.979366",
>             "last_became_active": "2022-03-25 12:44:52.379711",
>             "last_became_peered": "2022-03-25 12:44:52.379711",
>             "last_unstale": "2022-03-28 14:17:48.920542",
>             "last_undegraded": "2022-03-28 14:17:48.920542",
>             "last_fullsized": "2022-03-28 14:17:48.920542",
>             "mapping_epoch": 617661,
>             "log_start": "605873'518175",
>             "ondisk_log_start": "605873'518175",
>             "created": 478130,
>             "last_epoch_clean": 595152,
>             "parent": "0.0",
>             "parent_split_bits": 0,
>             "last_scrub": "597862'517979",
>             "last_scrub_stamp": "2022-03-15 05:08:42.561720",
>             "last_deep_scrub": "597454'511650",
>             "last_deep_scrub_stamp": "2022-03-09 18:05:34.205121",
>             "last_clean_scrub_stamp": "2022-03-15 05:08:42.561720",
>             "log_size": 3098,
>             "ondisk_log_size": 3098,
>             "stats_invalid": false,
>             "dirty_stats_invalid": false,
>             "omap_stats_invalid": false,
>             "hitset_stats_invalid": false,
>             "hitset_bytes_stats_invalid": false,
>             "pin_stats_invalid": false,
>             "manifest_stats_invalid": false,
>             "snaptrimq_len": 0,
>             "stat_sum": {
>                 "num_bytes": 165108592640,
>                 "num_objects": 39402,
>                 "num_object_clones": 0,
>                 "num_object_copies": 197010,
>                 "num_objects_missing_on_primary": 0,
>                 "num_objects_missing": 0,
>                 "num_objects_degraded": 0,
>                 "num_objects_misplaced": 0,
>                 "num_objects_unfound": 0,
>                 "num_objects_dirty": 39402,
>                 "num_whiteouts": 0,
>                 "num_read": 338705,
>                 "num_read_kb": 124297874,
>                 "num_write": 316971,
>                 "num_write_kb": 311968402,
>                 "num_scrub_errors": 0,
>                 "num_shallow_scrub_errors": 0,
>                 "num_deep_scrub_errors": 0,
>                 "num_objects_recovered": 196778,
>                 "num_bytes_recovered": 824577241088,
>                 "num_keys_recovered": 0,
>                 "num_objects_omap": 0,
>                 "num_objects_hit_set_archive": 0,
>                 "num_bytes_hit_set_archive": 0,
>                 "num_flush": 0,
>                 "num_flush_kb": 0,
>                 "num_evict": 0,
>                 "num_evict_kb": 0,
>                 "num_promote": 0,
>                 "num_flush_mode_high": 0,
>                 "num_flush_mode_low": 0,
>                 "num_evict_mode_some": 0,
>                 "num_evict_mode_full": 0,
>                 "num_objects_pinned": 0,
>                 "num_legacy_snapsets": 0,
>                 "num_large_omap_objects": 0,
>                 "num_objects_manifest": 0,
>                 "num_omap_bytes": 0,
>                 "num_omap_keys": 0,
>                 "num_objects_repaired": 0
>             },
>             "up": [
>                 2147483647,
>                 2147483647,
>                 2147483647,
>                 2147483647,
>                 2147483647
>             ],
>             "acting": [
>                 2147483647,
>                 2147483647,
>                 96,
>                 2147483647,
>                 2147483647
>             ],
>             "avail_no_missing": [],
>             "object_location_counts": [],
>             "blocked_by": [
>                 121
>             ],
>             "up_primary": -1,
>             "acting_primary": 96,
>             "purged_snaps": []
>         },
>         "empty": 0,
>         "dne": 0,
>         "incomplete": 0,
>         "last_epoch_started": 616455,
>         "hit_set_history": {
>             "current_last_update": "0'0",
>             "history": []
>         }
>     },
>     "peer_info": [],
>     "recovery_state": [
>         {
>             "name": "Started/Primary/Peering/Down",
>             "enter_time": "2022-03-28 14:15:42.025185",
>             "comment": "not enough up instances of this PG to go active"
>         },
>         {
>             "name": "Started/Primary/Peering",
>             "enter_time": "2022-03-28 14:15:42.025128",
>             "past_intervals": [
>                 {
>                     "first": "595151",
>                     "last": "617660",
>                     "all_participants": [
>                         {
>                             "osd": 77,
>                             "shard": 0
>                         },
>                         {
>                             "osd": 82,
>                             "shard": 2
>                         },
>                         {
>                             "osd": 90,
>                             "shard": 0
>                         },
>                         {
>                             "osd": 91,
>                             "shard": 4
>                         },
>                         {
>                             "osd": 96,
>                             "shard": 2
>                         },
>                         {
>                             "osd": 121,
>                             "shard": 3
>                         },
>                         {
>                             "osd": 140,
>                             "shard": 1
>                         },
>                         {
>                             "osd": 159,
>                             "shard": 4
>                         }
>                     ],
>                     "intervals": [
>                         {
>                             "first": "610237",
>                             "last": "611219",
>                             "acting": "96(2),121(3),159(4)"
>                         },
>                         {
>                             "first": "611604",
>                             "last": "616452",
>                             "acting": "121(3),140(1),159(4)"
>                         },
>                         {
>                             "first": "616461",
>                             "last": "616518",
>                             "acting": "96(2),140(1),159(4)"
>                         }
>                     ]
>                 }
>             ],
>             "probing_osds": [
>                 "77(0)",
>                 "82(2)",
>                 "90(0)",
>                 "91(4)",
>                 "96(2)",
>                 "140(1)",
>                 "159(4)"
>             ],
>             "blocked": "peering is blocked due to down osds",
>             "down_osds_we_would_probe": [
>                 121
>             ],
>             "peering_blocked_by": [
>                 {
>                     "osd": 121,
>                     "current_lost_at": 0,
>                     "comment": "starting or marking this osd lost may let us proceed"
>                 }
>             ]
>         },
>         {
>             "name": "Started",
>             "enter_time": "2022-03-28 14:15:42.025068"
>         }
>     ],
>     "agent_state": {}
> }
>
> --
> Fulvio Galeazzi
> GARR-CSD Department
> skype: fgaleazzi70
> tel.: +39-334-6533-250
>
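Here is the sketch I mentioned above. It is untested, and the OSD ids and
paths are only placeholders picked from this thread (osd.121 from the pg
query, osd.145 from the crash log, osd.158 from your export attempt):
substitute your three failed OSDs and run it on whichever host holds each
of them.

    # With the failed OSD daemons stopped, ask each data path which shard
    # of 85.25 it holds (EC shards show up as 85.25s0, 85.25s1, ...).
    for id in 121 145 158; do
        echo "== osd.${id} =="
        ceph-objectstore-tool --data-path /var/lib/ceph/osd/cephpa1-${id} \
            --no-mon-config --op list-pgs | grep -E '^85\.25(s[0-9]+)?$'
    done

On an EC pool the on-disk pgid carries the sN suffix, which would explain
the "PG '85.25' not found" you got, though it may also simply be that
osd.158 holds no shard of this PG at all (it does not appear in the
all_participants list of your query output). The shard peering is waiting
for is the one osd.121 held (shard 3, per the query).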
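Once list-pgs shows you the shard name, copying it elsewhere would look
roughly like the commands below. Again, this is only a sketch I have not
run against your cluster: 85.25s3 is a guess based on osd.121 holding
shard 3 in the query output, and osd.42 is a made-up destination. Keep
the export file around and double-check each step before restarting
anything.

    # Export the shard from the down OSD's data path, using the full shard name.
    ceph-objectstore-tool --data-path /var/lib/ceph/osd/cephpa1-121 \
        --no-mon-config --pgid 85.25s3 --op export --file /tmp/pg_85.25s3

    # With the destination OSD stopped, import the shard into its data path,
    # then start that OSD again and let peering/backfill take over.
    ceph-objectstore-tool --data-path /var/lib/ceph/osd/cephpa1-42 \
        --no-mon-config --op import --file /tmp/pg_85.25s3

The usual advice I have seen is to import into an OSD that does not
already hold part of this PG, and to let peering decide what to do with
the recovered shard once that OSD is back up.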