I cut out a HUGE list of "purged_snaps" to keep this a little shorter... $ cat 1.10e.txt { "state": "incomplete", "snap_trimq": "[]", "snap_trimq_len": 0, "epoch": 465904, "up": [ 52, 23, 20 ], "acting": [ 52, 23, 20 ], "info": { "pgid": "1.10e", "last_update": "438490'293946", "last_complete": "438490'293946", "log_tail": "427182'292446", "last_user_version": 0, "last_backfill": "MIN", "last_backfill_bitwise": 1, "purged_snaps": [ { "start": "2", "length": "12cd" }, { "start": "12d0", "length": "1fca" }, ... lots of snaps ... ], "history": { "epoch_created": 22654, "epoch_pool_created": 22654, "last_epoch_started": 447973, "last_interval_started": 447972, "last_epoch_clean": 438832, "last_interval_clean": 438831, "last_epoch_split": 0, "last_epoch_marked_full": 0, "same_up_since": 465900, "same_interval_since": 465901, "same_primary_since": 465901, "last_scrub": "438490'293946", "last_scrub_stamp": "2018-06-12 00:10:55.825562", "last_deep_scrub": "427203'293886", "last_deep_scrub_stamp": "2018-06-07 01:46:27.403211", "last_clean_scrub_stamp": "2018-06-12 00:10:55.825562" }, "stats": { "version": "438490'293946", "reported_seq": "69672", "reported_epoch": "465904", "state": "incomplete", "last_fresh": "2018-06-14 11:51:52.770692", "last_change": "2018-06-14 11:51:52.770692", "last_active": "0.000000", "last_peered": "0.000000", "last_clean": "0.000000", "last_became_active": "0.000000", "last_became_peered": "0.000000", "last_unstale": "2018-06-14 11:51:52.770692", "last_undegraded": "2018-06-14 11:51:52.770692", "last_fullsized": "2018-06-14 11:51:52.770692", "mapping_epoch": 465901, "log_start": "427182'292446", "ondisk_log_start": "427182'292446", "created": 22654, "last_epoch_clean": 438832, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "438490'293946", "last_scrub_stamp": "2018-06-12 00:10:55.825562", "last_deep_scrub": "427203'293886", "last_deep_scrub_stamp": "2018-06-07 01:46:27.403211", "last_clean_scrub_stamp": "2018-06-12 00:10:55.825562", 
"log_size": 1500, "ondisk_log_size": 1500, "stats_invalid": false, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "snaptrimq_len": 0, "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0 }, "up": [ 52, 23, 20 ], "acting": [ 52, 23, 20 ], "blocked_by": [ 65, 100, 101, 107 ], "up_primary": 52, "acting_primary": 52 }, "empty": 0, "dne": 0, "incomplete": 1, "last_epoch_started": 447973, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, "peer_info": [ { "peer": "5", "pgid": "1.10e", "last_update": "438490'293946", "last_complete": "438490'293946", "log_tail": "427182'292446", "last_user_version": 0, "last_backfill": "MIN", "last_backfill_bitwise": 1, "purged_snaps": [ { "start": "2", "length": "12cd" }, { "start": "12d0", "length": "1fca" }, ... lots of snaps ... 
], "history": { "epoch_created": 22654, "epoch_pool_created": 22654, "last_epoch_started": 447973, "last_interval_started": 447972, "last_epoch_clean": 438832, "last_interval_clean": 438831, "last_epoch_split": 0, "last_epoch_marked_full": 0, "same_up_since": 465900, "same_interval_since": 465901, "same_primary_since": 465901, "last_scrub": "438490'293946", "last_scrub_stamp": "2018-06-12 00:10:55.825562", "last_deep_scrub": "427203'293886", "last_deep_scrub_stamp": "2018-06-07 01:46:27.403211", "last_clean_scrub_stamp": "2018-06-12 00:10:55.825562" }, "stats": { "version": "438490'293946", "reported_seq": "58224", "reported_epoch": "460636", "state": "peering", "last_fresh": "2018-06-14 09:07:43.914677", "last_change": "2018-06-14 09:07:43.580029", "last_active": "0.000000", "last_peered": "0.000000", "last_clean": "0.000000", "last_became_active": "0.000000", "last_became_peered": "0.000000", "last_unstale": "2018-06-14 09:07:43.914677", "last_undegraded": "2018-06-14 09:07:43.914677", "last_fullsized": "2018-06-14 09:07:43.914677", "mapping_epoch": 465901, "log_start": "427182'292446", "ondisk_log_start": "427182'292446", "created": 22654, "last_epoch_clean": 438832, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "438490'293946", "last_scrub_stamp": "2018-06-12 00:10:55.825562", "last_deep_scrub": "427203'293886", "last_deep_scrub_stamp": "2018-06-07 01:46:27.403211", "last_clean_scrub_stamp": "2018-06-12 00:10:55.825562", "log_size": 1500, "ondisk_log_size": 1500, "stats_invalid": false, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "snaptrimq_len": 0, "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, 
"num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0 }, "up": [ 52, 23, 20 ], "acting": [ 52, 23, 20 ], "blocked_by": [ 20, 23, 30, 107 ], "up_primary": 52, "acting_primary": 52 }, "empty": 0, "dne": 0, "incomplete": 1, "last_epoch_started": 447973, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, { "peer": "10", "pgid": "1.10e", "last_update": "438490'293946", "last_complete": "438490'293946", "log_tail": "427182'292446", "last_user_version": 0, "last_backfill": "MIN", "last_backfill_bitwise": 1, "purged_snaps": [ { "start": "2", "length": "12cd" }, { "start": "12d0", "length": "1fca" }, ... lots of snaps ... 
], "history": { "epoch_created": 22654, "epoch_pool_created": 22654, "last_epoch_started": 447973, "last_interval_started": 447972, "last_epoch_clean": 438832, "last_interval_clean": 438831, "last_epoch_split": 0, "last_epoch_marked_full": 0, "same_up_since": 465900, "same_interval_since": 465901, "same_primary_since": 465901, "last_scrub": "438490'293946", "last_scrub_stamp": "2018-06-12 00:10:55.825562", "last_deep_scrub": "427203'293886", "last_deep_scrub_stamp": "2018-06-07 01:46:27.403211", "last_clean_scrub_stamp": "2018-06-12 00:10:55.825562" }, "stats": { "version": "0'0", "reported_seq": "0", "reported_epoch": "0", "state": "unknown", "last_fresh": "0.000000", "last_change": "0.000000", "last_active": "0.000000", "last_peered": "0.000000", "last_clean": "0.000000", "last_became_active": "0.000000", "last_became_peered": "0.000000", "last_unstale": "0.000000", "last_undegraded": "0.000000", "last_fullsized": "0.000000", "mapping_epoch": 465901, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 0, "last_epoch_clean": 0, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "0.000000", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "0.000000", "last_clean_scrub_stamp": "0.000000", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": false, "dirty_stats_invalid": false, "omap_stats_invalid": false, "hitset_stats_invalid": false, "hitset_bytes_stats_invalid": false, "pin_stats_invalid": false, "snaptrimq_len": 0, "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_missing": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, 
"num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0, "num_objects_pinned": 0, "num_legacy_snapsets": 0 }, "up": [ 52, 23, 20 ], "acting": [ 52, 23, 20 ], "blocked_by": [], "up_primary": 52, "acting_primary": 52 }, "empty": 0, "dne": 0, "incomplete": 1, "last_epoch_started": 447587, "hit_set_history": { "current_last_update": "0'0", "history": [] } } ], "recovery_state": [ { "name": "Started/Primary/Peering/Incomplete", "enter_time": "2018-06-14 11:51:52.770682", "comment": "not enough complete instances of this PG" }, { "name": "Started/Primary/Peering", "enter_time": "2018-06-14 11:51:52.745649", "past_intervals": [ { "first": "438831", "last": "465900", "all_participants": [ { "osd": 20 }, { "osd": 23 }, { "osd": 30 }, { "osd": 52 }, { "osd": 65 }, { "osd": 100 }, { "osd": 101 }, { "osd": 107 } ], "intervals": [ { "first": "447972", "last": "447978", "acting": "100,101" }, { "first": "455481", "last": "455502", "acting": "52,107" }, { "first": "455748", "last": "455905", "acting": "23,107" }, { "first": "462150", "last": "462151", "acting": "20" }, { "first": "465118", "last": "465119", "acting": "23,52" }, { "first": "465815", "last": "465816", "acting": "20,23,52" } ] } ], "probing_osds": [ "20", "23", "30", "52" ], "down_osds_we_would_probe": [ 65, 100, 101, 107 ], "peering_blocked_by": [], "peering_blocked_by_detail": [ { "detail": "peering_blocked_by_history_les_bound" } ] }, { "name": "Started", "enter_time": "2018-06-14 11:51:52.745611" } ], "agent_state": {} } On Thu, Jun 14, 2018 at 11:53 AM, Sage Weil <sage@xxxxxxxxxxxx> wrote: > On Thu, 14 Jun 2018, Wyllys Ingersoll wrote: >> Yes, I did have the ignore_history_les_option set for 2 of the running >> osds, but I disabled and restarted the affected osds and this is where >> it ends 
up: >> >> "probing_osds": [ >> "20", >> "23", >> "30", >> "52" >> ], >> "down_osds_we_would_probe": [ >> 65, >> 100, >> 101, >> 107 >> ], >> "peering_blocked_by": [], >> "peering_blocked_by_detail": [ >> { >> "detail": "peering_blocked_by_history_les_bound" >> } >> ] >> >> >> The 'down_osds_we_would_probe' are all non-existent. This is where I >> started the day, still can't get past it. And this is seen on all of >> the incomplete pgs; this is just one example. > > Post the full query from this state? > > sage -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html