All the problem PGs are on osd.39. When I stop osd.39, it shows that 86
PGs would be offline. However, no recovery takes place; the PGs just
stay in this state: 86 undersized+remapped+peered
I managed to pin down all the placement groups that are in this state by using:
ceph pg dump | grep active+clean+remapped
From there I queried the first pg on the list:
NodeC:~# ceph pg 28.42 query
{
"snap_trimq": "[]",
"snap_trimq_len": 0,
"state": "active+clean+remapped",
"epoch": 490547,
"up": [],
"acting": [
39,
1
],
"acting_recovery_backfill": [
"1",
"39"
],
"info": {
"pgid": "28.42",
"last_update": "489784'67",
"last_complete": "489784'67",
"log_tail": "0'0",
"last_user_version": 67,
"last_backfill": "MAX",
"purged_snaps": [],
"history": {
"epoch_created": 487438,
"epoch_pool_created": 487438,
"last_epoch_started": 490544,
"last_interval_started": 490543,
"last_epoch_clean": 490544,
"last_interval_clean": 490543,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 489999,
"same_interval_since": 490543,
"same_primary_since": 490543,
"last_scrub": "489784'67",
"last_scrub_stamp": "2024-11-15T09:16:59.715671+0200",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2024-11-12T16:55:34.967587+0200",
"last_clean_scrub_stamp": "2024-11-15T09:16:59.715671+0200",
"prior_readable_until_ub": 0
},
"stats": {
"version": "489784'67",
"reported_seq": 721,
"reported_epoch": 490547,
"state": "active+clean+remapped",
"last_fresh": "2024-11-15T16:07:08.111992+0200",
"last_change": "2024-11-15T15:01:12.081136+0200",
"last_active": "2024-11-15T16:07:08.111992+0200",
"last_peered": "2024-11-15T16:07:08.111992+0200",
"last_clean": "2024-11-15T16:07:08.111992+0200",
"last_became_active": "2024-11-15T15:01:12.080691+0200",
"last_became_peered": "2024-11-15T15:01:12.080691+0200",
"last_unstale": "2024-11-15T16:07:08.111992+0200",
"last_undegraded": "2024-11-15T16:07:08.111992+0200",
"last_fullsized": "2024-11-15T16:07:08.111992+0200",
"mapping_epoch": 490543,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 487438,
"last_epoch_clean": 490544,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "489784'67",
"last_scrub_stamp": "2024-11-15T09:16:59.715671+0200",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2024-11-12T16:55:34.967587+0200",
"last_clean_scrub_stamp": "2024-11-15T09:16:59.715671+0200",
"objects_scrubbed": 0,
"log_size": 67,
"ondisk_log_size": 67,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"last_scrub_duration": 1,
"scrub_schedule": "periodic scrub scheduled @ 2024-11-16T17:19:20.646231+0000",
"scrub_duration": 0.024065349,
"objects_trimmed": 0,
"snaptrim_duration": 0.090716250999999998,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 68,
"num_read_kb": 708,
"num_write": 67,
"num_write_kb": 19088,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 12,
"num_bytes_recovered": 26607616,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0,
"num_omap_bytes": 0,
"num_omap_keys": 0,
"num_objects_repaired": 0
},
"up": [],
"acting": [
39,
1
],
"avail_no_missing": [
"39",
"1"
],
"object_location_counts": [],
"blocked_by": [],
"up_primary": -1,
"acting_primary": 39,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 490544,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"peer_info": [
{
"peer": "1",
"pgid": "28.42",
"last_update": "489784'67",
"last_complete": "489784'67",
"log_tail": "0'0",
"last_user_version": 67,
"last_backfill": "MAX",
"purged_snaps": [],
"history": {
"epoch_created": 487438,
"epoch_pool_created": 487438,
"last_epoch_started": 490544,
"last_interval_started": 490543,
"last_epoch_clean": 490544,
"last_interval_clean": 490543,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 489999,
"same_interval_since": 490543,
"same_primary_since": 490543,
"last_scrub": "489784'67",
"last_scrub_stamp": "2024-11-15T09:16:59.715671+0200",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2024-11-12T16:55:34.967587+0200",
"last_clean_scrub_stamp":
"2024-11-15T09:16:59.715671+0200",
"prior_readable_until_ub": 0
},
"stats": {
"version": "489784'67",
"reported_seq": 430,
"reported_epoch": 490542,
"state": "undersized+remapped+peered",
"last_fresh": "2024-11-15T15:01:10.046703+0200",
"last_change": "2024-11-15T15:00:37.072082+0200",
"last_active": "2024-11-14T15:41:06.074260+0200",
"last_peered": "2024-11-15T15:01:10.046703+0200",
"last_clean": "2024-11-14T15:41:03.945754+0200",
"last_became_active": "2024-11-14T15:39:15.997707+0200",
"last_became_peered": "2024-11-15T15:00:37.072082+0200",
"last_unstale": "2024-11-15T15:01:10.046703+0200",
"last_undegraded": "2024-11-15T15:01:10.046703+0200",
"last_fullsized": "2024-11-15T15:00:37.069250+0200",
"mapping_epoch": 490543,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 487438,
"last_epoch_clean": 490528,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "489784'67",
"last_scrub_stamp": "2024-11-15T09:16:59.715671+0200",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2024-11-12T16:55:34.967587+0200",
"last_clean_scrub_stamp":
"2024-11-15T09:16:59.715671+0200",
"objects_scrubbed": 0,
"log_size": 67,
"ondisk_log_size": 67,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"last_scrub_duration": 0,
"scrub_schedule": "periodic scrub scheduled @ 2024-11-16T12:15:01.587848+0000",
"scrub_duration": 0,
"objects_trimmed": 0,
"snaptrim_duration": 0.090716250999999998,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 68,
"num_read_kb": 708,
"num_write": 67,
"num_write_kb": 19088,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 7,
"num_bytes_recovered": 26607616,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0,
"num_omap_bytes": 0,
"num_omap_keys": 0,
"num_objects_repaired": 0
},
"up": [],
"acting": [
39,
1
],
"avail_no_missing": [
"1"
],
"object_location_counts": [],
"blocked_by": [],
"up_primary": -1,
"acting_primary": 39,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 490544,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [
{
"name": "Started/Primary/Active",
"enter_time": "2024-11-15T15:01:12.073557+0200",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "MIN",
"backfill_info": {
"begin": "MIN",
"end": "MIN",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
}
},
{
"name": "Started",
"enter_time": "2024-11-15T15:01:11.069697+0200"
}
],
"scrubber": {
"active": false,
"must_scrub": false,
"must_deep_scrub": false,
"must_repair": false,
"need_auto": false,
"scrub_reg_stamp": "2024-11-16T19:19:20.646231+0200",
"schedule": "scrub scheduled @ 2024-11-16T17:19:20.646231+0000"
},
"agent_state": {}
}
Can anyone see from this output why this PG (and the other affected
PGs) are stuck on this OSD?
On 2024/11/15 13:36, Roland Giesler wrote:
On 2024/11/15 13:00, Gregory Orange wrote:
On 15/11/24 17:11, Roland Giesler wrote:
How do I determine the primary osd?
ceph pg map $pg
ceph pg $pg query | jq .info.stats.acting_primary
You can jq and less to take a look at other values which might be
informative too.
Ah, of course :-) Sorry, I was looking for the primary osd of a pool!
No wonder I couldn't find anything. LOL!
Greg.
_______________________________________________
ceph-users mailing list -- ceph-users@xxxxxxx
To unsubscribe send an email to ceph-users-leave@xxxxxxx
_______________________________________________
ceph-users mailing list -- ceph-users@xxxxxxx
To unsubscribe send an email to ceph-users-leave@xxxxxxx