Hello,
I'm working with a small Ceph cluster (about 10 TB, 7-9 OSDs, all BlueStore on
LVM) and recently ran into a problem with 17 pgs marked as incomplete after
adding/removing OSDs.
Here's the sequence of events:
1. 7 osds in the cluster, health is OK, all pgs are active+clean
2. 3 new osds on a new host are added, lots of backfilling in progress
3. osd 6 needs to be removed, so we do "ceph osd crush reweight osd.6 0"
4. after a few hours we see "min osd.6 with 0 pgs" from "ceph osd utilization"
5. ceph osd out 6
6. systemctl stop ceph-osd@6
7. the drive backing osd 6 is pulled and wiped
8. backfilling has now finished; all pgs are active+clean except for 17
incomplete pgs
From reading the docs, it sounds like there has been unrecoverable data loss
in those 17 pgs. That raises some questions for me:
Was "ceph osd utilization" only showing the target allocation of 0 pgs rather
than the actual current allocation?
Why is there data loss from a single osd being removed? Shouldn't that be
recoverable?
All pools in the cluster are either replicated 3 or erasure-coded k=2,m=1 with
default "host" failure domain. They shouldn't suffer data loss with a single
osd being removed even if there were no reweighting beforehand. Does the
backfilling temporarily reduce data durability in some way?
Is there a way to see which pgs actually have data on a given osd?
I have included the state of one of the incomplete pgs (15.59) below as an example.
Thanks for any help,
Kyle
{
"state": "incomplete",
"snap_trimq": "[]",
"snap_trimq_len": 0,
"epoch": 2087,
"up": [
4,
3,
8
],
"acting": [
4,
3,
8
],
"info": {
"pgid": "15.59s0",
"last_update": "753'7465",
"last_complete": "753'7465",
"log_tail": "663'4401",
"last_user_version": 6947,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [],
"history": {
"epoch_created": 603,
"epoch_pool_created": 603,
"last_epoch_started": 1581,
"last_interval_started": 1580,
"last_epoch_clean": 945,
"last_interval_clean": 944,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 2082,
"same_interval_since": 2082,
"same_primary_since": 2076,
"last_scrub": "753'7465",
"last_scrub_stamp": "2019-07-02 13:40:58.935208",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2019-06-27 17:42:04.685790",
"last_clean_scrub_stamp": "2019-07-02 13:40:58.935208"
},
"stats": {
"version": "753'7465",
"reported_seq": "12691",
"reported_epoch": "2087",
"state": "incomplete",
"last_fresh": "2019-07-04 14:30:47.930190",
"last_change": "2019-07-04 14:30:47.930190",
"last_active": "2019-07-03 13:04:00.967354",
"last_peered": "2019-07-03 13:02:40.242867",
"last_clean": "2019-07-02 23:04:26.601070",
"last_became_active": "2019-07-03 08:35:12.459857",
"last_became_peered": "2019-07-03 08:35:12.459857",
"last_unstale": "2019-07-04 14:30:47.930190",
"last_undegraded": "2019-07-04 14:30:47.930190",
"last_fullsized": "2019-07-04 14:30:47.930190",
"mapping_epoch": 2082,
"log_start": "663'4401",
"ondisk_log_start": "663'4401",
"created": 603,
"last_epoch_clean": 945,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "753'7465",
"last_scrub_stamp": "2019-07-02 13:40:58.935208",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2019-06-27 17:42:04.685790",
"last_clean_scrub_stamp": "2019-07-02 13:40:58.935208",
"log_size": 3064,
"ondisk_log_size": 3064,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 12872933376,
"num_objects": 3094,
"num_object_clones": 0,
"num_object_copies": 9282,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 3094,
"num_whiteouts": 0,
"num_read": 896,
"num_read_kb": 3708,
"num_write": 5870,
"num_write_kb": 12567180,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 284,
"num_bytes_recovered": 1178091520,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [
4,
3,
8
],
"acting": [
4,
3,
8
],
"blocked_by": [
6
],
"up_primary": 4,
"acting_primary": 4,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 1581,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"peer_info": [
{
"peer": "0(0)",
"pgid": "15.59s0",
"last_update": "0'0",
"last_complete": "0'0",
"log_tail": "0'0",
"last_user_version": 0,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [],
"history": {
"epoch_created": 0,
"epoch_pool_created": 0,
"last_epoch_started": 0,
"last_interval_started": 0,
"last_epoch_clean": 0,
"last_interval_clean": 0,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 0,
"same_interval_since": 0,
"same_primary_since": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "0.000000",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "0.000000",
"last_clean_scrub_stamp": "0.000000"
},
"stats": {
"version": "0'0",
"reported_seq": "0",
"reported_epoch": "0",
"state": "unknown",
"last_fresh": "0.000000",
"last_change": "0.000000",
"last_active": "0.000000",
"last_peered": "0.000000",
"last_clean": "0.000000",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "0.000000",
"last_undegraded": "0.000000",
"last_fullsized": "0.000000",
"mapping_epoch": 0,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 0,
"last_epoch_clean": 0,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "0.000000",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "0.000000",
"last_clean_scrub_stamp": "0.000000",
"log_size": 0,
"ondisk_log_size": 0,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [],
"acting": [],
"blocked_by": [],
"up_primary": -1,
"acting_primary": -1,
"purged_snaps": []
},
"empty": 1,
"dne": 1,
"incomplete": 0,
"last_epoch_started": 0,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
{
"peer": "1(2)",
"pgid": "15.59s2",
"last_update": "753'7465",
"last_complete": "753'7465",
"log_tail": "663'4401",
"last_user_version": 6947,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [],
"history": {
"epoch_created": 603,
"epoch_pool_created": 603,
"last_epoch_started": 1581,
"last_interval_started": 1580,
"last_epoch_clean": 945,
"last_interval_clean": 944,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 2082,
"same_interval_since": 2082,
"same_primary_since": 2076,
"last_scrub": "753'7465",
"last_scrub_stamp": "2019-07-02 13:40:58.935208",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2019-06-27 17:42:04.685790",
"last_clean_scrub_stamp": "2019-07-02 13:40:58.935208"
},
"stats": {
"version": "753'7465",
"reported_seq": "7690",
"reported_epoch": "1748",
"state": "remapped",
"last_fresh": "2019-07-03 17:45:14.373257",
"last_change": "2019-07-03 17:45:14.373257",
"last_active": "2019-07-03 17:13:49.853331",
"last_peered": "2019-07-01 00:22:07.347288",
"last_clean": "2019-07-01 00:22:07.347288",
"last_became_active": "2019-06-30 16:40:22.839397",
"last_became_peered": "2019-06-30 16:40:22.839397",
"last_unstale": "2019-07-03 17:45:14.373257",
"last_undegraded": "2019-07-03 17:45:14.373257",
"last_fullsized": "2019-07-03 17:45:14.373257",
"mapping_epoch": 2082,
"log_start": "663'4401",
"ondisk_log_start": "663'4401",
"created": 603,
"last_epoch_clean": 945,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "753'7465",
"last_scrub_stamp": "2019-07-02 13:40:58.935208",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2019-06-27 17:42:04.685790",
"last_clean_scrub_stamp": "2019-07-02 13:40:58.935208",
"log_size": 3064,
"ondisk_log_size": 3064,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 12872933376,
"num_objects": 3094,
"num_object_clones": 0,
"num_object_copies": 9282,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 3094,
"num_whiteouts": 0,
"num_read": 501,
"num_read_kb": 2004,
"num_write": 5870,
"num_write_kb": 12567180,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [
4,
3,
8
],
"acting": [
4,
3,
8
],
"blocked_by": [],
"up_primary": 4,
"acting_primary": 4,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 1581,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
{
"peer": "3(1)",
"pgid": "15.59s1",
"last_update": "753'7465",
"last_complete": "753'7465",
"log_tail": "663'4465",
"last_user_version": 0,
"last_backfill": "MIN",
"last_backfill_bitwise": 1,
"purged_snaps": [],
"history": {
"epoch_created": 603,
"epoch_pool_created": 603,
"last_epoch_started": 1581,
"last_interval_started": 1580,
"last_epoch_clean": 945,
"last_interval_clean": 944,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 2082,
"same_interval_since": 2082,
"same_primary_since": 2076,
"last_scrub": "753'7465",
"last_scrub_stamp": "2019-07-02 13:40:58.935208",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2019-06-27 17:42:04.685790",
"last_clean_scrub_stamp": "2019-07-02 13:40:58.935208"
},
"stats": {
"version": "753'7465",
"reported_seq": "20",
"reported_epoch": "2075",
"state": "down",
"last_fresh": "2019-07-04 14:17:58.029917",
"last_change": "2019-07-04 14:17:58.029917",
"last_active": "0.000000",
"last_peered": "0.000000",
"last_clean": "0.000000",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2019-07-04 14:17:58.029917",
"last_undegraded": "2019-07-04 14:17:58.029917",
"last_fullsized": "2019-07-04 14:17:58.029917",
"mapping_epoch": 2082,
"log_start": "663'4465",
"ondisk_log_start": "663'4465",
"created": 603,
"last_epoch_clean": 945,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "753'7465",
"last_scrub_stamp": "2019-07-02 13:40:58.935208",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2019-06-27 17:42:04.685790",
"last_clean_scrub_stamp": "2019-07-02 13:40:58.935208",
"log_size": 3000,
"ondisk_log_size": 3000,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [
4,
3,
8
],
"acting": [
4,
3,
8
],
"blocked_by": [
0,
4,
6
],
"up_primary": 4,
"acting_primary": 4,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 1,
"last_epoch_started": 1274,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
{
"peer": "8(2)",
"pgid": "15.59s2",
"last_update": "753'7465",
"last_complete": "753'7465",
"log_tail": "663'4465",
"last_user_version": 0,
"last_backfill": "15:9a2e15fe:::rbd_data.12.170ea6b8b4567.0000000000038d14:head",
"last_backfill_bitwise": 1,
"purged_snaps": [],
"history": {
"epoch_created": 603,
"epoch_pool_created": 603,
"last_epoch_started": 1581,
"last_interval_started": 1580,
"last_epoch_clean": 945,
"last_interval_clean": 944,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 2082,
"same_interval_since": 2082,
"same_primary_since": 2076,
"last_scrub": "753'7465",
"last_scrub_stamp": "2019-07-02 13:40:58.935208",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2019-06-27 17:42:04.685790",
"last_clean_scrub_stamp": "2019-07-02 13:40:58.935208"
},
"stats": {
"version": "753'7465",
"reported_seq": "1",
"reported_epoch": "1750",
"state": "unknown",
"last_fresh": "2019-07-03 17:45:51.249980",
"last_change": "0.000000",
"last_active": "0.000000",
"last_peered": "0.000000",
"last_clean": "0.000000",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2019-07-03 17:45:51.249980",
"last_undegraded": "2019-07-03 17:45:51.249980",
"last_fullsized": "2019-07-03 17:45:51.249980",
"mapping_epoch": 2082,
"log_start": "663'4465",
"ondisk_log_start": "663'4465",
"created": 603,
"last_epoch_clean": 945,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "753'7465",
"last_scrub_stamp": "2019-07-02 13:40:58.935208",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2019-06-27 17:42:04.685790",
"last_clean_scrub_stamp": "2019-07-02 13:40:58.935208",
"log_size": 3000,
"ondisk_log_size": 3000,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 1173897216,
"num_objects": 283,
"num_object_clones": 0,
"num_object_copies": 849,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 2812,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 283,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [
4,
3,
8
],
"acting": [
4,
3,
8
],
"blocked_by": [],
"up_primary": 4,
"acting_primary": 4,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 1,
"last_epoch_started": 1233,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [
{
"name": "Started/Primary/Peering/Incomplete",
"enter_time": "2019-07-04 14:30:47.930184",
"comment": "not enough complete instances of this PG"
},
{
"name": "Started/Primary/Peering",
"enter_time": "2019-07-04 14:30:47.911177",
"past_intervals": [
{
"first": "944",
"last": "2081",
"all_participants": [
{
"osd": 0,
"shard": 0
},
{
"osd": 1,
"shard": 2
},
{
"osd": 3,
"shard": 1
},
{
"osd": 4,
"shard": 0
},
{
"osd": 6,
"shard": 1
},
{
"osd": 8,
"shard": 2
}
],
"intervals": [
{
"first": "1580",
"last": "1664",
"acting": "1(2),4(0),6(1)"
},
{
"first": "2076",
"last": "2079",
"acting": "3(1),4(0),8(2)"
}
]
}
],
"probing_osds": [
"0(0)",
"1(2)",
"3(1)",
"4(0)",
"8(2)"
],
"down_osds_we_would_probe": [
6
],
"peering_blocked_by": [],
"peering_blocked_by_detail": [
{
"detail": "peering_blocked_by_history_les_bound"
}
]
},
{
"name": "Started",
"enter_time": "2019-07-04 14:30:47.911073"
}
],
"agent_state": {}
}
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com