Did the CRUSH location of your OSDs change after the reboot? kas <kas@xxxxxxxxxx> 于2019年5月15日周三 下午10:39写道: > > kas wrote: > : Marc, > : > : Marc Roos wrote: > : : Are you sure your osd's are up and reachable? (run ceph osd tree on > : : another node) > : > : They are up, because all three mons see them as up. > : However, ceph osd tree provided the hint (thanks!): The OSD host went back > : with hostname "localhost" instead of the correct one for some reason. > : So the OSDs moved themselves to a new HOST=localhost CRUSH node directly > : under the CRUSH root. I rebooted the OSD host once again, and it went up > : again with the correct hostname, and the "ceph osd tree" output looks sane > : now. So I guess we have a reason for such a huge rebalance. > : > : However, even though the OSD tree is back in the normal state, > : the rebalance is still going on, and there are even inactive PGs, > : with some Ceph clients being stuck seemingly forever: > : > : health: HEALTH_ERR > : 1964645/3977451 objects misplaced (49.395%) > : Reduced data availability: 11 pgs inactive > > Wild guessing what to do, I went to the rebooted OSD host and ran > systemctl restart ceph-osd.target > - restarting all OSD processes. The previously inactive (activating) pgs > went to the active state, and Ceph clients got unstuck. Now I see > HEALTH_ERR with backfill_toofull only, which I consider a normal state > during Ceph Mimic rebalance. > > It would be interesting to know why some of the PGs went stuck, > and why did restart help. FWIW, I have a "ceph pg query" output for > one of the 11 inactive PGs. 
> > -Yenya > > ------------------------------------------- > # ceph pg 23.4f5 query > { > "state": "activating+remapped", > "snap_trimq": "[]", > "snap_trimq_len": 0, > "epoch": 104015, > "up": [ > 70, > 72, > 27 > ], > "acting": [ > 25, > 27, > 79 > ], > "backfill_targets": [ > "70", > "72" > ], > "acting_recovery_backfill": [ > "25", > "27", > "70", > "72", > "79" > ], > "info": { > "pgid": "23.4f5", > "last_update": "103035'4667973", > "last_complete": "103035'4667973", > "log_tail": "102489'4664889", > "last_user_version": 4667973, > "last_backfill": "MAX", > "last_backfill_bitwise": 1, > "purged_snaps": [], > "history": { > "epoch_created": 406, > "epoch_pool_created": 406, > "last_epoch_started": 103086, > "last_interval_started": 103085, > "last_epoch_clean": 96881, > "last_interval_clean": 96880, > "last_epoch_split": 0, > "last_epoch_marked_full": 0, > "same_up_since": 103095, > "same_interval_since": 103095, > "same_primary_since": 95398, > "last_scrub": "102517'4667556", > "last_scrub_stamp": "2019-05-15 01:07:28.978979", > "last_deep_scrub": "102491'4666011", > "last_deep_scrub_stamp": "2019-05-08 07:20:08.253942", > "last_clean_scrub_stamp": "2019-05-15 01:07:28.978979" > }, > "stats": { > "version": "103035'4667973", > "reported_seq": "2116838", > "reported_epoch": "104015", > "state": "activating+remapped", > "last_fresh": "2019-05-15 16:19:44.530005", > "last_change": "2019-05-15 14:56:04.248887", > "last_active": "2019-05-15 14:56:02.579506", > "last_peered": "2019-05-15 14:56:01.401941", > "last_clean": "2019-05-15 14:53:39.291350", > "last_became_active": "2019-05-15 14:55:54.163102", > "last_became_peered": "2019-05-15 14:55:54.163102", > "last_unstale": "2019-05-15 16:19:44.530005", > "last_undegraded": "2019-05-15 16:19:44.530005", > "last_fullsized": "2019-05-15 16:19:44.530005", > "mapping_epoch": 103095, > "log_start": "102489'4664889", > "ondisk_log_start": "102489'4664889", > "created": 406, > "last_epoch_clean": 96881, > "parent": "0.0", 
> "parent_split_bits": 0, > "last_scrub": "102517'4667556", > "last_scrub_stamp": "2019-05-15 01:07:28.978979", > "last_deep_scrub": "102491'4666011", > "last_deep_scrub_stamp": "2019-05-08 07:20:08.253942", > "last_clean_scrub_stamp": "2019-05-15 01:07:28.978979", > "log_size": 3084, > "ondisk_log_size": 3084, > "stats_invalid": false, > "dirty_stats_invalid": false, > "omap_stats_invalid": false, > "hitset_stats_invalid": false, > "hitset_bytes_stats_invalid": false, > "pin_stats_invalid": true, > "manifest_stats_invalid": true, > "snaptrimq_len": 0, > "stat_sum": { > "num_bytes": 2641321984, > "num_objects": 633, > "num_object_clones": 49, > "num_object_copies": 1899, > "num_objects_missing_on_primary": 0, > "num_objects_missing": 0, > "num_objects_degraded": 0, > "num_objects_misplaced": 1266, > "num_objects_unfound": 0, > "num_objects_dirty": 633, > "num_whiteouts": 0, > "num_read": 1263624, > "num_read_kb": 49804648, > "num_write": 5054985, > "num_write_kb": 76175293, > "num_scrub_errors": 0, > "num_shallow_scrub_errors": 0, > "num_deep_scrub_errors": 0, > "num_objects_recovered": 6507, > "num_bytes_recovered": 27291253248, > "num_keys_recovered": 0, > "num_objects_omap": 0, > "num_objects_hit_set_archive": 0, > "num_bytes_hit_set_archive": 0, > "num_flush": 0, > "num_flush_kb": 0, > "num_evict": 0, > "num_evict_kb": 0, > "num_promote": 0, > "num_flush_mode_high": 0, > "num_flush_mode_low": 0, > "num_evict_mode_some": 0, > "num_evict_mode_full": 0, > "num_objects_pinned": 0, > "num_legacy_snapsets": 0, > "num_large_omap_objects": 0, > "num_objects_manifest": 0 > }, > "up": [ > 70, > 72, > 27 > ], > "acting": [ > 25, > 27, > 79 > ], > "blocked_by": [], > "up_primary": 70, > "acting_primary": 25, > "purged_snaps": [] > }, > "empty": 0, > "dne": 0, > "incomplete": 0, > "last_epoch_started": 103096, > "hit_set_history": { > "current_last_update": "0'0", > "history": [] > } > }, > "peer_info": [ > { > "peer": "27", > "pgid": "23.4f5", > "last_update": 
"103035'4667973", > "last_complete": "103035'4667973", > "log_tail": "102489'4664889", > "last_user_version": 4667973, > "last_backfill": "MAX", > "last_backfill_bitwise": 1, > "purged_snaps": [], > "history": { > "epoch_created": 406, > "epoch_pool_created": 406, > "last_epoch_started": 103086, > "last_interval_started": 103085, > "last_epoch_clean": 96881, > "last_interval_clean": 96880, > "last_epoch_split": 0, > "last_epoch_marked_full": 0, > "same_up_since": 103095, > "same_interval_since": 103095, > "same_primary_since": 95398, > "last_scrub": "102517'4667556", > "last_scrub_stamp": "2019-05-15 01:07:28.978979", > "last_deep_scrub": "102491'4666011", > "last_deep_scrub_stamp": "2019-05-08 07:20:08.253942", > "last_clean_scrub_stamp": "2019-05-15 01:07:28.978979" > }, > "stats": { > "version": "102814'4667972", > "reported_seq": "2115836", > "reported_epoch": "103035", > "state": "active+clean", > "last_fresh": "2019-05-15 14:52:36.025409", > "last_change": "2019-05-15 01:07:28.979033", > "last_active": "2019-05-15 14:52:36.025409", > "last_peered": "2019-05-15 14:52:36.025409", > "last_clean": "2019-05-15 14:52:36.025409", > "last_became_active": "2019-04-26 06:30:50.855477", > "last_became_peered": "2019-04-26 06:30:50.855477", > "last_unstale": "2019-05-15 14:52:36.025409", > "last_undegraded": "2019-05-15 14:52:36.025409", > "last_fullsized": "2019-05-15 14:52:36.025409", > "mapping_epoch": 103095, > "log_start": "102489'4664889", > "ondisk_log_start": "102489'4664889", > "created": 406, > "last_epoch_clean": 96881, > "parent": "0.0", > "parent_split_bits": 0, > "last_scrub": "102517'4667556", > "last_scrub_stamp": "2019-05-15 01:07:28.978979", > "last_deep_scrub": "102491'4666011", > "last_deep_scrub_stamp": "2019-05-08 07:20:08.253942", > "last_clean_scrub_stamp": "2019-05-15 01:07:28.978979", > "log_size": 3083, > "ondisk_log_size": 3083, > "stats_invalid": false, > "dirty_stats_invalid": false, > "omap_stats_invalid": false, > "hitset_stats_invalid": 
false, > "hitset_bytes_stats_invalid": false, > "pin_stats_invalid": true, > "manifest_stats_invalid": true, > "snaptrimq_len": 0, > "stat_sum": { > "num_bytes": 2641321984, > "num_objects": 633, > "num_object_clones": 49, > "num_object_copies": 1899, > "num_objects_missing_on_primary": 0, > "num_objects_missing": 0, > "num_objects_degraded": 0, > "num_objects_misplaced": 0, > "num_objects_unfound": 0, > "num_objects_dirty": 633, > "num_whiteouts": 0, > "num_read": 1263624, > "num_read_kb": 49804648, > "num_write": 5054985, > "num_write_kb": 76175293, > "num_scrub_errors": 0, > "num_shallow_scrub_errors": 0, > "num_deep_scrub_errors": 0, > "num_objects_recovered": 6507, > "num_bytes_recovered": 27291253248, > "num_keys_recovered": 0, > "num_objects_omap": 0, > "num_objects_hit_set_archive": 0, > "num_bytes_hit_set_archive": 0, > "num_flush": 0, > "num_flush_kb": 0, > "num_evict": 0, > "num_evict_kb": 0, > "num_promote": 0, > "num_flush_mode_high": 0, > "num_flush_mode_low": 0, > "num_evict_mode_some": 0, > "num_evict_mode_full": 0, > "num_objects_pinned": 0, > "num_legacy_snapsets": 0, > "num_large_omap_objects": 0, > "num_objects_manifest": 0 > }, > "up": [ > 70, > 72, > 27 > ], > "acting": [ > 25, > 27, > 79 > ], > "blocked_by": [], > "up_primary": 70, > "acting_primary": 25, > "purged_snaps": [] > }, > "empty": 0, > "dne": 0, > "incomplete": 0, > "last_epoch_started": 103086, > "hit_set_history": { > "current_last_update": "0'0", > "history": [] > } > }, > { > "peer": "70", > "pgid": "23.4f5", > "last_update": "103035'4667973", > "last_complete": "103035'4667973", > "log_tail": "102489'4664973", > "last_user_version": 0, > "last_backfill": "MIN", > "last_backfill_bitwise": 1, > "purged_snaps": [], > "history": { > "epoch_created": 406, > "epoch_pool_created": 406, > "last_epoch_started": 103086, > "last_interval_started": 103085, > "last_epoch_clean": 96881, > "last_interval_clean": 96880, > "last_epoch_split": 0, > "last_epoch_marked_full": 0, > 
"same_up_since": 103095, > "same_interval_since": 103095, > "same_primary_since": 95398, > "last_scrub": "102517'4667556", > "last_scrub_stamp": "2019-05-15 01:07:28.978979", > "last_deep_scrub": "102491'4666011", > "last_deep_scrub_stamp": "2019-05-08 07:20:08.253942", > "last_clean_scrub_stamp": "2019-05-15 01:07:28.978979" > }, > "stats": { > "version": "0'0", > "reported_seq": "0", > "reported_epoch": "0", > "state": "unknown", > "last_fresh": "0.000000", > "last_change": "0.000000", > "last_active": "0.000000", > "last_peered": "0.000000", > "last_clean": "0.000000", > "last_became_active": "0.000000", > "last_became_peered": "0.000000", > "last_unstale": "0.000000", > "last_undegraded": "0.000000", > "last_fullsized": "0.000000", > "mapping_epoch": 0, > "log_start": "0'0", > "ondisk_log_start": "0'0", > "created": 0, > "last_epoch_clean": 0, > "parent": "0.0", > "parent_split_bits": 0, > "last_scrub": "0'0", > "last_scrub_stamp": "0.000000", > "last_deep_scrub": "0'0", > "last_deep_scrub_stamp": "0.000000", > "last_clean_scrub_stamp": "0.000000", > "log_size": 0, > "ondisk_log_size": 0, > "stats_invalid": false, > "dirty_stats_invalid": false, > "omap_stats_invalid": false, > "hitset_stats_invalid": false, > "hitset_bytes_stats_invalid": false, > "pin_stats_invalid": false, > "manifest_stats_invalid": false, > "snaptrimq_len": 0, > "stat_sum": { > "num_bytes": 0, > "num_objects": 0, > "num_object_clones": 0, > "num_object_copies": 0, > "num_objects_missing_on_primary": 0, > "num_objects_missing": 633, > "num_objects_degraded": 0, > "num_objects_misplaced": 0, > "num_objects_unfound": 0, > "num_objects_dirty": 0, > "num_whiteouts": 0, > "num_read": 0, > "num_read_kb": 0, > "num_write": 0, > "num_write_kb": 0, > "num_scrub_errors": 0, > "num_shallow_scrub_errors": 0, > "num_deep_scrub_errors": 0, > "num_objects_recovered": 0, > "num_bytes_recovered": 0, > "num_keys_recovered": 0, > "num_objects_omap": 0, > "num_objects_hit_set_archive": 0, > 
"num_bytes_hit_set_archive": 0, > "num_flush": 0, > "num_flush_kb": 0, > "num_evict": 0, > "num_evict_kb": 0, > "num_promote": 0, > "num_flush_mode_high": 0, > "num_flush_mode_low": 0, > "num_evict_mode_some": 0, > "num_evict_mode_full": 0, > "num_objects_pinned": 0, > "num_legacy_snapsets": 0, > "num_large_omap_objects": 0, > "num_objects_manifest": 0 > }, > "up": [], > "acting": [], > "blocked_by": [], > "up_primary": -1, > "acting_primary": -1, > "purged_snaps": [] > }, > "empty": 0, > "dne": 0, > "incomplete": 1, > "last_epoch_started": 103096, > "hit_set_history": { > "current_last_update": "0'0", > "history": [] > } > }, > { > "peer": "72", > "pgid": "23.4f5", > "last_update": "103035'4667973", > "last_complete": "103035'4667973", > "log_tail": "102489'4664973", > "last_user_version": 0, > "last_backfill": "MIN", > "last_backfill_bitwise": 1, > "purged_snaps": [], > "history": { > "epoch_created": 406, > "epoch_pool_created": 406, > "last_epoch_started": 103086, > "last_interval_started": 103085, > "last_epoch_clean": 96881, > "last_interval_clean": 96880, > "last_epoch_split": 0, > "last_epoch_marked_full": 0, > "same_up_since": 103095, > "same_interval_since": 103095, > "same_primary_since": 95398, > "last_scrub": "102517'4667556", > "last_scrub_stamp": "2019-05-15 01:07:28.978979", > "last_deep_scrub": "102491'4666011", > "last_deep_scrub_stamp": "2019-05-08 07:20:08.253942", > "last_clean_scrub_stamp": "2019-05-15 01:07:28.978979" > }, > "stats": { > "version": "0'0", > "reported_seq": "0", > "reported_epoch": "0", > "state": "unknown", > "last_fresh": "0.000000", > "last_change": "0.000000", > "last_active": "0.000000", > "last_peered": "0.000000", > "last_clean": "0.000000", > "last_became_active": "0.000000", > "last_became_peered": "0.000000", > "last_unstale": "0.000000", > "last_undegraded": "0.000000", > "last_fullsized": "0.000000", > "mapping_epoch": 103095, > "log_start": "0'0", > "ondisk_log_start": "0'0", > "created": 0, > "last_epoch_clean": 
0, > "parent": "0.0", > "parent_split_bits": 0, > "last_scrub": "0'0", > "last_scrub_stamp": "0.000000", > "last_deep_scrub": "0'0", > "last_deep_scrub_stamp": "0.000000", > "last_clean_scrub_stamp": "0.000000", > "log_size": 0, > "ondisk_log_size": 0, > "stats_invalid": false, > "dirty_stats_invalid": false, > "omap_stats_invalid": false, > "hitset_stats_invalid": false, > "hitset_bytes_stats_invalid": false, > "pin_stats_invalid": false, > "manifest_stats_invalid": false, > "snaptrimq_len": 0, > "stat_sum": { > "num_bytes": 0, > "num_objects": 0, > "num_object_clones": 0, > "num_object_copies": 0, > "num_objects_missing_on_primary": 0, > "num_objects_missing": 633, > "num_objects_degraded": 0, > "num_objects_misplaced": 0, > "num_objects_unfound": 0, > "num_objects_dirty": 0, > "num_whiteouts": 0, > "num_read": 0, > "num_read_kb": 0, > "num_write": 0, > "num_write_kb": 0, > "num_scrub_errors": 0, > "num_shallow_scrub_errors": 0, > "num_deep_scrub_errors": 0, > "num_objects_recovered": 0, > "num_bytes_recovered": 0, > "num_keys_recovered": 0, > "num_objects_omap": 0, > "num_objects_hit_set_archive": 0, > "num_bytes_hit_set_archive": 0, > "num_flush": 0, > "num_flush_kb": 0, > "num_evict": 0, > "num_evict_kb": 0, > "num_promote": 0, > "num_flush_mode_high": 0, > "num_flush_mode_low": 0, > "num_evict_mode_some": 0, > "num_evict_mode_full": 0, > "num_objects_pinned": 0, > "num_legacy_snapsets": 0, > "num_large_omap_objects": 0, > "num_objects_manifest": 0 > }, > "up": [ > 70, > 72, > 27 > ], > "acting": [ > 25, > 27, > 79 > ], > "blocked_by": [], > "up_primary": 70, > "acting_primary": 25, > "purged_snaps": [] > }, > "empty": 0, > "dne": 0, > "incomplete": 1, > "last_epoch_started": 103086, > "hit_set_history": { > "current_last_update": "0'0", > "history": [] > } > }, > { > "peer": "79", > "pgid": "23.4f5", > "last_update": "103035'4667973", > "last_complete": "103035'4667973", > "log_tail": "102489'4664889", > "last_user_version": 4667973, > "last_backfill": "MAX", 
> "last_backfill_bitwise": 1, > "purged_snaps": [], > "history": { > "epoch_created": 406, > "epoch_pool_created": 406, > "last_epoch_started": 103086, > "last_interval_started": 103085, > "last_epoch_clean": 96881, > "last_interval_clean": 96880, > "last_epoch_split": 0, > "last_epoch_marked_full": 0, > "same_up_since": 103095, > "same_interval_since": 103095, > "same_primary_since": 95398, > "last_scrub": "102517'4667556", > "last_scrub_stamp": "2019-05-15 01:07:28.978979", > "last_deep_scrub": "102491'4666011", > "last_deep_scrub_stamp": "2019-05-08 07:20:08.253942", > "last_clean_scrub_stamp": "2019-05-15 01:07:28.978979" > }, > "stats": { > "version": "102814'4667972", > "reported_seq": "2115836", > "reported_epoch": "103035", > "state": "active+clean", > "last_fresh": "2019-05-15 14:52:36.025409", > "last_change": "2019-05-15 01:07:28.979033", > "last_active": "2019-05-15 14:52:36.025409", > "last_peered": "2019-05-15 14:52:36.025409", > "last_clean": "2019-05-15 14:52:36.025409", > "last_became_active": "2019-04-26 06:30:50.855477", > "last_became_peered": "2019-04-26 06:30:50.855477", > "last_unstale": "2019-05-15 14:52:36.025409", > "last_undegraded": "2019-05-15 14:52:36.025409", > "last_fullsized": "2019-05-15 14:52:36.025409", > "mapping_epoch": 103095, > "log_start": "102489'4664889", > "ondisk_log_start": "102489'4664889", > "created": 406, > "last_epoch_clean": 96881, > "parent": "0.0", > "parent_split_bits": 0, > "last_scrub": "102517'4667556", > "last_scrub_stamp": "2019-05-15 01:07:28.978979", > "last_deep_scrub": "102491'4666011", > "last_deep_scrub_stamp": "2019-05-08 07:20:08.253942", > "last_clean_scrub_stamp": "2019-05-15 01:07:28.978979", > "log_size": 3083, > "ondisk_log_size": 3083, > "stats_invalid": false, > "dirty_stats_invalid": false, > "omap_stats_invalid": false, > "hitset_stats_invalid": false, > "hitset_bytes_stats_invalid": false, > "pin_stats_invalid": true, > "manifest_stats_invalid": true, > "snaptrimq_len": 0, > "stat_sum": { 
> "num_bytes": 2641321984, > "num_objects": 633, > "num_object_clones": 49, > "num_object_copies": 1899, > "num_objects_missing_on_primary": 0, > "num_objects_missing": 0, > "num_objects_degraded": 0, > "num_objects_misplaced": 0, > "num_objects_unfound": 0, > "num_objects_dirty": 633, > "num_whiteouts": 0, > "num_read": 1263624, > "num_read_kb": 49804648, > "num_write": 5054985, > "num_write_kb": 76175293, > "num_scrub_errors": 0, > "num_shallow_scrub_errors": 0, > "num_deep_scrub_errors": 0, > "num_objects_recovered": 6507, > "num_bytes_recovered": 27291253248, > "num_keys_recovered": 0, > "num_objects_omap": 0, > "num_objects_hit_set_archive": 0, > "num_bytes_hit_set_archive": 0, > "num_flush": 0, > "num_flush_kb": 0, > "num_evict": 0, > "num_evict_kb": 0, > "num_promote": 0, > "num_flush_mode_high": 0, > "num_flush_mode_low": 0, > "num_evict_mode_some": 0, > "num_evict_mode_full": 0, > "num_objects_pinned": 0, > "num_legacy_snapsets": 0, > "num_large_omap_objects": 0, > "num_objects_manifest": 0 > }, > "up": [ > 70, > 72, > 27 > ], > "acting": [ > 25, > 27, > 79 > ], > "blocked_by": [], > "up_primary": 70, > "acting_primary": 25, > "purged_snaps": [] > }, > "empty": 0, > "dne": 0, > "incomplete": 0, > "last_epoch_started": 96881, > "hit_set_history": { > "current_last_update": "0'0", > "history": [] > } > } > ], > "recovery_state": [ > { > "name": "Started/Primary/Active", > "enter_time": "2019-05-15 14:56:04.242725", > "might_have_unfound": [], > "recovery_progress": { > "backfill_targets": [ > "70", > "72" > ], > "waiting_on_backfill": [], > "last_backfill_started": "MIN", > "backfill_info": { > "begin": "MIN", > "end": "MIN", > "objects": [] > }, > "peer_backfill_info": [], > "backfills_in_flight": [], > "recovering": [], > "pg_backend": { > "pull_from_peer": [], > "pushing": [] > } > }, > "scrub": { > "scrubber.epoch_start": "96880", > "scrubber.active": false, > "scrubber.state": "INACTIVE", > "scrubber.start": "MIN", > "scrubber.end": "MIN", > 
"scrubber.max_end": "MIN", > "scrubber.subset_last_update": "0'0", > "scrubber.deep": false, > "scrubber.waiting_on_whom": [] > } > }, > { > "name": "Started", > "enter_time": "2019-05-15 14:56:03.673622" > } > ], > "agent_state": {} > } > ------------------------------------------- > > -- > | Jan "Yenya" Kasprzak <kas at {fi.muni.cz - work | yenya.net - private}> | > | http://www.fi.muni.cz/~kas/ GPG: 4096R/A45477D5 | > sir_clive> I hope you don't mind if I steal some of your ideas? > laryross> As far as stealing... we call it sharing here. --from rcgroups > _______________________________________________ > ceph-users mailing list > ceph-users@xxxxxxxxxxxxxx > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com -- Thank you! HuangJun _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com