Sorry HTML snuck in somewhere. ---------- Forwarded message ---------- From: Robert LeBlanc <robert@xxxxxxxxxxxxx> Date: Mon, Mar 30, 2015 at 8:15 PM Subject: Force an OSD to try to peer To: Ceph-User <ceph-users@xxxxxxxx>, ceph-devel <ceph-devel@xxxxxxxxxxxxxxx> I've been working at this peering problem all day. I've done a lot of testing at the network layer and I just don't believe that we have a problem that would prevent OSDs from peering. When looking through osd_debug 20/20 logs, it just doesn't look like the OSDs are trying to peer. I don't know if it is because there are so many outstanding creations or what. OSDs will peer with OSDs on other hosts, but for some reason an OSD only chooses a certain number and not the one that it needs to finish the peering process. I've checked: firewall, open files, number of threads allowed. These have usually given me an error in the logs that helped me fix the problem. I can't find a configuration item that specifies how many peers an OSD should contact or anything that would be artificially limiting the peering connections. I've restarted the OSDs a number of times, as well as rebooting the hosts. I believe if the OSDs finish peering everything will clear up. I can't find anything in pg query that would help me figure out what is blocking it (peering_blocked_by is empty). The PGs are scattered across all the hosts so we can't pin it down to a specific host. Any ideas on what to try would be appreciated. 
[ulhglive-root@ceph9 ~]# ceph --version ceph version 0.80.7 (6c0127fcb58008793d3c8b62d925bc91963672a3) [ulhglive-root@ceph9 ~]# ceph status cluster 48de182b-5488-42bb-a6d2-62e8e47b435c health HEALTH_WARN 1 pgs down; 1321 pgs peering; 1321 pgs stuck inactive; 1321 pgs stuck unclean; too few pgs per osd (17 < min 20) monmap e2: 3 mons at {mon1=10.217.72.27:6789/0,mon2=10.217.72.28:6789/0,mon3=10.217.72.29:6789/0}, election epoch 30, quorum 0,1,2 mon1,mon2,mon3 osdmap e704: 120 osds: 120 up, 120 in pgmap v1895: 2048 pgs, 1 pools, 0 bytes data, 0 objects 11447 MB used, 436 TB / 436 TB avail 727 active+clean 990 peering 37 creating+peering 1 down+peering 290 remapped+peering 3 creating+remapped+peering { "state": "peering", "epoch": 707, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91], "info": { "pgid": "7.171", "last_update": "0'0", "last_complete": "0'0", "log_tail": "0'0", "last_user_version": 0, "last_backfill": "MAX", "purged_snaps": "[]", "history": { "epoch_created": 293, "last_epoch_started": 343, "last_epoch_clean": 343, "last_epoch_split": 0, "same_up_since": 688, "same_interval_since": 688, "same_primary_since": 608, "last_scrub": "0'0", "last_scrub_stamp": "2015-03-30 11:11:18.872851", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2015-03-30 11:11:18.872851", "last_clean_scrub_stamp": "0.000000"}, "stats": { "version": "0'0", "reported_seq": "326", "reported_epoch": "707", "state": "peering", "last_fresh": "2015-03-30 20:10:39.509855", "last_change": "2015-03-30 19:44:17.361601", "last_active": "2015-03-30 11:37:56.956417", "last_clean": "2015-03-30 11:37:56.956417", "last_became_active": "0.000000", "last_unstale": "2015-03-30 20:10:39.509855", "mapping_epoch": 683, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 293, "last_epoch_clean": 343, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "2015-03-30 11:11:18.872851", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2015-03-30 11:11:18.872851", 
"last_clean_scrub_stamp": "0.000000", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": "0", "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0}, "stat_cat_sum": {}, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91], "up_primary": 40, "acting_primary": 40}, "empty": 1, "dne": 0, "incomplete": 0, "last_epoch_started": 348, "hit_set_history": { "current_last_update": "0'0", "current_last_stamp": "0.000000", "current_info": { "begin": "0.000000", "end": "0.000000", "version": "0'0"}, "history": []}}, "peer_info": [ { "peer": "48", "pgid": "7.171", "last_update": "0'0", "last_complete": "0'0", "log_tail": "0'0", "last_user_version": 0, "last_backfill": "MAX", "purged_snaps": "[]", "history": { "epoch_created": 293, "last_epoch_started": 343, "last_epoch_clean": 343, "last_epoch_split": 0, "same_up_since": 688, "same_interval_since": 688, "same_primary_since": 608, "last_scrub": "0'0", "last_scrub_stamp": "2015-03-30 11:11:18.872851", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2015-03-30 11:11:18.872851", "last_clean_scrub_stamp": "0.000000"}, "stats": { "version": "0'0", "reported_seq": "24", "reported_epoch": "348", "state": "peering", "last_fresh": "2015-03-30 11:39:02.979742", "last_change": "2015-03-30 11:39:01.650897", "last_active": "2015-03-30 11:37:56.956417", "last_clean": "2015-03-30 11:37:56.956417", "last_became_active": "0.000000", "last_unstale": "2015-03-30 11:39:02.979742", "mapping_epoch": 683, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 293, "last_epoch_clean": 343, 
"parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "2015-03-30 11:11:18.872851", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2015-03-30 11:11:18.872851", "last_clean_scrub_stamp": "0.000000", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": "0", "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0}, "stat_cat_sum": {}, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91], "up_primary": 40, "acting_primary": 40}, "empty": 1, "dne": 0, "incomplete": 0, "last_epoch_started": 348, "hit_set_history": { "current_last_update": "0'0", "current_last_stamp": "0.000000", "current_info": { "begin": "0.000000", "end": "0.000000", "version": "0'0"}, "history": []}}, { "peer": "110", "pgid": "7.171", "last_update": "0'0", "last_complete": "0'0", "log_tail": "0'0", "last_user_version": 0, "last_backfill": "MAX", "purged_snaps": "[]", "history": { "epoch_created": 0, "last_epoch_started": 0, "last_epoch_clean": 0, "last_epoch_split": 0, "same_up_since": 0, "same_interval_since": 0, "same_primary_since": 0, "last_scrub": "0'0", "last_scrub_stamp": "0.000000", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "0.000000", "last_clean_scrub_stamp": "0.000000"}, "stats": { "version": "0'0", "reported_seq": "0", "reported_epoch": "0", "state": "inactive", "last_fresh": "0.000000", "last_change": "0.000000", "last_active": "0.000000", "last_clean": "0.000000", "last_became_active": "0.000000", "last_unstale": "0.000000", "mapping_epoch": 0, "log_start": "0'0", "ondisk_log_start": "0'0", 
"created": 0, "last_epoch_clean": 0, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "0.000000", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "0.000000", "last_clean_scrub_stamp": "0.000000", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": "0", "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0}, "stat_cat_sum": {}, "up": [], "acting": [], "up_primary": -1, "acting_primary": -1}, "empty": 1, "dne": 1, "incomplete": 0, "last_epoch_started": 0, "hit_set_history": { "current_last_update": "0'0", "current_last_stamp": "0.000000", "current_info": { "begin": "0.000000", "end": "0.000000", "version": "0'0"}, "history": []}}], "recovery_state": [ { "name": "Started\/Primary\/Peering\/GetInfo", "enter_time": "2015-03-30 19:44:18.709317", "requested_info_from": [ { "osd": "0"}, { "osd": "5"}, { "osd": "10"}, { "osd": "22"}, { "osd": "54"}, { "osd": "91"}, { "osd": "92"}, { "osd": "113"}, { "osd": "114"}]}, { "name": "Started\/Primary\/Peering", "enter_time": "2015-03-30 19:44:18.709316", "past_intervals": [ { "first": 342, "last": 346, "maybe_went_rw": 1, "up": [ 40, 92, 114], "acting": [ 40, 92, 114, 40, 40]}, { "first": 347, "last": 353, "maybe_went_rw": 1, "up": [ 40, 92, 48], "acting": [ 40, 92, 48, 40, 40]}, { "first": 354, "last": 356, "maybe_went_rw": 1, "up": [ 92, 48], "acting": [ 92, 48, 92, 92]}, { "first": 357, "last": 359, "maybe_went_rw": 1, "up": [ 113, 48, 114], "acting": [ 113, 48, 114, 113, 113]}, { "first": 360, "last": 361, "maybe_went_rw": 1, "up": [ 40, 
92, 48], "acting": [ 40, 92, 48, 40, 40]}, { "first": 362, "last": 364, "maybe_went_rw": 1, "up": [ 40, 92], "acting": [ 40, 92, 40, 40]}, { "first": 365, "last": 369, "maybe_went_rw": 1, "up": [ 40, 92, 114], "acting": [ 40, 92, 114, 40, 40]}, { "first": 370, "last": 379, "maybe_went_rw": 1, "up": [ 40, 92, 48], "acting": [ 40, 92, 48, 40, 40]}, { "first": 380, "last": 400, "maybe_went_rw": 1, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91, 40, 40]}, { "first": 401, "last": 409, "maybe_went_rw": 1, "up": [ 92, 48, 91], "acting": [ 92, 48, 91, 92, 92]}, { "first": 410, "last": 414, "maybe_went_rw": 1, "up": [ 113, 48, 114, 0], "acting": [ 113, 48, 114, 0, 113, 113]}, { "first": 415, "last": 435, "maybe_went_rw": 1, "up": [ 113, 48, 114, 10], "acting": [ 113, 48, 114, 10, 113, 113]}, { "first": 436, "last": 442, "maybe_went_rw": 1, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91, 40, 40]}, { "first": 443, "last": 446, "maybe_went_rw": 1, "up": [ 40, 92, 48], "acting": [ 40, 92, 48, 40, 40]}, { "first": 447, "last": 457, "maybe_went_rw": 1, "up": [ 40, 48], "acting": [ 40, 48, 40, 40]}, { "first": 458, "last": 460, "maybe_went_rw": 1, "up": [ 40, 48, 10], "acting": [ 40, 48, 10, 40, 40]}, { "first": 461, "last": 466, "maybe_went_rw": 1, "up": [ 40, 48, 22], "acting": [ 40, 48, 22, 40, 40]}, { "first": 467, "last": 478, "maybe_went_rw": 1, "up": [ 40, 48, 22, 5], "acting": [ 40, 48, 22, 5, 40, 40]}, { "first": 479, "last": 489, "maybe_went_rw": 1, "up": [ 40, 48, 22, 110], "acting": [ 40, 48, 22, 110, 40, 40]}, { "first": 490, "last": 496, "maybe_went_rw": 1, "up": [ 40, 48, 22, 0], "acting": [ 40, 48, 22, 0, 40, 40]}, { "first": 497, "last": 507, "maybe_went_rw": 1, "up": [ 40, 48, 114, 10], "acting": [ 40, 48, 114, 10, 40, 40]}, { "first": 508, "last": 511, "maybe_went_rw": 1, "up": [ 40, 48, 54, 91], "acting": [ 40, 48, 54, 91, 40, 40]}, { "first": 512, "last": 579, "maybe_went_rw": 1, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91, 40, 40]}, { 
"first": 580, "last": 580, "maybe_went_rw": 0, "up": [ 40, 92, 91], "acting": [ 40, 92, 91, 40, 40]}, { "first": 581, "last": 591, "maybe_went_rw": 1, "up": [ 92, 91], "acting": [ 92, 91, 92, 92]}, { "first": 592, "last": 595, "maybe_went_rw": 1, "up": [ 113, 114, 22, 0], "acting": [ 113, 114, 22, 0, 113, 113]}, { "first": 596, "last": 599, "maybe_went_rw": 1, "up": [ 113, 48, 114, 10], "acting": [ 113, 48, 114, 10, 113, 113]}, { "first": 600, "last": 606, "maybe_went_rw": 1, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91, 40, 40]}, { "first": 607, "last": 607, "maybe_went_rw": 0, "up": [ 92, 91], "acting": [ 92, 91, 92, 92]}, { "first": 608, "last": 616, "maybe_went_rw": 1, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91, 40, 40]}, { "first": 617, "last": 625, "maybe_went_rw": 1, "up": [ 40, 92, 91], "acting": [ 40, 92, 91, 40, 40]}, { "first": 626, "last": 632, "maybe_went_rw": 1, "up": [ 40, 92, 114, 10], "acting": [ 40, 92, 114, 10, 40, 40]}, { "first": 633, "last": 639, "maybe_went_rw": 1, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91, 40, 40]}, { "first": 640, "last": 643, "maybe_went_rw": 1, "up": [ 40, 92, 91], "acting": [ 40, 92, 91, 40, 40]}, { "first": 644, "last": 662, "maybe_went_rw": 1, "up": [ 40, 92, 114, 10], "acting": [ 40, 92, 114, 10, 40, 40]}, { "first": 663, "last": 679, "maybe_went_rw": 1, "up": [ 40, 92, 48, 91], "acting": [ 40, 92, 48, 91, 40, 40]}, { "first": 680, "last": 682, "maybe_went_rw": 1, "up": [ 40, 92, 48], "acting": [ 40, 92, 48, 40, 40]}, { "first": 683, "last": 687, "maybe_went_rw": 1, "up": [ 40, 92, 48, 10], "acting": [ 40, 92, 48, 10, 40, 40]}], "probing_osds": [ "0", "5", "10", "22", "40", "48", "54", "91", "92", "110", "113", "114"], "down_osds_we_would_probe": [], "peering_blocked_by": []}, { "name": "Started", "enter_time": "2015-03-30 19:44:18.709312"}], "agent_state": {}} _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx 
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com