On Tue, 31 Mar 2015, Robert LeBlanc wrote: > Turns out jumbo frames were not set on all the switch ports. Once that > was resolved the cluster quickly became healthy. I always hesitate to point the finger at the jumbo frames configuration but almost every time that is the culprit! Thanks for the update. :) sage > > On Mon, Mar 30, 2015 at 8:15 PM, Robert LeBlanc <robert@xxxxxxxxxxxxx> wrote: > > I've been working at this peering problem all day. I've done a lot of > > testing at the network layer and I just don't believe that we have a problem > > that would prevent OSDs from peering. When looking through osd_debug 20/20 > > logs, it just doesn't look like the OSDs are trying to peer. I don't know if > > it is because there are so many outstanding creations or what. OSDs will > > peer with OSDs on other hosts, but for some reason each only chooses a certain number > > and not the one that it needs to finish the peering process. > > > > I've checked: firewall, open files, number of threads allowed. These usually > > have given me an error in the logs that helped me fix the problem. > > > > I can't find a configuration item that specifies how many peers an OSD > > should contact or anything that would be artificially limiting the peering > > connections. I've restarted the OSDs a number of times, as well as rebooting > > the hosts. I believe if the OSDs finish peering everything will clear up. I > > can't find anything in pg query that would help me figure out what is > > blocking it (peering blocked by is empty). The PGs are scattered across all > > the hosts so we can't pin it down to a specific host. > > > > Any ideas on what to try would be appreciated. 
> > > > [ulhglive-root@ceph9 ~]# ceph --version > > ceph version 0.80.7 (6c0127fcb58008793d3c8b62d925bc91963672a3) > > [ulhglive-root@ceph9 ~]# ceph status > > cluster 48de182b-5488-42bb-a6d2-62e8e47b435c > > health HEALTH_WARN 1 pgs down; 1321 pgs peering; 1321 pgs stuck > > inactive; 1321 pgs stuck unclean; too few pgs per osd (17 < min 20) > > monmap e2: 3 mons at > > {mon1=10.217.72.27:6789/0,mon2=10.217.72.28:6789/0,mon3=10.217.72.29:6789/0}, > > election epoch 30, quorum 0,1,2 mon1,mon2,mon3 > > osdmap e704: 120 osds: 120 up, 120 in > > pgmap v1895: 2048 pgs, 1 pools, 0 bytes data, 0 objects > > 11447 MB used, 436 TB / 436 TB avail > > 727 active+clean > > 990 peering > > 37 creating+peering > > 1 down+peering > > 290 remapped+peering > > 3 creating+remapped+peering > > > > { "state": "peering", > > "epoch": 707, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91], > > "info": { "pgid": "7.171", > > "last_update": "0'0", > > "last_complete": "0'0", > > "log_tail": "0'0", > > "last_user_version": 0, > > "last_backfill": "MAX", > > "purged_snaps": "[]", > > "history": { "epoch_created": 293, > > "last_epoch_started": 343, > > "last_epoch_clean": 343, > > "last_epoch_split": 0, > > "same_up_since": 688, > > "same_interval_since": 688, > > "same_primary_since": 608, > > "last_scrub": "0'0", > > "last_scrub_stamp": "2015-03-30 11:11:18.872851", > > "last_deep_scrub": "0'0", > > "last_deep_scrub_stamp": "2015-03-30 11:11:18.872851", > > "last_clean_scrub_stamp": "0.000000"}, > > "stats": { "version": "0'0", > > "reported_seq": "326", > > "reported_epoch": "707", > > "state": "peering", > > "last_fresh": "2015-03-30 20:10:39.509855", > > "last_change": "2015-03-30 19:44:17.361601", > > "last_active": "2015-03-30 11:37:56.956417", > > "last_clean": "2015-03-30 11:37:56.956417", > > "last_became_active": "0.000000", > > "last_unstale": "2015-03-30 20:10:39.509855", > > "mapping_epoch": 683, > > "log_start": "0'0", > > 
"ondisk_log_start": "0'0", > > "created": 293, > > "last_epoch_clean": 343, > > "parent": "0.0", > > "parent_split_bits": 0, > > "last_scrub": "0'0", > > "last_scrub_stamp": "2015-03-30 11:11:18.872851", > > "last_deep_scrub": "0'0", > > "last_deep_scrub_stamp": "2015-03-30 11:11:18.872851", > > "last_clean_scrub_stamp": "0.000000", > > "log_size": 0, > > "ondisk_log_size": 0, > > "stats_invalid": "0", > > "stat_sum": { "num_bytes": 0, > > "num_objects": 0, > > "num_object_clones": 0, > > "num_object_copies": 0, > > "num_objects_missing_on_primary": 0, > > "num_objects_degraded": 0, > > "num_objects_unfound": 0, > > "num_objects_dirty": 0, > > "num_whiteouts": 0, > > "num_read": 0, > > "num_read_kb": 0, > > "num_write": 0, > > "num_write_kb": 0, > > "num_scrub_errors": 0, > > "num_shallow_scrub_errors": 0, > > "num_deep_scrub_errors": 0, > > "num_objects_recovered": 0, > > "num_bytes_recovered": 0, > > "num_keys_recovered": 0, > > "num_objects_omap": 0, > > "num_objects_hit_set_archive": 0}, > > "stat_cat_sum": {}, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91], > > "up_primary": 40, > > "acting_primary": 40}, > > "empty": 1, > > "dne": 0, > > "incomplete": 0, > > "last_epoch_started": 348, > > "hit_set_history": { "current_last_update": "0'0", > > "current_last_stamp": "0.000000", > > "current_info": { "begin": "0.000000", > > "end": "0.000000", > > "version": "0'0"}, > > "history": []}}, > > "peer_info": [ > > { "peer": "48", > > "pgid": "7.171", > > "last_update": "0'0", > > "last_complete": "0'0", > > "log_tail": "0'0", > > "last_user_version": 0, > > "last_backfill": "MAX", > > "purged_snaps": "[]", > > "history": { "epoch_created": 293, > > "last_epoch_started": 343, > > "last_epoch_clean": 343, > > "last_epoch_split": 0, > > "same_up_since": 688, > > "same_interval_since": 688, > > "same_primary_since": 608, > > "last_scrub": "0'0", > > "last_scrub_stamp": "2015-03-30 11:11:18.872851", > > "last_deep_scrub": 
"0'0", > > "last_deep_scrub_stamp": "2015-03-30 11:11:18.872851", > > "last_clean_scrub_stamp": "0.000000"}, > > "stats": { "version": "0'0", > > "reported_seq": "24", > > "reported_epoch": "348", > > "state": "peering", > > "last_fresh": "2015-03-30 11:39:02.979742", > > "last_change": "2015-03-30 11:39:01.650897", > > "last_active": "2015-03-30 11:37:56.956417", > > "last_clean": "2015-03-30 11:37:56.956417", > > "last_became_active": "0.000000", > > "last_unstale": "2015-03-30 11:39:02.979742", > > "mapping_epoch": 683, > > "log_start": "0'0", > > "ondisk_log_start": "0'0", > > "created": 293, > > "last_epoch_clean": 343, > > "parent": "0.0", > > "parent_split_bits": 0, > > "last_scrub": "0'0", > > "last_scrub_stamp": "2015-03-30 11:11:18.872851", > > "last_deep_scrub": "0'0", > > "last_deep_scrub_stamp": "2015-03-30 11:11:18.872851", > > "last_clean_scrub_stamp": "0.000000", > > "log_size": 0, > > "ondisk_log_size": 0, > > "stats_invalid": "0", > > "stat_sum": { "num_bytes": 0, > > "num_objects": 0, > > "num_object_clones": 0, > > "num_object_copies": 0, > > "num_objects_missing_on_primary": 0, > > "num_objects_degraded": 0, > > "num_objects_unfound": 0, > > "num_objects_dirty": 0, > > "num_whiteouts": 0, > > "num_read": 0, > > "num_read_kb": 0, > > "num_write": 0, > > "num_write_kb": 0, > > "num_scrub_errors": 0, > > "num_shallow_scrub_errors": 0, > > "num_deep_scrub_errors": 0, > > "num_objects_recovered": 0, > > "num_bytes_recovered": 0, > > "num_keys_recovered": 0, > > "num_objects_omap": 0, > > "num_objects_hit_set_archive": 0}, > > "stat_cat_sum": {}, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91], > > "up_primary": 40, > > "acting_primary": 40}, > > "empty": 1, > > "dne": 0, > > "incomplete": 0, > > "last_epoch_started": 348, > > "hit_set_history": { "current_last_update": "0'0", > > "current_last_stamp": "0.000000", > > "current_info": { "begin": "0.000000", > > "end": "0.000000", > > "version": "0'0"}, > > 
"history": []}}, > > { "peer": "110", > > "pgid": "7.171", > > "last_update": "0'0", > > "last_complete": "0'0", > > "log_tail": "0'0", > > "last_user_version": 0, > > "last_backfill": "MAX", > > "purged_snaps": "[]", > > "history": { "epoch_created": 0, > > "last_epoch_started": 0, > > "last_epoch_clean": 0, > > "last_epoch_split": 0, > > "same_up_since": 0, > > "same_interval_since": 0, > > "same_primary_since": 0, > > "last_scrub": "0'0", > > "last_scrub_stamp": "0.000000", > > "last_deep_scrub": "0'0", > > "last_deep_scrub_stamp": "0.000000", > > "last_clean_scrub_stamp": "0.000000"}, > > "stats": { "version": "0'0", > > "reported_seq": "0", > > "reported_epoch": "0", > > "state": "inactive", > > "last_fresh": "0.000000", > > "last_change": "0.000000", > > "last_active": "0.000000", > > "last_clean": "0.000000", > > "last_became_active": "0.000000", > > "last_unstale": "0.000000", > > "mapping_epoch": 0, > > "log_start": "0'0", > > "ondisk_log_start": "0'0", > > "created": 0, > > "last_epoch_clean": 0, > > "parent": "0.0", > > "parent_split_bits": 0, > > "last_scrub": "0'0", > > "last_scrub_stamp": "0.000000", > > "last_deep_scrub": "0'0", > > "last_deep_scrub_stamp": "0.000000", > > "last_clean_scrub_stamp": "0.000000", > > "log_size": 0, > > "ondisk_log_size": 0, > > "stats_invalid": "0", > > "stat_sum": { "num_bytes": 0, > > "num_objects": 0, > > "num_object_clones": 0, > > "num_object_copies": 0, > > "num_objects_missing_on_primary": 0, > > "num_objects_degraded": 0, > > "num_objects_unfound": 0, > > "num_objects_dirty": 0, > > "num_whiteouts": 0, > > "num_read": 0, > > "num_read_kb": 0, > > "num_write": 0, > > "num_write_kb": 0, > > "num_scrub_errors": 0, > > "num_shallow_scrub_errors": 0, > > "num_deep_scrub_errors": 0, > > "num_objects_recovered": 0, > > "num_bytes_recovered": 0, > > "num_keys_recovered": 0, > > "num_objects_omap": 0, > > "num_objects_hit_set_archive": 0}, > > "stat_cat_sum": {}, > > "up": [], > > "acting": [], > > "up_primary": -1, > > 
"acting_primary": -1}, > > "empty": 1, > > "dne": 1, > > "incomplete": 0, > > "last_epoch_started": 0, > > "hit_set_history": { "current_last_update": "0'0", > > "current_last_stamp": "0.000000", > > "current_info": { "begin": "0.000000", > > "end": "0.000000", > > "version": "0'0"}, > > "history": []}}], > > "recovery_state": [ > > { "name": "Started\/Primary\/Peering\/GetInfo", > > "enter_time": "2015-03-30 19:44:18.709317", > > "requested_info_from": [ > > { "osd": "0"}, > > { "osd": "5"}, > > { "osd": "10"}, > > { "osd": "22"}, > > { "osd": "54"}, > > { "osd": "91"}, > > { "osd": "92"}, > > { "osd": "113"}, > > { "osd": "114"}]}, > > { "name": "Started\/Primary\/Peering", > > "enter_time": "2015-03-30 19:44:18.709316", > > "past_intervals": [ > > { "first": 342, > > "last": 346, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 114], > > "acting": [ > > 40, > > 92, > > 114, > > 40, > > 40]}, > > { "first": 347, > > "last": 353, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48], > > "acting": [ > > 40, > > 92, > > 48, > > 40, > > 40]}, > > { "first": 354, > > "last": 356, > > "maybe_went_rw": 1, > > "up": [ > > 92, > > 48], > > "acting": [ > > 92, > > 48, > > 92, > > 92]}, > > { "first": 357, > > "last": 359, > > "maybe_went_rw": 1, > > "up": [ > > 113, > > 48, > > 114], > > "acting": [ > > 113, > > 48, > > 114, > > 113, > > 113]}, > > { "first": 360, > > "last": 361, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48], > > "acting": [ > > 40, > > 92, > > 48, > > 40, > > 40]}, > > { "first": 362, > > "last": 364, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92], > > "acting": [ > > 40, > > 92, > > 40, > > 40]}, > > { "first": 365, > > "last": 369, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 114], > > "acting": [ > > 40, > > 92, > > 114, > > 40, > > 40]}, > > { "first": 370, > > "last": 379, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48], > > "acting": [ > > 40, > > 92, > > 48, > > 40, > > 40]}, > > { "first": 
380, > > "last": 400, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91, > > 40, > > 40]}, > > { "first": 401, > > "last": 409, > > "maybe_went_rw": 1, > > "up": [ > > 92, > > 48, > > 91], > > "acting": [ > > 92, > > 48, > > 91, > > 92, > > 92]}, > > { "first": 410, > > "last": 414, > > "maybe_went_rw": 1, > > "up": [ > > 113, > > 48, > > 114, > > 0], > > "acting": [ > > 113, > > 48, > > 114, > > 0, > > 113, > > 113]}, > > { "first": 415, > > "last": 435, > > "maybe_went_rw": 1, > > "up": [ > > 113, > > 48, > > 114, > > 10], > > "acting": [ > > 113, > > 48, > > 114, > > 10, > > 113, > > 113]}, > > { "first": 436, > > "last": 442, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91, > > 40, > > 40]}, > > { "first": 443, > > "last": 446, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48], > > "acting": [ > > 40, > > 92, > > 48, > > 40, > > 40]}, > > { "first": 447, > > "last": 457, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 48], > > "acting": [ > > 40, > > 48, > > 40, > > 40]}, > > { "first": 458, > > "last": 460, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 48, > > 10], > > "acting": [ > > 40, > > 48, > > 10, > > 40, > > 40]}, > > { "first": 461, > > "last": 466, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 48, > > 22], > > "acting": [ > > 40, > > 48, > > 22, > > 40, > > 40]}, > > { "first": 467, > > "last": 478, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 48, > > 22, > > 5], > > "acting": [ > > 40, > > 48, > > 22, > > 5, > > 40, > > 40]}, > > { "first": 479, > > "last": 489, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 48, > > 22, > > 110], > > "acting": [ > > 40, > > 48, > > 22, > > 110, > > 40, > > 40]}, > > { "first": 490, > > "last": 496, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 48, > > 22, > > 0], > > "acting": [ > > 40, > > 48, > > 22, > > 0, > > 40, > > 40]}, > > { "first": 497, > > "last": 507, > 
> "maybe_went_rw": 1, > > "up": [ > > 40, > > 48, > > 114, > > 10], > > "acting": [ > > 40, > > 48, > > 114, > > 10, > > 40, > > 40]}, > > { "first": 508, > > "last": 511, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 48, > > 54, > > 91], > > "acting": [ > > 40, > > 48, > > 54, > > 91, > > 40, > > 40]}, > > { "first": 512, > > "last": 579, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91, > > 40, > > 40]}, > > { "first": 580, > > "last": 580, > > "maybe_went_rw": 0, > > "up": [ > > 40, > > 92, > > 91], > > "acting": [ > > 40, > > 92, > > 91, > > 40, > > 40]}, > > { "first": 581, > > "last": 591, > > "maybe_went_rw": 1, > > "up": [ > > 92, > > 91], > > "acting": [ > > 92, > > 91, > > 92, > > 92]}, > > { "first": 592, > > "last": 595, > > "maybe_went_rw": 1, > > "up": [ > > 113, > > 114, > > 22, > > 0], > > "acting": [ > > 113, > > 114, > > 22, > > 0, > > 113, > > 113]}, > > { "first": 596, > > "last": 599, > > "maybe_went_rw": 1, > > "up": [ > > 113, > > 48, > > 114, > > 10], > > "acting": [ > > 113, > > 48, > > 114, > > 10, > > 113, > > 113]}, > > { "first": 600, > > "last": 606, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91, > > 40, > > 40]}, > > { "first": 607, > > "last": 607, > > "maybe_went_rw": 0, > > "up": [ > > 92, > > 91], > > "acting": [ > > 92, > > 91, > > 92, > > 92]}, > > { "first": 608, > > "last": 616, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91, > > 40, > > 40]}, > > { "first": 617, > > "last": 625, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 91], > > "acting": [ > > 40, > > 92, > > 91, > > 40, > > 40]}, > > { "first": 626, > > "last": 632, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 114, > > 10], > > "acting": [ > > 40, > > 92, > > 114, > > 10, > > 40, > > 40]}, > > { "first": 633, > > "last": 639, > > 
"maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91, > > 40, > > 40]}, > > { "first": 640, > > "last": 643, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 91], > > "acting": [ > > 40, > > 92, > > 91, > > 40, > > 40]}, > > { "first": 644, > > "last": 662, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 114, > > 10], > > "acting": [ > > 40, > > 92, > > 114, > > 10, > > 40, > > 40]}, > > { "first": 663, > > "last": 679, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48, > > 91], > > "acting": [ > > 40, > > 92, > > 48, > > 91, > > 40, > > 40]}, > > { "first": 680, > > "last": 682, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48], > > "acting": [ > > 40, > > 92, > > 48, > > 40, > > 40]}, > > { "first": 683, > > "last": 687, > > "maybe_went_rw": 1, > > "up": [ > > 40, > > 92, > > 48, > > 10], > > "acting": [ > > 40, > > 92, > > 48, > > 10, > > 40, > > 40]}], > > "probing_osds": [ > > "0", > > "5", > > "10", > > "22", > > "40", > > "48", > > "54", > > "91", > > "92", > > "110", > > "113", > > "114"], > > "down_osds_we_would_probe": [], > > "peering_blocked_by": []}, > > { "name": "Started", > > "enter_time": "2015-03-30 19:44:18.709312"}], > > "agent_state": {}} > > > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html > > _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com