-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA256 I've had something similar to this when there was an MTU mismatch, the smaller I/O would get through, but the larger I/O would be blocked and prevent peering. - ---------------- Robert LeBlanc PGP Fingerprint 79A2 9CA4 6CC4 45DD A904 C70E E654 3BB2 FA62 B9F1 On Sun, Dec 13, 2015 at 9:01 PM, Chris Dunlop wrote: > Hi Varada, > > On Mon, Dec 14, 2015 at 03:23:20AM +0000, Varada Kari wrote: >> Can get the details of >> >> 1. ceph health detail >> 2. ceph pg query >> >> of any one PG stuck peering >> >> >> Varada > > The full health detail is over 9000 lines, but here's a summary: > > # ceph health detail | head > HEALTH_WARN 3072 pgs peering; 3072 pgs stuck inactive; 3072 pgs stuck unclean; 1570 requests are blocked > 32 sec; 25 osds have slow requests; noout flag(s) set > pg 3.1ae is stuck inactive for 23264.342056, current state peering, last acting [16,4,8] > pg 2.1af is stuck inactive for 23621.565024, current state peering, last acting [6,0] > pg 6.1ab is stuck inactive for 22843.875498, current state peering, last acting [27,18,54] > pg 3.1af is stuck inactive for 23315.971276, current state peering, last acting [17,16,24] > pg 2.1ae is stuck inactive for 19278.004657, current state peering, last acting [7,1] > pg 6.1aa is stuck inactive for 19321.668092, current state peering, last acting [31,39,56] > pg 3.1a8 is stuck inactive for 22897.969982, current state peering, last acting [16,17,24] > pg 2.1a9 is stuck inactive for 23516.554757, current state peering, last acting [14,7] > pg 6.1ad is stuck inactive for 23105.915508, current state peering, last acting [33,47,20] > # ceph health detail | grep -v peering > 34 ops are blocked > 16777.2 sec > 1289 ops are blocked > 8388.61 sec > 50 ops are blocked > 4194.3 sec > 34 ops are blocked > 2097.15 sec > 68 ops are blocked > 1048.58 sec > 13 ops are blocked > 524.288 sec > 11 ops are blocked > 16777.2 sec on osd.0 > 4 ops are blocked > 8388.61 sec on osd.0 > 5 ops are blocked > 8388.61 sec on osd.1 > 100 ops are blocked > 8388.61 sec on osd.2 > 100 ops are blocked > 8388.61 sec on osd.3 > 100 ops are blocked > 8388.61 sec on osd.4 > 80 ops are blocked > 8388.61 sec on osd.5 > 34 ops are blocked > 8388.61 sec on osd.6 > 27 ops are blocked > 4194.3 sec on osd.6 > 15 ops are blocked > 2097.15 sec on osd.6 > 6 ops are blocked > 1048.58 sec on osd.6 > 9 ops are blocked > 524.288 sec on osd.6 > 2 ops are blocked > 16777.2 sec on osd.7 > 20 ops are blocked > 4194.3 sec on osd.7 > 16 ops are blocked > 2097.15 sec on osd.7 > 62 ops are blocked > 1048.58 sec on osd.7 > 85 ops are blocked > 8388.61 sec on osd.8 > 80 ops are blocked > 8388.61 sec on osd.9 > 13 ops are blocked > 16777.2 sec on osd.10 > 3 ops are blocked > 8388.61 sec on osd.10 > 1 ops are blocked > 4194.3 sec on osd.10 > 1 ops are blocked > 2097.15 sec on osd.10 > 6 ops are blocked > 8388.61 sec on osd.11 > 5 ops are blocked > 8388.61 sec on osd.12 > 4 ops are blocked > 8388.61 sec on osd.13 > 2 ops are blocked > 8388.61 sec on osd.14 > 4 ops are blocked > 524.288 sec on osd.14 > 7 ops are blocked > 16777.2 sec on osd.15 > 12 ops are blocked > 8388.61 sec on osd.15 > 2 ops are blocked > 4194.3 sec on osd.15 > 2 ops are blocked > 2097.15 sec on osd.15 > 100 ops are blocked > 8388.61 sec on osd.16 > 82 ops are blocked > 8388.61 sec on osd.17 > 1 ops are blocked > 16777.2 sec on osd.18 > 100 ops are blocked > 8388.61 sec on osd.21 > 86 ops are blocked > 8388.61 sec on osd.24 > 100 ops are blocked > 8388.61 sec on osd.38 > 100 ops are blocked > 8388.61 sec on osd.42 > 100 ops are blocked > 8388.61 sec on osd.44 > 1 ops are blocked > 8388.61 sec on osd.51 > 25 osds have slow requests > noout flag(s) set > > > # ceph pg 3.1ae query > <<< hung, until ^c >>> > # ceph pg 2.1af query > { > "state": "peering", > "snap_trimq": "[]", > "epoch": 357236, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "info": { > "pgid": "2.1af", > "last_update": "356361'1923761", > "last_complete": "356361'1923761", > "log_tail": "341349'1920757", > "last_user_version": 1923761, > "last_backfill": "MAX", > "purged_snaps": "[1~34,38~1b,55~2,59~2a,84~68,ee~62]", > "history": { > "epoch_created": 1, > "last_epoch_started": 356496, > "last_epoch_clean": 356496, > "last_epoch_split": 0, > "same_up_since": 357218, > "same_interval_since": 357218, > "same_primary_since": 357218, > "last_scrub": "356347'1923757", > "last_scrub_stamp": "2015-12-12 12:18:54.719534", > "last_deep_scrub": "356347'1923757", > "last_deep_scrub_stamp": "2015-12-12 12:18:54.719534", > "last_clean_scrub_stamp": "2015-12-12 12:18:54.719534" > }, > "stats": { > "version": "356361'1923761", > "reported_seq": "37552607", > "reported_epoch": "357218", > "state": "peering", > "last_fresh": "2015-12-14 12:54:41.084804", > "last_change": "2015-12-14 12:54:41.084804", > "last_active": "2015-12-14 07:53:05.850772", > "last_peered": "2015-12-14 07:53:05.850772", > "last_clean": "2015-12-14 07:53:05.850772", > "last_became_active": "2013-09-11 09:13:39.309600", > "last_became_peered": "2013-09-11 09:13:39.309600", > "last_unstale": "2015-12-14 12:54:41.084804", > "last_undegraded": "2015-12-14 12:54:41.084804", > "last_fullsized": "2015-12-14 12:54:41.084804", > "mapping_epoch": 357168, > "log_start": "341349'1920757", > "ondisk_log_start": "341349'1920757", > "created": 1, > "last_epoch_clean": 356496, > "parent": "0.0", > "parent_split_bits": 0, > "last_scrub": "356347'1923757", > "last_scrub_stamp": "2015-12-12 12:18:54.719534", > "last_deep_scrub": "356347'1923757", > "last_deep_scrub_stamp": "2015-12-12 12:18:54.719534", > "last_clean_scrub_stamp": "2015-12-12 12:18:54.719534", > "log_size": 3004, > "ondisk_log_size": 3004, > "stats_invalid": "0", > "stat_sum": { > "num_bytes": 7360028160, > "num_objects": 2107, > "num_object_clones": 642, > "num_object_copies": 4214, > "num_objects_missing_on_primary": 0, > "num_objects_degraded": 0, > "num_objects_misplaced": 0, > "num_objects_unfound": 0, > "num_objects_dirty": 569, > "num_whiteouts": 0, > "num_read": 726240, > "num_read_kb": 31291910, > "num_write": 127250, > "num_write_kb": 13514083, > "num_scrub_errors": 0, > "num_shallow_scrub_errors": 0, > "num_deep_scrub_errors": 0, > "num_objects_recovered": 2187, > "num_bytes_recovered": 9137582592, > "num_keys_recovered": 0, > "num_objects_omap": 0, > "num_objects_hit_set_archive": 0, > "num_bytes_hit_set_archive": 0 > }, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "blocked_by": [ > 0 > ], > "up_primary": 6, > "acting_primary": 6 > }, > "empty": 0, > "dne": 0, > "incomplete": 0, > "last_epoch_started": 356496, > "hit_set_history": { > "current_last_update": "0'0", > "current_last_stamp": "0.000000", > "current_info": { > "begin": "0.000000", > "end": "0.000000", > "version": "0'0" > }, > "history": [] > } > }, > "peer_info": [], > "recovery_state": [ > { > "name": "Started\/Primary\/Peering\/GetInfo", > "enter_time": "2015-12-14 12:54:41.084784", > "requested_info_from": [ > { > "osd": "0" > } > ] > }, > { > "name": "Started\/Primary\/Peering", > "enter_time": "2015-12-14 12:54:41.084773", > "past_intervals": [ > { > "first": 356495, > "last": 356560, > "maybe_went_rw": 1, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 356561, > "last": 356608, > "maybe_went_rw": 1, > "up": [ > 0 > ], > "acting": [ > 0 > ], > "primary": 0, > "up_primary": 0 > }, > { > "first": 356609, > "last": 356655, > "maybe_went_rw": 1, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 356656, > "last": 356670, > "maybe_went_rw": 1, > "up": [ > 6 > ], > "acting": [ > 6 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 356671, > "last": 356681, > "maybe_went_rw": 1, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 356682, > "last": 356722, > "maybe_went_rw": 1, > "up": [ > 0 > ], > "acting": [ > 0 > ], > "primary": 0, > "up_primary": 0 > }, > { > "first": 356723, > "last": 356723, > "maybe_went_rw": 0, > "up": [], > "acting": [], > "primary": -1, > "up_primary": -1 > }, > { > "first": 356724, > "last": 356824, > "maybe_went_rw": 1, > "up": [ > 0 > ], > "acting": [ > 0 > ], > "primary": 0, > "up_primary": 0 > }, > { > "first": 356825, > "last": 356876, > "maybe_went_rw": 1, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 356877, > "last": 356920, > "maybe_went_rw": 1, > "up": [ > 0 > ], > "acting": [ > 0 > ], > "primary": 0, > "up_primary": 0 > }, > { > "first": 356921, > "last": 356921, > "maybe_went_rw": 0, > "up": [], > "acting": [], > "primary": -1, > "up_primary": -1 > }, > { > "first": 356922, > "last": 356958, > "maybe_went_rw": 1, > "up": [ > 0 > ], > "acting": [ > 0 > ], > "primary": 0, > "up_primary": 0 > }, > { > "first": 356959, > "last": 356963, > "maybe_went_rw": 1, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 356964, > "last": 357025, > "maybe_went_rw": 1, > "up": [ > 0 > ], > "acting": [ > 0 > ], > "primary": 0, > "up_primary": 0 > }, > { > "first": 357026, > "last": 357026, > "maybe_went_rw": 0, > "up": [], > "acting": [], > "primary": -1, > "up_primary": -1 > }, > { > "first": 357027, > "last": 357041, > "maybe_went_rw": 1, > "up": [ > 0 > ], > "acting": [ > 0 > ], > "primary": 0, > "up_primary": 0 > }, > { > "first": 357042, > "last": 357081, > "maybe_went_rw": 1, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 357082, > "last": 357082, > "maybe_went_rw": 0, > "up": [ > 6 > ], > "acting": [ > 6 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 357083, > "last": 357088, > "maybe_went_rw": 0, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 357089, > "last": 357089, > "maybe_went_rw": 0, > "up": [ > 0 > ], > "acting": [ > 0 > ], > "primary": 0, > "up_primary": 0 > }, > { > "first": 357090, > "last": 357167, > "maybe_went_rw": 1, > "up": [ > 6, > 0 > ], > "acting": [ > 6, > 0 > ], > "primary": 6, > "up_primary": 6 > }, > { > "first": 357168, > "last": 357217, > "maybe_went_rw": 1, > "up": [ > 0 > ], > "acting": [ > 0 > ], > "primary": 0, > "up_primary": 0 > } > ], > "probing_osds": [ > "0", > "6" > ], > "down_osds_we_would_probe": [], > "peering_blocked_by": [] > }, > { > "name": "Started", > "enter_time": "2015-12-14 12:54:41.084717" > } > ], > "agent_state": {} > } > > > Chris > _______________________________________________ > ceph-users mailing list > ceph-users@xxxxxxxxxxxxxx > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com -----BEGIN PGP SIGNATURE----- Version: Mailvelope v1.3.2 Comment: https://www.mailvelope.com wsFcBAEBCAAQBQJWbkE0CRDmVDuy+mK58QAAt1gP/jHXE8xMKGBQI2rwFr3y FkIzGKVzXfRhGW1u1gL1GwNMwl/i0g2Zworc9ZAnb2vsnzP9liG4jRQvnidL KOcQ1RO0NjXfy2FULuZ0NTzcwhbgfmNR8cDtr5D3+kMIungEtP9chardUxqB iFK4e+Xj8/Bb9Bk/fuHkWBLCnCIuTZr04pU+4EWLJJ8jlPdOEGzDiNxoGIQc pQ0h8+ICKlA+TpxKNcFRhXJzWyvy2/rWft7UaM4dg9PmGD4DECoVgdUMfsvx OouS0jSrw9OtR5Gr8g2xCPQiV+bwOW7Gd+G+G5jHXD1gsXrGOVhq1Hasvj5V kVj2sHi+as+GfBQjwEKlv/yh5b1D8a5+GF/145IbPIsJbMn6bYwBQyYe/Ll1 HRNKYZrJ8R5qnZlIY6uUCNZRKcG9nFCVyIDpzi68KrLpK7W+vn4GAlfMOmgL yVGfpp418XPw7soNkFWeET/qhRYLGcBNqUA1TN+FIiOUuBpBsmJ+hxmBzgO4 iFDintX0gXhzRej3Qh0lDXWDb1F69ozYes+CJkKLt3MfbRca5zyKkHbwGu0G Ssj0zePDT+OekuElcuvSA9yj0NNCInn7FZGP0Wr/wOAnVX7VtFNhQHjWAwEv NDN0F6JH3zfs6/miSOTzWK8nv4W/6C1OXdGVxlfGmQTm008ThUlLdrDl3MmE bhtB =PjKy -----END PGP SIGNATURE----- _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com