Hi
We have a weird issue with our ceph cluster - almost all PGs assigned to
one specific pool became stuck, locking out all operations without
reporting any errors.
Story:
We have 3 different pools, hdd-backed, ssd-backed and nvme-backed.
Pool ssd worked fine for a few months.
Today one of the hosts assigned to nvme pool restarted triggering
recovery in that pool. It went fast and the cluster went to OK state.
During these events or shortly after them ssd pool became unresponsive.
It was impossible to either read or write from/to it.
We decided to slowly restart first the OSDs assigned to it, then, as it
didn't help - all the mons, without breaking quorum of course.
At this moment both nvme and hdd pools are working fine, ssd one is
stuck in recovery.
All OSDs in that ssd pool use large amount of CPU and are exchanging
approx 1Mpps per OSD server between each other.
PGs seem to be slowly migrating from peering to activating but it's
going very slowly - approx 10PGs during last hour.
We were using 14.2.2 OSDs when issues happened, upgrade to 14.2.13
didn't help. We increased heartbeat grace, but it didn't change
anything.
It doesn't seem that there's a network problem as OSDs don't report
problems with connecting to MONs or each other. Other OSDs - nvme,
connected to that same set of switches work without issues.
Can you help? Point me to what I should check or do? I looked on-line
and on the group for causes of peering issues and checked most of them,
nothing helped.
I can't use 'ceph pg 28.1cc query' as it hangs, even for PGs that are
marked as active+clean in the results of 'ceph pg dump'
I checked the status of one of the stuck PGs via ceph-objectstore-tool
--data-path [...] --op info --pgid 28.29d for all three copies and got:
{
"pgid": "28.29d",
"last_update": "68160'205094",
"last_complete": "68160'205094",
"log_tail": "68062'202000",
"last_user_version": 205094,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [
{
"start": "1",
"length": "3"
}
],
"history": {
"epoch_created": 67698,
"epoch_pool_created": 67698,
"last_epoch_started": 68871,
"last_interval_started": 68851,
"last_epoch_clean": 67746,
"last_interval_clean": 67745,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 69447,
"same_interval_since": 69447,
"same_primary_since": 69411,
"last_scrub": "68062'199623",
"last_scrub_stamp": "2020-11-03 03:32:46.895988",
"last_deep_scrub": "68062'177321",
"last_deep_scrub_stamp": "2020-11-02 01:07:15.963916",
"last_clean_scrub_stamp": "2020-11-03 03:32:46.895988"
},
"stats": {
"version": "68160'205094",
"reported_seq": "378496",
"reported_epoch": "69447",
"state": "peering",
"last_fresh": "2020-11-03 20:55:39.247348",
"last_change": "2020-11-03 20:55:39.247348",
"last_active": "2020-11-03 15:26:24.270088",
"last_peered": "2020-11-03 19:04:43.152655",
"last_clean": "2020-11-03 14:45:02.988293",
"last_became_active": "2020-09-01 13:52:40.091759",
"last_became_peered": "2020-11-03 19:04:42.939991",
"last_unstale": "2020-11-03 20:55:39.247348",
"last_undegraded": "2020-11-03 20:55:39.247348",
"last_fullsized": "2020-11-03 20:55:39.247348",
"mapping_epoch": 69447,
"log_start": "68062'202000",
"ondisk_log_start": "68062'202000",
"created": 67698,
"last_epoch_clean": 67746,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "68062'199623",
"last_scrub_stamp": "2020-11-03 03:32:46.895988",
"last_deep_scrub": "68062'177321",
"last_deep_scrub_stamp": "2020-11-02 01:07:15.963916",
"last_clean_scrub_stamp": "2020-11-03 03:32:46.895988",
"log_size": 3094,
"ondisk_log_size": 3094,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 15173849600,
"num_objects": 3647,
"num_object_clones": 0,
"num_object_copies": 10941,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 3647,
"num_whiteouts": 0,
"num_read": 172836,
"num_read_kb": 6824184,
"num_write": 196190,
"num_write_kb": 21380176,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0,
"num_omap_bytes": 0,
"num_omap_keys": 0,
"num_objects_repaired": 0
},
"up": [
261,
284,
271
],
"acting": [
261,
284,
271
],
"avail_no_missing": [],
"object_location_counts": [],
"blocked_by": [
271,
284
],
"up_primary": 261,
"acting_primary": 261,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 69422,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
{
"pgid": "28.29d",
"last_update": "68160'205094",
"last_complete": "68160'205094",
"log_tail": "68062'202000",
"last_user_version": 205094,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [
{
"start": "1",
"length": "3"
}
],
"history": {
"epoch_created": 67698,
"epoch_pool_created": 67698,
"last_epoch_started": 68871,
"last_interval_started": 68851,
"last_epoch_clean": 67746,
"last_interval_clean": 67745,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 69630,
"same_interval_since": 69630,
"same_primary_since": 69628,
"last_scrub": "68062'199623",
"last_scrub_stamp": "2020-11-03 03:32:46.895988",
"last_deep_scrub": "68062'177321",
"last_deep_scrub_stamp": "2020-11-02 01:07:15.963916",
"last_clean_scrub_stamp": "2020-11-03 03:32:46.895988"
},
"stats": {
"version": "68160'205094",
"reported_seq": "378445",
"reported_epoch": "69627",
"state": "peering",
"last_fresh": "2020-11-03 21:15:08.819278",
"last_change": "2020-11-03 21:14:18.360957",
"last_active": "2020-11-03 15:26:24.270088",
"last_peered": "2020-11-03 19:04:43.152655",
"last_clean": "2020-11-03 14:45:02.988293",
"last_became_active": "2020-09-01 13:52:40.091759",
"last_became_peered": "2020-11-03 19:04:42.939991",
"last_unstale": "2020-11-03 21:15:08.819278",
"last_undegraded": "2020-11-03 21:15:08.819278",
"last_fullsized": "2020-11-03 21:15:08.819278",
"mapping_epoch": 69630,
"log_start": "68062'202000",
"ondisk_log_start": "68062'202000",
"created": 67698,
"last_epoch_clean": 67746,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "68062'199623",
"last_scrub_stamp": "2020-11-03 03:32:46.895988",
"last_deep_scrub": "68062'177321",
"last_deep_scrub_stamp": "2020-11-02 01:07:15.963916",
"last_clean_scrub_stamp": "2020-11-03 03:32:46.895988",
"log_size": 3094,
"ondisk_log_size": 3094,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 15173849600,
"num_objects": 3647,
"num_object_clones": 0,
"num_object_copies": 10941,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 3647,
"num_whiteouts": 0,
"num_read": 172836,
"num_read_kb": 6824184,
"num_write": 196190,
"num_write_kb": 21380176,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0,
"num_omap_bytes": 0,
"num_omap_keys": 0,
"num_objects_repaired": 0
},
"up": [
261,
284
],
"acting": [
261,
284
],
"avail_no_missing": [],
"object_location_counts": [],
"blocked_by": [
271
],
"up_primary": 261,
"acting_primary": 261,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 69392,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
{
"pgid": "28.29d",
"last_update": "68160'205094",
"last_complete": "68160'205094",
"log_tail": "68062'202000",
"last_user_version": 205094,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [
{
"start": "1",
"length": "3"
}
],
"history": {
"epoch_created": 67698,
"epoch_pool_created": 67698,
"last_epoch_started": 68871,
"last_interval_started": 68851,
"last_epoch_clean": 67746,
"last_interval_clean": 67745,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 69411,
"same_interval_since": 69411,
"same_primary_since": 69411,
"last_scrub": "68062'199623",
"last_scrub_stamp": "2020-11-03 03:32:46.895988",
"last_deep_scrub": "68062'177321",
"last_deep_scrub_stamp": "2020-11-02 01:07:15.963916",
"last_clean_scrub_stamp": "2020-11-03 03:32:46.895988"
},
"stats": {
"version": "68070'205093",
"reported_seq": "378344",
"reported_epoch": "68160",
"state": "active+clean",
"last_fresh": "2020-11-03 14:45:02.988293",
"last_change": "2020-11-03 03:32:46.896044",
"last_active": "2020-11-03 14:45:02.988293",
"last_peered": "2020-11-03 14:45:02.988293",
"last_clean": "2020-11-03 14:45:02.988293",
"last_became_active": "2020-09-01 13:52:40.091759",
"last_became_peered": "2020-09-01 13:52:40.091759",
"last_unstale": "2020-11-03 14:45:02.988293",
"last_undegraded": "2020-11-03 14:45:02.988293",
"last_fullsized": "2020-11-03 14:45:02.988293",
"mapping_epoch": 69411,
"log_start": "68062'202000",
"ondisk_log_start": "68062'202000",
"created": 67698,
"last_epoch_clean": 67746,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "68062'199623",
"last_scrub_stamp": "2020-11-03 03:32:46.895988",
"last_deep_scrub": "68062'177321",
"last_deep_scrub_stamp": "2020-11-02 01:07:15.963916",
"last_clean_scrub_stamp": "2020-11-03 03:32:46.895988",
"log_size": 3093,
"ondisk_log_size": 3093,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 15173849600,
"num_objects": 3647,
"num_object_clones": 0,
"num_object_copies": 10941,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 3647,
"num_whiteouts": 0,
"num_read": 172836,
"num_read_kb": 6824184,
"num_write": 196190,
"num_write_kb": 21380176,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0,
"num_omap_bytes": 0,
"num_omap_keys": 0,
"num_objects_repaired": 0
},
"up": [
261,
284,
271
],
"acting": [
261,
284,
271
],
"avail_no_missing": [],
"object_location_counts": [],
"blocked_by": [],
"up_primary": 261,
"acting_primary": 261,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 67746,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
Current status of the cluster:
Reduced data availability: 1021 pgs inactive, 999 pgs
peering
Degraded data redundancy: 18357/94939584 objects degraded
(0.019%), 3 pgs degraded, 5 pgs undersized
services:
mon: 3 daemons, quorum monb01,monb02,monb03
mgr: monb03(active), standbys: monb01, monb02
osd: 285 osds: 284 up, 284 in
data:
pools: 9 pools, 9546 pgs
objects: 31.65 M objects, 120 TiB
usage: 363 TiB used, 127 TiB / 490 TiB avail
pgs: 10.696% pgs not active
18357/94939584 objects degraded (0.019%)
8520 active+clean
999 peering
18 activating
3 active+clean+scrubbing+deep
2 activating+undersized+degraded
2 activating+undersized
1 active+clean+scrubbing
1 active+undersized+degraded
io:
client: 367 MiB/s rd, 195 MiB/s wr, 24.51 kop/s rd, 5.95 kop/s wr
cache: 24 MiB/s flush, 90 MiB/s evict, 23 op/s promote
_______________________________________________
ceph-users mailing list -- ceph-users@xxxxxxx
To unsubscribe send an email to ceph-users-leave@xxxxxxx