Hello,
We have been having problems with one PG since this morning. This is what we
have found so far:
# ceph --version
ceph version 10.2.0 (3a9fba20ec743699b69bd0181dd6c54dc01c64b9)
# ceph -s
cluster 2bf80721-fceb-4b63-89ee-1a5faa278493
health HEALTH_ERR
1 pgs inconsistent
2 scrub errors
monmap e1: 1 mons at {cephadm01=192.168.12.150:6789/0}
election epoch 7, quorum 0 cephadm01
osdmap e580: 9 osds: 9 up, 9 in
flags sortbitwise
pgmap v11430755: 664 pgs, 13 pools, 1010 GB data, 13894 kobjects
2142 GB used, 2355 GB / 4497 GB avail
660 active+clean
3 active+clean+scrubbing
1 active+clean+inconsistent
# ceph health detail
HEALTH_ERR 1 pgs inconsistent; 2 scrub errors
pg 10.55 is active+clean+inconsistent, acting [3,4]
2 scrub errors
# ceph pg 10.55 query
{
"state": "active+clean+inconsistent",
"snap_trimq": "[]",
"epoch": 580,
"up": [
3,
4
],
"acting": [
3,
4
],
"actingbackfill": [
"3",
"4"
],
"info": {
"pgid": "10.55",
"last_update": "580'40334",
"last_complete": "580'40334",
"log_tail": "448'37299",
"last_user_version": 40334,
"last_backfill": "MAX",
"last_backfill_bitwise": 1,
"purged_snaps": "[]",
"history": {
"epoch_created": 329,
"last_epoch_started": 577,
"last_epoch_clean": 577,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 576,
"same_interval_since": 576,
"same_primary_since": 572,
"last_scrub": "568'40333",
"last_scrub_stamp": "2017-01-26 10:06:56.062870",
"last_deep_scrub": "562'40329",
"last_deep_scrub_stamp": "2017-01-26 06:19:55.708518",
"last_clean_scrub_stamp": "2016-07-05 14:58:45.534218"
},
"stats": {
"version": "580'40334",
"reported_seq": "49407",
"reported_epoch": "580",
"state": "active+clean+inconsistent",
"last_fresh": "2017-01-26 11:21:55.393989",
"last_change": "2017-01-26 10:06:56.062930",
"last_active": "2017-01-26 11:21:55.393989",
"last_peered": "2017-01-26 11:21:55.393989",
"last_clean": "2017-01-26 11:21:55.393989",
"last_became_active": "2017-01-26 09:28:09.196447",
"last_became_peered": "2017-01-26 09:28:09.196447",
"last_unstale": "2017-01-26 11:21:55.393989",
"last_undegraded": "2017-01-26 11:21:55.393989",
"last_fullsized": "2017-01-26 11:21:55.393989",
"mapping_epoch": 575,
"log_start": "448'37299",
"ondisk_log_start": "448'37299",
"created": 329,
"last_epoch_clean": 577,
"parent": "0.0",
"parent_split_bits": 8,
"last_scrub": "568'40333",
"last_scrub_stamp": "2017-01-26 10:06:56.062870",
"last_deep_scrub": "562'40329",
"last_deep_scrub_stamp": "2017-01-26 06:19:55.708518",
"last_clean_scrub_stamp": "2016-07-05 14:58:45.534218",
"log_size": 3035,
"ondisk_log_size": 3035,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"stat_sum": {
"num_bytes": 2153869599,
"num_objects": 28148,
"num_object_clones": 0,
"num_object_copies": 56296,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 28148,
"num_whiteouts": 0,
"num_read": 21,
"num_read_kb": 696,
"num_write": 50,
"num_write_kb": 217,
"num_scrub_errors": 2,
"num_shallow_scrub_errors": 1,
"num_deep_scrub_errors": 1,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0
},
"up": [
3,
4
],
"acting": [
3,
4
],
"blocked_by": [],
"up_primary": 3,
"acting_primary": 3
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 577,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"peer_info": [
{
"peer": "4",
"pgid": "10.55",
"last_update": "580'40334",
"last_complete": "580'40334",
"log_tail": "448'37299",
"last_user_version": 40333,
"last_backfill": "MAX",
"last_backfill_bitwise": 1,
"purged_snaps": "[]",
"history": {
"epoch_created": 329,
"last_epoch_started": 577,
"last_epoch_clean": 577,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 576,
"same_interval_since": 576,
"same_primary_since": 572,
"last_scrub": "568'40333",
"last_scrub_stamp": "2017-01-26 10:06:56.062870",
"last_deep_scrub": "562'40329",
"last_deep_scrub_stamp": "2017-01-26 06:19:55.708518",
"last_clean_scrub_stamp": "2016-07-05 14:58:45.534218"
},
"stats": {
"version": "568'40333",
"reported_seq": "49386",
"reported_epoch": "571",
"state": "inconsistent+peering",
"last_fresh": "2017-01-26 09:25:26.210512",
"last_change": "2017-01-26 09:25:26.210512",
"last_active": "2017-01-26 08:40:35.617481",
"last_peered": "2017-01-26 08:40:35.617481",
"last_clean": "2017-01-26 08:40:35.617481",
"last_became_active": "2017-01-26 08:29:09.145329",
"last_became_peered": "2017-01-26 08:29:09.145329",
"last_unstale": "2017-01-26 09:25:26.210512",
"last_undegraded": "2017-01-26 09:25:26.210512",
"last_fullsized": "2017-01-26 09:25:26.210512",
"mapping_epoch": 575,
"log_start": "448'37299",
"ondisk_log_start": "448'37299",
"created": 329,
"last_epoch_clean": 568,
"parent": "0.0",
"parent_split_bits": 8,
"last_scrub": "562'40329",
"last_scrub_stamp": "2017-01-26 06:19:55.708518",
"last_deep_scrub": "562'40329",
"last_deep_scrub_stamp": "2017-01-26 06:19:55.708518",
"last_clean_scrub_stamp": "2016-07-05 14:58:45.534218",
"log_size": 3034,
"ondisk_log_size": 3034,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"stat_sum": {
"num_bytes": 2153849873,
"num_objects": 28147,
"num_object_clones": 0,
"num_object_copies": 56294,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 28147,
"num_whiteouts": 0,
"num_read": 6,
"num_read_kb": 110,
"num_write": 40,
"num_write_kb": 197,
"num_scrub_errors": 1,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 1,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0
},
"up": [
3,
4
],
"acting": [
3,
4
],
"blocked_by": [],
"up_primary": 3,
"acting_primary": 3
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 577,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [
{
"name": "Started\/Primary\/Active",
"enter_time": "2017-01-26 09:28:09.159017",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "MIN",
"backfill_info": {
"begin": "MIN",
"end": "MIN",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "576",
"scrubber.active": 0,
"scrubber.state": "INACTIVE",
"scrubber.start": "MIN",
"scrubber.end": "MIN",
"scrubber.subset_last_update": "0'0",
"scrubber.deep": false,
"scrubber.seed": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2017-01-26 09:28:08.166221"
}
],
"agent_state": {}
}
# grep -Hn 'ERR' /var/log/ceph/ceph-osd.3.log
/var/log/ceph/ceph-osd.3.log:47:2017-01-26 06:08:48.147129
7f3fda627700 -1 log_channel(cluster) log [ERR] : 10.55 shard 3: soid
10:aa0c6d9c:::ef4069bf-70fb-4414-a9d9-6bf5b32608fb.34127.33_nalazi%2f201607%2fLab_7bd28004-cc9d-4039-9567-7f5c597f6d88.pdf:head data_digest 0xc44df2ba != known data_digest 0xff59029 from auth shard
4
/var/log/ceph/ceph-osd.3.log:48:2017-01-26 06:19:55.708507
7f3fda627700 -1 log_channel(cluster) log [ERR] : 10.55 deep-scrub 0
missing, 1 inconsistent objects
/var/log/ceph/ceph-osd.3.log:49:2017-01-26 06:19:55.708513
7f3fda627700 -1 log_channel(cluster) log [ERR] : 10.55 deep-scrub 1
errors
/var/log/ceph/ceph-osd.3.log:7464:2017-01-26 10:00:48.267401
7fa6970c2700 -1 log_channel(cluster) log [ERR] : 10.55 shard 3
missing
10:aa0c6d9c:::ef4069bf-70fb-4414-a9d9-6bf5b32608fb.34127.33_nalazi%2f201607%2fLab_7bd28004-cc9d-4039-9567-7f5c597f6d88.pdf:head
/var/log/ceph/ceph-osd.3.log:7467:2017-01-26 10:06:56.062852
7fa6970c2700 -1 log_channel(cluster) log [ERR] : 10.55 scrub 1
missing, 0 inconsistent objects
/var/log/ceph/ceph-osd.3.log:7468:2017-01-26 10:06:56.062858
7fa6970c2700 -1 log_channel(cluster) log [ERR] : 10.55 scrub 1
errors ( 1 remaining deep scrub error(s) )
We have located the inconsistent PG on osd.3 and have tried the following:
- Stopped osd.3.
- Flushed its journal.
- Moved the object to /tmp.
- Started osd.3 again.
- Ran "ceph pg repair 10.55", but no luck...
Is there a Ceph tool we could use to copy the healthy object from
osd.4 to osd.3?
Best regards!