Hey Florian, What does the ceph.log ERR or ceph-osd log show for this inconsistency? -- Dan On Mon, Oct 14, 2019 at 1:04 PM Florian Haas <florian@xxxxxxxxxxxxxx> wrote: > > Hello, > > I am running into an "interesting" issue with a PG that is being flagged > as inconsistent during scrub (causing the cluster to go to HEALTH_ERR), > but doesn't actually appear to contain any inconsistent objects. > > $ ceph health detail > HEALTH_ERR 1 scrub errors; Possible data damage: 1 pg inconsistent > OSD_SCRUB_ERRORS 1 scrub errors > PG_DAMAGED Possible data damage: 1 pg inconsistent > pg 10.10d is active+clean+inconsistent, acting [15,13] > > $ rados list-inconsistent-obj 10.10d > {"epoch":12138,"inconsistents":[]} > > "ceph pg query" (see below) on that PG does report num_scrub_errors=1, > num_shallow_scrub_errors=1, and num_objects_dirty=1. "osd scrub auto > repair = true" is set on all OSDs, but the PG never auto-repairs. (This > is a test cluster, the pool size is 2 — this may preclude auto repair > from ever kicking in; I'm not sure on that one.) > > "ceph pg repair" does repair, but the issue reappears on the next > scheduled scrub. > > This issue was first discovered while the cluster was on > Jewel/FileStore. In an event like this I would normally suspect either a > problem with an individual OSD, or a bug in the FileStore code. But the > cluster has had *all* of its OSDs replaced since, as part of a full > Jewel→Luminous→Nautilus upgrade and a FileStore→BlueStore conversion. > The issue still persists. > > A full "ceph pg 10.10d query" result is below. If anyone has ideas on > how to permanently fix this issue, I'd be most grateful. > > Thanks! 
> > Cheers, > Florian > > > > > { > "state": "active+clean+inconsistent", > "snap_trimq": "[]", > "snap_trimq_len": 0, > "epoch": 12143, > "up": [ > 15, > 13 > ], > "acting": [ > 15, > 13 > ], > "acting_recovery_backfill": [ > "13", > "15" > ], > "info": { > "pgid": "10.10d", > "last_update": "100'11", > "last_complete": "100'11", > "log_tail": "0'0", > "last_user_version": 11, > "last_backfill": "MAX", > "last_backfill_bitwise": 0, > "purged_snaps": [], > "history": { > "epoch_created": 45, > "epoch_pool_created": 45, > "last_epoch_started": 12139, > "last_interval_started": 12138, > "last_epoch_clean": 12139, > "last_interval_clean": 12138, > "last_epoch_split": 0, > "last_epoch_marked_full": 0, > "same_up_since": 12138, > "same_interval_since": 12138, > "same_primary_since": 12114, > "last_scrub": "100'11", > "last_scrub_stamp": "2019-10-14 08:33:57.347097", > "last_deep_scrub": "100'11", > "last_deep_scrub_stamp": "2019-10-11 14:09:29.016946", > "last_clean_scrub_stamp": "2019-10-11 14:09:29.016946" > }, > "stats": { > "version": "100'11", > "reported_seq": "4927", > "reported_epoch": "12143", > "state": "active+clean+inconsistent", > "last_fresh": "2019-10-14 08:33:57.347147", > "last_change": "2019-10-14 08:33:57.347147", > "last_active": "2019-10-14 08:33:57.347147", > "last_peered": "2019-10-14 08:33:57.347147", > "last_clean": "2019-10-14 08:33:57.347147", > "last_became_active": "2019-10-11 14:44:09.312226", > "last_became_peered": "2019-10-11 14:44:09.312226", > "last_unstale": "2019-10-14 08:33:57.347147", > "last_undegraded": "2019-10-14 08:33:57.347147", > "last_fullsized": "2019-10-14 08:33:57.347147", > "mapping_epoch": 12138, > "log_start": "0'0", > "ondisk_log_start": "0'0", > "created": 45, > "last_epoch_clean": 12139, > "parent": "0.0", > "parent_split_bits": 0, > "last_scrub": "100'11", > "last_scrub_stamp": "2019-10-14 08:33:57.347097", > "last_deep_scrub": "100'11", > "last_deep_scrub_stamp": "2019-10-11 14:09:29.016946", > 
"last_clean_scrub_stamp": "2019-10-11 14:09:29.016946", > "log_size": 11, > "ondisk_log_size": 11, > "stats_invalid": false, > "dirty_stats_invalid": false, > "omap_stats_invalid": false, > "hitset_stats_invalid": false, > "hitset_bytes_stats_invalid": true, > "pin_stats_invalid": true, > "manifest_stats_invalid": true, > "snaptrimq_len": 0, > "stat_sum": { > "num_bytes": 11, > "num_objects": 1, > "num_object_clones": 0, > "num_object_copies": 2, > "num_objects_missing_on_primary": 0, > "num_objects_missing": 0, > "num_objects_degraded": 0, > "num_objects_misplaced": 0, > "num_objects_unfound": 0, > "num_objects_dirty": 1, > "num_whiteouts": 0, > "num_read": 33, > "num_read_kb": 22, > "num_write": 11, > "num_write_kb": 6, > "num_scrub_errors": 1, > "num_shallow_scrub_errors": 1, > "num_deep_scrub_errors": 0, > "num_objects_recovered": 2, > "num_bytes_recovered": 22, > "num_keys_recovered": 0, > "num_objects_omap": 0, > "num_objects_hit_set_archive": 0, > "num_bytes_hit_set_archive": 0, > "num_flush": 0, > "num_flush_kb": 0, > "num_evict": 0, > "num_evict_kb": 0, > "num_promote": 0, > "num_flush_mode_high": 0, > "num_flush_mode_low": 0, > "num_evict_mode_some": 0, > "num_evict_mode_full": 0, > "num_objects_pinned": 0, > "num_legacy_snapsets": 0, > "num_large_omap_objects": 0, > "num_objects_manifest": 0, > "num_omap_bytes": 0, > "num_omap_keys": 0, > "num_objects_repaired": 0 > }, > "up": [ > 15, > 13 > ], > "acting": [ > 15, > 13 > ], > "avail_no_missing": [], > "object_location_counts": [], > "blocked_by": [], > "up_primary": 15, > "acting_primary": 15, > "purged_snaps": [] > }, > "empty": 0, > "dne": 0, > "incomplete": 0, > "last_epoch_started": 12139, > "hit_set_history": { > "current_last_update": "0'0", > "history": [] > } > }, > "peer_info": [ > { > "peer": "13", > "pgid": "10.10d", > "last_update": "100'11", > "last_complete": "100'11", > "log_tail": "0'0", > "last_user_version": 11, > "last_backfill": "MAX", > "last_backfill_bitwise": 0, > "purged_snaps": 
[], > "history": { > "epoch_created": 45, > "epoch_pool_created": 45, > "last_epoch_started": 12139, > "last_interval_started": 12138, > "last_epoch_clean": 12139, > "last_interval_clean": 12138, > "last_epoch_split": 0, > "last_epoch_marked_full": 0, > "same_up_since": 12138, > "same_interval_since": 12138, > "same_primary_since": 12114, > "last_scrub": "100'11", > "last_scrub_stamp": "2019-10-14 08:33:57.347097", > "last_deep_scrub": "100'11", > "last_deep_scrub_stamp": "2019-10-11 14:09:29.016946", > "last_clean_scrub_stamp": "2019-10-11 14:09:29.016946" > }, > "stats": { > "version": "100'11", > "reported_seq": "36", > "reported_epoch": "12113", > "state": "active+undersized+degraded", > "last_fresh": "2019-10-11 14:39:58.946532", > "last_change": "2019-10-11 14:39:58.924989", > "last_active": "2019-10-11 14:39:58.946532", > "last_peered": "2019-10-11 14:39:58.946532", > "last_clean": "2014-11-05 15:48:35.131248", > "last_became_active": "2019-10-11 14:39:58.924989", > "last_became_peered": "2019-10-11 14:39:58.924989", > "last_unstale": "2019-10-11 14:39:58.946532", > "last_undegraded": "2019-10-11 14:39:58.892352", > "last_fullsized": "2019-10-11 14:39:58.892331", > "mapping_epoch": 12138, > "log_start": "0'0", > "ondisk_log_start": "0'0", > "created": 45, > "last_epoch_clean": 12103, > "parent": "0.0", > "parent_split_bits": 0, > "last_scrub": "100'11", > "last_scrub_stamp": "2019-10-11 14:09:29.016946", > "last_deep_scrub": "100'11", > "last_deep_scrub_stamp": "2019-10-11 14:09:29.016946", > "last_clean_scrub_stamp": "2019-10-11 14:09:29.016946", > "log_size": 11, > "ondisk_log_size": 11, > "stats_invalid": false, > "dirty_stats_invalid": false, > "omap_stats_invalid": false, > "hitset_stats_invalid": false, > "hitset_bytes_stats_invalid": true, > "pin_stats_invalid": true, > "manifest_stats_invalid": true, > "snaptrimq_len": 0, > "stat_sum": { > "num_bytes": 11, > "num_objects": 1, > "num_object_clones": 0, > "num_object_copies": 2, > 
"num_objects_missing_on_primary": 0, > "num_objects_missing": 0, > "num_objects_degraded": 1, > "num_objects_misplaced": 0, > "num_objects_unfound": 0, > "num_objects_dirty": 1, > "num_whiteouts": 0, > "num_read": 33, > "num_read_kb": 22, > "num_write": 11, > "num_write_kb": 6, > "num_scrub_errors": 0, > "num_shallow_scrub_errors": 0, > "num_deep_scrub_errors": 0, > "num_objects_recovered": 2, > "num_bytes_recovered": 22, > "num_keys_recovered": 0, > "num_objects_omap": 0, > "num_objects_hit_set_archive": 0, > "num_bytes_hit_set_archive": 0, > "num_flush": 0, > "num_flush_kb": 0, > "num_evict": 0, > "num_evict_kb": 0, > "num_promote": 0, > "num_flush_mode_high": 0, > "num_flush_mode_low": 0, > "num_evict_mode_some": 0, > "num_evict_mode_full": 0, > "num_objects_pinned": 0, > "num_legacy_snapsets": 0, > "num_large_omap_objects": 0, > "num_objects_manifest": 0, > "num_omap_bytes": 0, > "num_omap_keys": 0, > "num_objects_repaired": 0 > }, > "up": [ > 15, > 13 > ], > "acting": [ > 15, > 13 > ], > "avail_no_missing": [ > "13" > ], > "object_location_counts": [ > { > "shards": "13", > "objects": 1 > } > ], > "blocked_by": [], > "up_primary": 15, > "acting_primary": 15, > "purged_snaps": [] > }, > "empty": 0, > "dne": 0, > "incomplete": 0, > "last_epoch_started": 12139, > "hit_set_history": { > "current_last_update": "0'0", > "history": [] > } > } > ], > "recovery_state": [ > { > "name": "Started/Primary/Active", > "enter_time": "2019-10-11 14:44:09.175574", > "might_have_unfound": [], > "recovery_progress": { > "backfill_targets": [], > "waiting_on_backfill": [], > "last_backfill_started": "MIN", > "backfill_info": { > "begin": "MIN", > "end": "MIN", > "objects": [] > }, > "peer_backfill_info": [], > "backfills_in_flight": [], > "recovering": [], > "pg_backend": { > "pull_from_peer": [], > "pushing": [] > } > }, > "scrub": { > "scrubber.epoch_start": "12138", > "scrubber.active": false, > "scrubber.state": "INACTIVE", > "scrubber.start": "MIN", > "scrubber.end": "MIN", > 
"scrubber.max_end": "MIN", > "scrubber.subset_last_update": "0'0", > "scrubber.deep": false, > "scrubber.waiting_on_whom": [] > } > }, > { > "name": "Started", > "enter_time": "2019-10-11 14:44:08.833757" > } > ], > "agent_state": {} > } > _______________________________________________ > ceph-users mailing list -- ceph-users@xxxxxxx > To unsubscribe send an email to ceph-users-leave@xxxxxxx _______________________________________________ ceph-users mailing list -- ceph-users@xxxxxxx To unsubscribe send an email to ceph-users-leave@xxxxxxx