Hello, We have a cluster with HEALTH_ERR due to an inconsistent PG: HEALTH_ERR 1 pgs inconsistent; 1 scrub errors pg 2.ae is active+clean+inconsistent, acting [11,4] 1 scrub errors We ran ceph pg repair on the problematic PG and health went back to OK. I checked the two OSDs acting on that PG (we have 2 replicas here) and one of them had I/O errors, which we assume was the cause of the inconsistent PG in the first place. So, to avoid further problems, we want to remove the disk from the cluster. However, as soon as we stop the OSD, the PG becomes inconsistent again and recovery won't start. Any ideas of what could be happening? Why does the PG go back to inconsistent? How can we remove the failing disk? I can't find any ERR in the logs of the OSDs, only in the monitors' logs. So I can't see if there is a specific object causing the inconsistent state (doesn't seem to be the case). I attach the output of ceph pg query taken while the cluster was in HEALTH_ERR. Any help would be much appreciated. Thanks! -- Ana Avilés Greenhost - sustainable hosting & digital security E: ana@xxxxxxxxxxxx T: +31 20 4890444 W: https://greenhost.nl
{ "state": "active+clean+inconsistent", "snap_trimq": "[]", "epoch": 198938, "up": [ 11, 4 ], "acting": [ 11, 4 ], "actingbackfill": [ "4", "11" ], "info": { "pgid": "2.ae", "last_update": "198925'737155", "last_complete": "198925'737155", "log_tail": "198005'733960", "last_user_version": 737036, "last_backfill": "MAX", "last_backfill_bitwise": 0, "purged_snaps": "[1~12d,131~2538,266a~19,2686~2,2689~7,2691~4b0,2b43~8,2b4c~3,2b50~c,2b61~24,2b88~c,2b9a~11,2bb2~5,2bb8~2,2bbb~4,2bc0~1416,3fd7~1730,5708~9,5712~88b,5f9e~1373,7312~9c0,7cd3~5,7cd9~e36,8b10~935,9446~c0b,a053~29,a07d~1b7c,bbfa~1d8d,d988~c83,e60c~299c,10fa9~a7c,11a26~2719,14140~1,14144~8,1414d~1197,152e5~2098,1737e~264a,199c9~11,199db~57e,19f5a~1c2e,1bb89~2b3,1bed9~b3,1bf91~1,1bfa2~4e0,1c483~187,1c60b~113e,1d74a~15c,1d8a7~31,1d8d9~4,1d8de~6d1,1e04c~b2,1e102~1,1e111~422,1e534~2a,1e55f~140,1e739~14e,1e899~15d,1ea8d~b4,1eb51~59d,1f0ef~1d6,1f35f~b0,1f41e~24,1f443~179,1f5bd~666,1fcbc~b4,1fd80~1aa,1ff2b~3a9,20369~ab,204bb~aa,2060d~b1,20764~aa,208b5~aa,20a09~ac,20b5f~ab]", "history": { "epoch_created": 1, "last_epoch_started": 198937, "last_epoch_clean": 198937, "last_epoch_split": 0, "last_epoch_marked_full": 154843, "same_up_since": 198936, "same_interval_since": 198936, "same_primary_since": 198936, "last_scrub": "198925'737155", "last_scrub_stamp": "2016-07-29 19:07:48.694564", "last_deep_scrub": "198925'737155", "last_deep_scrub_stamp": "2016-07-29 19:07:48.694564", "last_clean_scrub_stamp": "2016-07-29 19:07:48.694564" }, "stats": { "version": "198925'737155", "reported_seq": "1226184", "reported_epoch": "198937", "state": "active+clean+inconsistent", "last_fresh": "2016-07-29 19:14:30.876365", "last_change": "2016-07-29 19:14:30.876365", "last_active": "2016-07-29 19:14:30.876365", "last_peered": "2016-07-29 19:14:30.876365", "last_clean": "2016-07-29 19:14:30.876365", "last_became_active": "0.000000", "last_became_peered": "0.000000", "last_unstale": "2016-07-29 19:14:30.876365", "last_undegraded": 
"2016-07-29 19:14:30.876365", "last_fullsized": "2016-07-29 19:14:30.876365", "mapping_epoch": 198933, "log_start": "198005'733960", "ondisk_log_start": "198005'733960", "created": 1, "last_epoch_clean": 198937, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "198925'737155", "last_scrub_stamp": "2016-07-29 19:07:48.694564", "last_deep_scrub": "198925'737155", "last_deep_scrub_stamp": "2016-07-29 19:07:48.694564", "last_clean_scrub_stamp": "2016-07-29 19:07:48.694564", "log_size": 3195, "ondisk_log_size": 3195, "stats_invalid": "0", "stat_sum": { "num_bytes": 15776131072, "num_objects": 5373, "num_object_clones": 2080, "num_object_copies": 10746, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 5373, "num_whiteouts": 0, "num_read": 78438, "num_read_kb": 3068887, "num_write": 104414, "num_write_kb": 32078753, "num_scrub_errors": 1, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 1, "num_objects_recovered": 4258, "num_bytes_recovered": 15611940864, "num_keys_recovered": 0, "num_objects_omap": 1, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0 }, "up": [ 11, 4 ], "acting": [ 11, 4 ], "blocked_by": [], "up_primary": 11, "acting_primary": 11 }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 198937, "hit_set_history": { "current_last_update": "0'0", "history": [] } }, "peer_info": [ { "peer": "4", "pgid": "2.ae", "last_update": "198925'737155", "last_complete": "198925'737155", "log_tail": "198005'733960", "last_user_version": 737036, "last_backfill": "MAX", "last_backfill_bitwise": 0, "purged_snaps": 
"[1~12d,131~2538,266a~19,2686~2,2689~7,2691~4b0,2b43~8,2b4c~3,2b50~c,2b61~24,2b88~c,2b9a~11,2bb2~5,2bb8~2,2bbb~4,2bc0~1416,3fd7~1730,5708~9,5712~88b,5f9e~1373,7312~9c0,7cd3~5,7cd9~e36,8b10~935,9446~c0b,a053~29,a07d~1b7c,bbfa~1d8d,d988~c83,e60c~299c,10fa9~a7c,11a26~2719,14140~1,14144~8,1414d~1197,152e5~2098,1737e~264a,199c9~11,199db~57e,19f5a~1c2e,1bb89~2b3,1bed9~b3,1bf91~1,1bfa2~4e0,1c483~187,1c60b~113e,1d74a~15c,1d8a7~31,1d8d9~4,1d8de~6d1,1e04c~b2,1e102~1,1e111~422,1e534~2a,1e55f~140,1e739~14e,1e899~15d,1ea8d~b4,1eb51~59d,1f0ef~1d6,1f35f~b0,1f41e~24,1f443~179,1f5bd~666,1fcbc~b4,1fd80~1aa,1ff2b~3a9,20369~ab,204bb~aa,2060d~b1,20764~aa,208b5~aa,20a09~ac,20b5f~ab]", "history": { "epoch_created": 1, "last_epoch_started": 198937, "last_epoch_clean": 198937, "last_epoch_split": 0, "last_epoch_marked_full": 154843, "same_up_since": 198936, "same_interval_since": 198936, "same_primary_since": 198936, "last_scrub": "198925'737155", "last_scrub_stamp": "2016-07-29 19:07:48.694564", "last_deep_scrub": "198925'737155", "last_deep_scrub_stamp": "2016-07-29 19:07:48.694564", "last_clean_scrub_stamp": "2016-07-29 19:07:48.694564" }, "stats": { "version": "198925'737155", "reported_seq": "1226180", "reported_epoch": "198936", "state": "active+undersized+degraded+inconsistent", "last_fresh": "2016-07-29 19:13:58.096548", "last_change": "2016-07-29 19:13:58.095929", "last_active": "2016-07-29 19:13:58.096548", "last_peered": "2016-07-29 19:13:58.096548", "last_clean": "2016-07-29 18:58:57.531797", "last_became_active": "0.000000", "last_became_peered": "0.000000", "last_unstale": "2016-07-29 19:13:58.096548", "last_undegraded": "2016-07-29 19:13:57.932878", "last_fullsized": "2016-07-29 19:13:57.932878", "mapping_epoch": 198933, "log_start": "198005'733960", "ondisk_log_start": "198005'733960", "created": 1, "last_epoch_clean": 198934, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "198925'737155", "last_scrub_stamp": "2016-07-29 19:07:48.694564", "last_deep_scrub": 
"198925'737155", "last_deep_scrub_stamp": "2016-07-29 19:07:48.694564", "last_clean_scrub_stamp": "2016-07-29 19:07:48.694564", "log_size": 3195, "ondisk_log_size": 3195, "stats_invalid": "0", "stat_sum": { "num_bytes": 15776131072, "num_objects": 5373, "num_object_clones": 2080, "num_object_copies": 10746, "num_objects_missing_on_primary": 0, "num_objects_degraded": 5373, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 5373, "num_whiteouts": 0, "num_read": 78438, "num_read_kb": 3068887, "num_write": 104414, "num_write_kb": 32078753, "num_scrub_errors": 1, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 1, "num_objects_recovered": 4258, "num_bytes_recovered": 15611940864, "num_keys_recovered": 0, "num_objects_omap": 1, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0, "num_flush": 0, "num_flush_kb": 0, "num_evict": 0, "num_evict_kb": 0, "num_promote": 0, "num_flush_mode_high": 0, "num_flush_mode_low": 0, "num_evict_mode_some": 0, "num_evict_mode_full": 0 }, "up": [ 11, 4 ], "acting": [ 11, 4 ], "blocked_by": [], "up_primary": 11, "acting_primary": 11 }, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 198937, "hit_set_history": { "current_last_update": "0'0", "history": [] } } ], "recovery_state": [ { "name": "Started\/Primary\/Active", "enter_time": "2016-07-29 19:14:30.841075", "might_have_unfound": [], "recovery_progress": { "backfill_targets": [], "waiting_on_backfill": [], "last_backfill_started": "MIN", "backfill_info": { "begin": "MIN", "end": "MIN", "objects": [] }, "peer_backfill_info": [], "backfills_in_flight": [], "recovering": [], "pg_backend": { "pull_from_peer": [], "pushing": [] } }, "scrub": { "scrubber.epoch_start": "0", "scrubber.active": 0, "scrubber.waiting_on": 0, "scrubber.waiting_on_whom": [] } }, { "name": "Started", "enter_time": "2016-07-29 19:14:29.818707" } ], "agent_state": {} }
_______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com