Removing OSD after fixing PG-inconsistent brings back PG-inconsistent state

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello,

We have a cluster in HEALTH_ERR due to an inconsistent PG.

HEALTH_ERR 1 pgs inconsistent; 1 scrub errors
pg 2.ae is active+clean+inconsistent, acting [11,4]
1 scrub errors

We ran ceph pg repair on the problematic PG and the health went back to OK.

I checked the two OSDs acting on that PG (we have 2 replicas here) and
one of them had I/O errors, which we assume was the cause of the
inconsistent PG in the first place. So, to avoid further problems, we
want to remove the disk from the cluster. However, as soon as we stop
the OSD, the PG becomes inconsistent again and recovery won't start.

Any ideas about what could be happening? Why does the PG go back to the
inconsistent state? How can we remove the failing disk?

I can't find any ERR entries in the OSD logs, only in the monitor logs,
so I can't see whether a specific object is causing the inconsistent
state (that doesn't seem to be the case).

I attach the output of ceph pg query taken while the cluster was in HEALTH_ERR.

Any help would be much appreciated. Thanks!

-- 
Ana Avilés
Greenhost - sustainable hosting & digital security
E: ana@xxxxxxxxxxxx
T: +31 20 4890444
W: https://greenhost.nl
{
    "state": "active+clean+inconsistent",
    "snap_trimq": "[]",
    "epoch": 198938,
    "up": [
        11,
        4
    ],
    "acting": [
        11,
        4
    ],
    "actingbackfill": [
        "4",
        "11"
    ],
    "info": {
        "pgid": "2.ae",
        "last_update": "198925'737155",
        "last_complete": "198925'737155",
        "log_tail": "198005'733960",
        "last_user_version": 737036,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 0,
        "purged_snaps": "[1~12d,131~2538,266a~19,2686~2,2689~7,2691~4b0,2b43~8,2b4c~3,2b50~c,2b61~24,2b88~c,2b9a~11,2bb2~5,2bb8~2,2bbb~4,2bc0~1416,3fd7~1730,5708~9,5712~88b,5f9e~1373,7312~9c0,7cd3~5,7cd9~e36,8b10~935,9446~c0b,a053~29,a07d~1b7c,bbfa~1d8d,d988~c83,e60c~299c,10fa9~a7c,11a26~2719,14140~1,14144~8,1414d~1197,152e5~2098,1737e~264a,199c9~11,199db~57e,19f5a~1c2e,1bb89~2b3,1bed9~b3,1bf91~1,1bfa2~4e0,1c483~187,1c60b~113e,1d74a~15c,1d8a7~31,1d8d9~4,1d8de~6d1,1e04c~b2,1e102~1,1e111~422,1e534~2a,1e55f~140,1e739~14e,1e899~15d,1ea8d~b4,1eb51~59d,1f0ef~1d6,1f35f~b0,1f41e~24,1f443~179,1f5bd~666,1fcbc~b4,1fd80~1aa,1ff2b~3a9,20369~ab,204bb~aa,2060d~b1,20764~aa,208b5~aa,20a09~ac,20b5f~ab]",
        "history": {
            "epoch_created": 1,
            "last_epoch_started": 198937,
            "last_epoch_clean": 198937,
            "last_epoch_split": 0,
            "last_epoch_marked_full": 154843,
            "same_up_since": 198936,
            "same_interval_since": 198936,
            "same_primary_since": 198936,
            "last_scrub": "198925'737155",
            "last_scrub_stamp": "2016-07-29 19:07:48.694564",
            "last_deep_scrub": "198925'737155",
            "last_deep_scrub_stamp": "2016-07-29 19:07:48.694564",
            "last_clean_scrub_stamp": "2016-07-29 19:07:48.694564"
        },
        "stats": {
            "version": "198925'737155",
            "reported_seq": "1226184",
            "reported_epoch": "198937",
            "state": "active+clean+inconsistent",
            "last_fresh": "2016-07-29 19:14:30.876365",
            "last_change": "2016-07-29 19:14:30.876365",
            "last_active": "2016-07-29 19:14:30.876365",
            "last_peered": "2016-07-29 19:14:30.876365",
            "last_clean": "2016-07-29 19:14:30.876365",
            "last_became_active": "0.000000",
            "last_became_peered": "0.000000",
            "last_unstale": "2016-07-29 19:14:30.876365",
            "last_undegraded": "2016-07-29 19:14:30.876365",
            "last_fullsized": "2016-07-29 19:14:30.876365",
            "mapping_epoch": 198933,
            "log_start": "198005'733960",
            "ondisk_log_start": "198005'733960",
            "created": 1,
            "last_epoch_clean": 198937,
            "parent": "0.0",
            "parent_split_bits": 0,
            "last_scrub": "198925'737155",
            "last_scrub_stamp": "2016-07-29 19:07:48.694564",
            "last_deep_scrub": "198925'737155",
            "last_deep_scrub_stamp": "2016-07-29 19:07:48.694564",
            "last_clean_scrub_stamp": "2016-07-29 19:07:48.694564",
            "log_size": 3195,
            "ondisk_log_size": 3195,
            "stats_invalid": "0",
            "stat_sum": {
                "num_bytes": 15776131072,
                "num_objects": 5373,
                "num_object_clones": 2080,
                "num_object_copies": 10746,
                "num_objects_missing_on_primary": 0,
                "num_objects_degraded": 0,
                "num_objects_misplaced": 0,
                "num_objects_unfound": 0,
                "num_objects_dirty": 5373,
                "num_whiteouts": 0,
                "num_read": 78438,
                "num_read_kb": 3068887,
                "num_write": 104414,
                "num_write_kb": 32078753,
                "num_scrub_errors": 1,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 1,
                "num_objects_recovered": 4258,
                "num_bytes_recovered": 15611940864,
                "num_keys_recovered": 0,
                "num_objects_omap": 1,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0
            },
            "up": [
                11,
                4
            ],
            "acting": [
                11,
                4
            ],
            "blocked_by": [],
            "up_primary": 11,
            "acting_primary": 11
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 198937,
        "hit_set_history": {
            "current_last_update": "0'0",
            "history": []
        }
    },
    "peer_info": [
        {
            "peer": "4",
            "pgid": "2.ae",
            "last_update": "198925'737155",
            "last_complete": "198925'737155",
            "log_tail": "198005'733960",
            "last_user_version": 737036,
            "last_backfill": "MAX",
            "last_backfill_bitwise": 0,
            "purged_snaps": "[1~12d,131~2538,266a~19,2686~2,2689~7,2691~4b0,2b43~8,2b4c~3,2b50~c,2b61~24,2b88~c,2b9a~11,2bb2~5,2bb8~2,2bbb~4,2bc0~1416,3fd7~1730,5708~9,5712~88b,5f9e~1373,7312~9c0,7cd3~5,7cd9~e36,8b10~935,9446~c0b,a053~29,a07d~1b7c,bbfa~1d8d,d988~c83,e60c~299c,10fa9~a7c,11a26~2719,14140~1,14144~8,1414d~1197,152e5~2098,1737e~264a,199c9~11,199db~57e,19f5a~1c2e,1bb89~2b3,1bed9~b3,1bf91~1,1bfa2~4e0,1c483~187,1c60b~113e,1d74a~15c,1d8a7~31,1d8d9~4,1d8de~6d1,1e04c~b2,1e102~1,1e111~422,1e534~2a,1e55f~140,1e739~14e,1e899~15d,1ea8d~b4,1eb51~59d,1f0ef~1d6,1f35f~b0,1f41e~24,1f443~179,1f5bd~666,1fcbc~b4,1fd80~1aa,1ff2b~3a9,20369~ab,204bb~aa,2060d~b1,20764~aa,208b5~aa,20a09~ac,20b5f~ab]",
            "history": {
                "epoch_created": 1,
                "last_epoch_started": 198937,
                "last_epoch_clean": 198937,
                "last_epoch_split": 0,
                "last_epoch_marked_full": 154843,
                "same_up_since": 198936,
                "same_interval_since": 198936,
                "same_primary_since": 198936,
                "last_scrub": "198925'737155",
                "last_scrub_stamp": "2016-07-29 19:07:48.694564",
                "last_deep_scrub": "198925'737155",
                "last_deep_scrub_stamp": "2016-07-29 19:07:48.694564",
                "last_clean_scrub_stamp": "2016-07-29 19:07:48.694564"
            },
            "stats": {
                "version": "198925'737155",
                "reported_seq": "1226180",
                "reported_epoch": "198936",
                "state": "active+undersized+degraded+inconsistent",
                "last_fresh": "2016-07-29 19:13:58.096548",
                "last_change": "2016-07-29 19:13:58.095929",
                "last_active": "2016-07-29 19:13:58.096548",
                "last_peered": "2016-07-29 19:13:58.096548",
                "last_clean": "2016-07-29 18:58:57.531797",
                "last_became_active": "0.000000",
                "last_became_peered": "0.000000",
                "last_unstale": "2016-07-29 19:13:58.096548",
                "last_undegraded": "2016-07-29 19:13:57.932878",
                "last_fullsized": "2016-07-29 19:13:57.932878",
                "mapping_epoch": 198933,
                "log_start": "198005'733960",
                "ondisk_log_start": "198005'733960",
                "created": 1,
                "last_epoch_clean": 198934,
                "parent": "0.0",
                "parent_split_bits": 0,
                "last_scrub": "198925'737155",
                "last_scrub_stamp": "2016-07-29 19:07:48.694564",
                "last_deep_scrub": "198925'737155",
                "last_deep_scrub_stamp": "2016-07-29 19:07:48.694564",
                "last_clean_scrub_stamp": "2016-07-29 19:07:48.694564",
                "log_size": 3195,
                "ondisk_log_size": 3195,
                "stats_invalid": "0",
                "stat_sum": {
                    "num_bytes": 15776131072,
                    "num_objects": 5373,
                    "num_object_clones": 2080,
                    "num_object_copies": 10746,
                    "num_objects_missing_on_primary": 0,
                    "num_objects_degraded": 5373,
                    "num_objects_misplaced": 0,
                    "num_objects_unfound": 0,
                    "num_objects_dirty": 5373,
                    "num_whiteouts": 0,
                    "num_read": 78438,
                    "num_read_kb": 3068887,
                    "num_write": 104414,
                    "num_write_kb": 32078753,
                    "num_scrub_errors": 1,
                    "num_shallow_scrub_errors": 0,
                    "num_deep_scrub_errors": 1,
                    "num_objects_recovered": 4258,
                    "num_bytes_recovered": 15611940864,
                    "num_keys_recovered": 0,
                    "num_objects_omap": 1,
                    "num_objects_hit_set_archive": 0,
                    "num_bytes_hit_set_archive": 0,
                    "num_flush": 0,
                    "num_flush_kb": 0,
                    "num_evict": 0,
                    "num_evict_kb": 0,
                    "num_promote": 0,
                    "num_flush_mode_high": 0,
                    "num_flush_mode_low": 0,
                    "num_evict_mode_some": 0,
                    "num_evict_mode_full": 0
                },
                "up": [
                    11,
                    4
                ],
                "acting": [
                    11,
                    4
                ],
                "blocked_by": [],
                "up_primary": 11,
                "acting_primary": 11
            },
            "empty": 0,
            "dne": 0,
            "incomplete": 0,
            "last_epoch_started": 198937,
            "hit_set_history": {
                "current_last_update": "0'0",
                "history": []
            }
        }
    ],
    "recovery_state": [
        {
            "name": "Started\/Primary\/Active",
            "enter_time": "2016-07-29 19:14:30.841075",
            "might_have_unfound": [],
            "recovery_progress": {
                "backfill_targets": [],
                "waiting_on_backfill": [],
                "last_backfill_started": "MIN",
                "backfill_info": {
                    "begin": "MIN",
                    "end": "MIN",
                    "objects": []
                },
                "peer_backfill_info": [],
                "backfills_in_flight": [],
                "recovering": [],
                "pg_backend": {
                    "pull_from_peer": [],
                    "pushing": []
                }
            },
            "scrub": {
                "scrubber.epoch_start": "0",
                "scrubber.active": 0,
                "scrubber.waiting_on": 0,
                "scrubber.waiting_on_whom": []
            }
        },
        {
            "name": "Started",
            "enter_time": "2016-07-29 19:14:29.818707"
        }
    ],
    "agent_state": {}
}
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com

[Index of Archives]     [Information on CEPH]     [Linux Filesystem Development]     [Ceph Development]     [Ceph Large]     [Linux USB Development]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [xfs]


  Powered by Linux