ceph pg ls | grep 11.182

11.182  10  25  35  0  25  34648064  1306  1306  active+recovery_wait+undersized+degraded  2019-02-04 09:23:26.461468  70238'1306  70673:24924  [64]  64  [64]  64  46843'56759413  2019-01-26 16:31:32.607109  46843'56628962  2019-01-24 08:56:59.228615

root@storage-node-1-l3:~# ceph pg 11.182 query
{
    "state": "active+recovery_wait+undersized+degraded",
    "snap_trimq": "[1~b]",
    "snap_trimq_len": 11,
    "epoch": 70673,
    "up": [ 64 ],
    "acting": [ 64 ],
    "actingbackfill": [ "64" ],
    "info": {
        "pgid": "11.182",
        "last_update": "70238'1306",
        "last_complete": "46843'56787837",
        "log_tail": "0'0",
        "last_user_version": 1301,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 0,
        "purged_snaps": [],
        "history": {
            "epoch_created": 54817,
            "epoch_pool_created": 278,
            "last_epoch_started": 70656,
            "last_interval_started": 70655,
            "last_epoch_clean": 67924,
            "last_interval_clean": 54687,
            "last_epoch_split": 54817,
            "last_epoch_marked_full": 0,
            "same_up_since": 70655,
            "same_interval_since": 70655,
            "same_primary_since": 70655,
            "last_scrub": "46843'56759413",
            "last_scrub_stamp": "2019-01-26 16:31:32.607109",
            "last_deep_scrub": "46843'56628962",
            "last_deep_scrub_stamp": "2019-01-24 08:56:59.228615",
            "last_clean_scrub_stamp": "2019-01-26 16:31:32.607109"
        },
        "stats": {
            "version": "70238'1306",
            "reported_seq": "24940",
            "reported_epoch": "70673",
            "state": "active+recovery_wait+undersized+degraded",
            "last_fresh": "2019-02-04 09:25:56.966952",
            "last_change": "2019-02-04 09:25:56.966952",
            "last_active": "2019-02-04 09:25:56.966952",
            "last_peered": "2019-02-04 09:25:56.966952",
            "last_clean": "0.000000",
            "last_became_active": "2019-02-04 07:57:08.769839",
            "last_became_peered": "2019-02-04 07:57:08.769839",
            "last_unstale": "2019-02-04 09:25:56.966952",
            "last_undegraded": "2019-02-04 07:57:08.762164",
            "last_fullsized": "2019-02-04 07:57:08.761962",
            "mapping_epoch": 70655,
            "log_start": "0'0",
            "ondisk_log_start": "0'0",
            "created": 54817,
            "last_epoch_clean": 67924,
            "parent": "0.0",
            "parent_split_bits": 0,
            "last_scrub": "46843'56759413",
            "last_scrub_stamp": "2019-01-26 16:31:32.607109",
            "last_deep_scrub": "46843'56628962",
            "last_deep_scrub_stamp": "2019-01-24 08:56:59.228615",
            "last_clean_scrub_stamp": "2019-01-26 16:31:32.607109",
            "log_size": 1306,
            "ondisk_log_size": 1306,
            "stats_invalid": false,
            "dirty_stats_invalid": false,
            "omap_stats_invalid": false,
            "hitset_stats_invalid": false,
            "hitset_bytes_stats_invalid": false,
            "pin_stats_invalid": false,
            "snaptrimq_len": 11,
            "stat_sum": {
                "num_bytes": 34648064,
                "num_objects": 10,
                "num_object_clones": 0,
                "num_object_copies": 20,
                "num_objects_missing_on_primary": 25,
                "num_objects_missing": 0,
                "num_objects_degraded": 35,
                "num_objects_misplaced": 0,
                "num_objects_unfound": 25,
                "num_objects_dirty": 10,
                "num_whiteouts": 0,
                "num_read": 1274,
                "num_read_kb": 33808,
                "num_write": 1388,
                "num_write_kb": 42956,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 0,
                "num_bytes_recovered": 0,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0,
                "num_objects_pinned": 0,
                "num_legacy_snapsets": 0
            },
            "up": [ 64 ],
            "acting": [ 64 ],
            "blocked_by": [],
            "up_primary": 64,
            "acting_primary": 64
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 70656,
        "hit_set_history": {
            "current_last_update": "0'0",
"history": [] } }, "peer_info": [], "recovery_state": [ { "name": "Started/Primary/Active", "enter_time": "2019-02-04 07:57:08.762037", "might_have_unfound": [ { "osd": "9", "status": "osd is down" }, { "osd": "29", "status": "osd is down" }, { "osd": "49", "status": "osd is down" }, { "osd": "51", "status": "osd is down" }, { "osd": "63", "status": "osd is down" }, { "osd": "92", "status": "osd is down" } ], "recovery_progress": { "backfill_targets": [], "waiting_on_backfill": [], "last_backfill_started": "MIN", "backfill_info": { "begin": "MIN", "end": "MIN", "objects": [] }, "peer_backfill_info": [], "backfills_in_flight": [], "recovering": [], "pg_backend": { "pull_from_peer": [], "pushing": [] } }, "scrub": { "scrubber.epoch_start": "0", "scrubber.active": false, "scrubber.state": "INACTIVE", "scrubber.start": "MIN", "scrubber.end": "MIN", "scrubber.subset_last_update": "0'0", "scrubber.deep": false, "scrubber.seed": 0, "scrubber.waiting_on": 0, "scrubber.waiting_on_whom": [] } }, { "name": "Started", "enter_time": "2019-02-04 07:57:08.220064" } ], "agent_state": {} } For 11.ac i will try wath you propose and keep you informed but i am a litle bit anxious to lose another healthy osd. Kr ________________________________________ From: Sage Weil <sage@xxxxxxxxxxxx> Sent: 04 February 2019 09:20 To: Philippe Van Hecke Cc: ceph-users@xxxxxxxxxxxxxx; Belnet Services Subject: Re: Luminous cluster in very bad state need some assistance. On Mon, 4 Feb 2019, Philippe Van Hecke wrote: > So i restarted the osd but he stop after some time. But this is an effect on the cluster and cluster is on a partial recovery process. > > please find here log file of osd 49 after this restart > https://filesender.belnet.be/?s=download&token=8c9c39f2-36f6-43f7-bebb-175679d27a22 It's the same PG 11.182 hitting the same assert when it tries to recover to that OSD. I think the problem will go away once there has been some write traffic, but it may be tricky to prevent it from doing any recovery until then. I just noticed you pasted the wrong 'pg ls' result before: > > result of ceph pg ls | grep 11.118 > > > > 11.118 9788 0 0 0 0 40817837568 1584 1584 active+clean 2019-02-01 12:48:41.343228 70238'19811673 70493:34596887 [121,24] 121 [121,24] 121 69295'19811665 2019-02-01 12:48:41.343144 66131'19810044 2019-01-30 11:44:36.006505 What does 11.182 look like? We can try something slighty different. From before it looked like your only 'incomplete' pg was 11.ac (ceph pg ls incomplete), and the needed state is either on osd.49 or osd.63. On osd.49, do ceph-objectstore-tool --op export on that pg, and then find an otherwise healthy OSD (that doesn't have 11.ac), stop it, and ceph-objectstore-tool --op import it there. When you start it up, 11.ac will hopefull peer and recover. (Or, alternatively, osd.63 may have the needed state.) sage _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com