Hi Brad,

the cluster recovered to about 0.012% after switching to the firefly tunables
(it got stuck again, with 1 PG remapped), and after that I increased
pg_num/pgp_num from 128 to 256 and then to 512. The status is getting worse:
more PGs are getting stuck in a remapped state, so I don't see a reason to
keep increasing the PGs of the 3-replica pool right now. Do you see any
problem in the pool configuration and crushmap rules below that may lead to
this situation?

root@staging-rd0-00:~# ceph -s
    cluster 2c91375c-6926-4a96-a2b6-f154fbbe70d4
     health HEALTH_WARN
            9 pgs stuck unclean
            recovery 712/4870783 objects degraded (0.015%)
            recovery 1291/4870783 objects misplaced (0.027%)
     monmap e17: 3 mons at {staging-rd0-00=62.217.119.10:6789/0,staging-rd0-01=62.217.119.11:6789/0,staging-rd0-03=62.217.119.13:6789/0}
            election epoch 416, quorum 0,1,2 staging-rd0-00,staging-rd0-01,staging-rd0-03
     osdmap e159764: 16 osds: 16 up, 16 in; 9 remapped pgs
      pgmap v38867868: 10752 pgs, 6 pools, 2529 GB data, 2342 kobjects
            5371 GB used, 35594 GB / 40965 GB avail
            712/4870783 objects degraded (0.015%)
            1291/4870783 objects misplaced (0.027%)
               10743 active+clean
                   9 active+remapped
  client io 6087 B/s rd, 566 kB/s wr, 126 op/s

root@staging-rd0-00:~# ceph osd dump | grep pool
pool 0 'data' replicated size 2 min_size 1 crush_ruleset 3 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119047 crash_replay_interval 45 stripe_width 0
pool 1 'metadata' replicated size 2 min_size 1 crush_ruleset 3 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119048 stripe_width 0
pool 2 'rbd' replicated size 2 min_size 1 crush_ruleset 3 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119049 stripe_width 0
pool 3 'blocks' replicated size 2 min_size 1 crush_ruleset 4 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119050 stripe_width 0
pool 4 'maps' replicated size 2 min_size 1 crush_ruleset 3 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119051 stripe_width 0
pool 179 'scbench' replicated size 3 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 512 pgp_num 512 last_change 159762 flags hashpspool stripe_width 0

root@staging-rd0-00:~# ceph osd crush dump
{
    "devices": [
        { "id": 0, "name": "osd.0" },
        { "id": 1, "name": "osd.1" },
        { "id": 2, "name": "osd.2" },
        { "id": 3, "name": "osd.3" },
        { "id": 4, "name": "osd.4" },
        { "id": 5, "name": "osd.5" },
        { "id": 6, "name": "osd.6" },
        { "id": 7, "name": "osd.7" },
        { "id": 8, "name": "osd.8" },
        { "id": 9, "name": "osd.9" },
        { "id": 10, "name": "osd.10" },
        { "id": 11, "name": "osd.11" },
        { "id": 12, "name": "osd.12" },
        { "id": 13, "name": "osd.13" },
        { "id": 14, "name": "osd.14" },
        { "id": 15, "name": "osd.15" }
    ],
    "types": [
        { "type_id": 0, "name": "osd" },
        { "type_id": 1, "name": "host" },
        { "type_id": 2, "name": "rack" },
        { "type_id": 3, "name": "row" },
        { "type_id": 4, "name": "room" },
        { "type_id": 5, "name": "datacenter" },
        { "type_id": 6, "name": "root" }
    ],
    "buckets": [
        { "id": -1, "name": "default", "type_id": 6, "type_name": "root", "weight": 734000,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": -3, "weight": 734000, "pos": 0 } ] },
        { "id": -2, "name": "staging-rd0-03", "type_id": 1, "type_name": "host", "weight": 26214,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": 14, "weight": 13107, "pos": 0 },
                     { "id": 15, "weight": 13107, "pos": 1 } ] },
        { "id": -3, "name": "unknownrack", "type_id": 2, "type_name": "rack", "weight": 734000,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": -2, "weight": 26214, "pos": 0 },
                     { "id": -8, "weight": 340786, "pos": 1 },
                     { "id": -7, "weight": 340786, "pos": 2 },
                     { "id": -4, "weight": 26214, "pos": 3 } ] },
        { "id": -4, "name": "staging-rd0-02", "type_id": 1, "type_name": "host", "weight": 26214,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": 12, "weight": 13107, "pos": 0 },
                     { "id": 13, "weight": 13107, "pos": 1 } ] },
        { "id": -7, "name": "staging-rd0-00", "type_id": 1, "type_name": "host", "weight": 340786,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": 0, "weight": 39321, "pos": 0 },
                     { "id": 1, "weight": 39321, "pos": 1 },
                     { "id": 2, "weight": 65536, "pos": 2 },
                     { "id": 3, "weight": 65536, "pos": 3 },
                     { "id": 4, "weight": 65536, "pos": 4 },
                     { "id": 5, "weight": 65536, "pos": 5 } ] },
        { "id": -8, "name": "staging-rd0-01", "type_id": 1, "type_name": "host", "weight": 340786,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": 6, "weight": 39321, "pos": 0 },
                     { "id": 7, "weight": 39321, "pos": 1 },
                     { "id": 8, "weight": 65536, "pos": 2 },
                     { "id": 9, "weight": 65536, "pos": 3 },
                     { "id": 10, "weight": 65536, "pos": 4 },
                     { "id": 11, "weight": 65536, "pos": 5 } ] }
    ],
    "rules": [
        { "rule_id": 0, "rule_name": "data", "ruleset": 0, "type": 1, "min_size": 1, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] },
        { "rule_id": 1, "rule_name": "metadata", "ruleset": 1, "type": 1, "min_size": 1, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] },
        { "rule_id": 2, "rule_name": "rbd", "ruleset": 2, "type": 1, "min_size": 1, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] },
        { "rule_id": 3, "rule_name": "sas", "ruleset": 3, "type": 1, "min_size": 2, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] },
        { "rule_id": 4, "rule_name": "sata", "ruleset": 4, "type": 1, "min_size": 2, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] }
    ],
    "tunables": {
        "choose_local_tries": 0,
        "choose_local_fallback_tries": 0,
        "choose_total_tries": 50,
        "chooseleaf_descend_once": 1,
        "chooseleaf_vary_r": 1,
        "straw_calc_version": 1,
        "allowed_bucket_algs": 22,
        "profile": "unknown",
        "optimal_tunables": 0,
        "legacy_tunables": 0,
        "require_feature_tunables": 1,
        "require_feature_tunables2": 1,
        "require_feature_tunables3": 1,
        "has_v2_rules": 0,
        "has_v3_rules": 0,
        "has_v4_buckets": 0
    }
}

On 26 July 2016 at 02:07, Brad Hubbard <bhubbard@xxxxxxxxxx> wrote:
> On Tue, Jul 26, 2016 at 6:08 AM, Kostis Fardelas <dante1234@xxxxxxxxx> wrote:
>> Following up, I increased pg_num/pgp_num for my 3-replica pool to 128
>
> These pg numbers seem low.
>
> Can you take a look at http://ceph.com/pgcalc/ and verify these values
> are appropriate for your environment and use case?
>
> I'd also take a good look at your crush rules to determine if they are
> contributing to the problem.
>
>> (being in argonaut tunables) and after a small recovery that followed,
>> I switched to the bobtail tunables. Remapping started and got stuck (!)
>> again, without any OSD down this time, with 1 PG active+remapped. I tried
>> restarting that PG's OSDs, with no luck.
>>
>> One thing to notice is that the stuck PGs are always on this 3-replica pool.
>>
>> Finally, I decided to take the hit and switch to the firefly tunables
>> (with chooseleaf_vary_r=1) just for the sake of it. Misplaced objects
>> are at 51% of the cluster right now, so I am going to wait and update
>> our thread with the outcome when the dust settles.
>>
>> All in all, even if the firefly tunables lead to a healthy PG
>> distribution, I am afraid I am going to stick with the argonaut tunables
>> from now on; the experience was far from encouraging, and there is
>> little documentation regarding the pros and cons of tunables profile
>> changes and their impact on a production cluster.
>>
>> Kostis
>>
>> On 24 July 2016 at 14:29, Kostis Fardelas <dante1234@xxxxxxxxx> wrote:
>>> Nice to hear from you Goncalo,
>>> what you propose sounds like an interesting theory; I will test it
>>> tomorrow and let you know. In the meantime, I did the same test with
>>> the bobtail and argonaut tunables:
>>> - with argonaut tunables, the recovery completes to the end
>>> - with bobtail tunables, the situation is worse than with firefly: I
>>>   got even more degraded and misplaced objects, and recovery got stuck
>>>   across 6 PGs
>>>
>>> I also came across a thread with an almost identical case [1], where Sage
>>> recommends switching to the hammer tunables and the straw2 algorithm, but
>>> this is not an option for a lot of people due to kernel requirements.
>>>
>>> [1] https://www.spinics.net/lists/ceph-devel/msg30381.html
>>>
>>>
>>> On 24 July 2016 at 03:44, Goncalo Borges <goncalo.borges@xxxxxxxxxxxxx> wrote:
>>>> Hi Kostis
>>>> This is a wild guess, but one thing I note is that your pool 179 has a very low pg number (100).
>>>>
>>>> Maybe the algorithm behind the new tunable needs a higher pg number to actually proceed with the recovery?
>>>>
>>>> You could try to increase the pgs to 128 (it is always better to use powers of 2) and see if the recovery completes.
>>>>
>>>> Cheers
>>>> G.
>>>> ________________________________________
>>>> From: ceph-users [ceph-users-bounces@xxxxxxxxxxxxxx] on behalf of Kostis Fardelas [dante1234@xxxxxxxxx]
>>>> Sent: 23 July 2016 16:32
>>>> To: Brad Hubbard
>>>> Cc: ceph-users
>>>> Subject: Re: Recovery stuck after adjusting to recent tunables
>>>>
>>>> Hi Brad,
>>>>
>>>> pool 0 'data' replicated size 2 min_size 1 crush_ruleset 3 object_hash
>>>> rjenkins pg_num 2048 pgp_num 2048 last_change 119047
>>>> crash_replay_interval 45 stripe_width 0
>>>> pool 1 'metadata' replicated size 2 min_size 1 crush_ruleset 3
>>>> object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119048
>>>> stripe_width 0
>>>> pool 2 'rbd' replicated size 2 min_size 1 crush_ruleset 3 object_hash
>>>> rjenkins pg_num 2048 pgp_num 2048 last_change 119049 stripe_width 0
>>>> pool 3 'blocks' replicated size 2 min_size 1 crush_ruleset 4
>>>> object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119050
>>>> stripe_width 0
>>>> pool 4 'maps' replicated size 2 min_size 1 crush_ruleset 3 object_hash
>>>> rjenkins pg_num 2048 pgp_num 2048 last_change 119051 stripe_width 0
>>>> pool 179 'scbench' replicated size 3 min_size 1 crush_ruleset 0
>>>> object_hash rjenkins pg_num 100 pgp_num 100 last_change 154034 flags
>>>> hashpspool stripe_width 0
>>>>
>>>> This is the status of 179.38 when the cluster is healthy:
>>>> http://pastebin.ca/3663600
>>>>
>>>> and this is when recovery is stuck:
>>>> http://pastebin.ca/3663601
>>>>
>>>>
>>>> It seems that the PG is replicated with size 3, but the cluster cannot
>>>> create the third replica for some objects whose third OSD (OSD.14) is
>>>> down.
>>>> That was not the case with the argonaut tunables, as I remember.
>>>>
>>>> Regards
>>>>
>>>>
>>>> On 23 July 2016 at 06:16, Brad Hubbard <bhubbard@xxxxxxxxxx> wrote:
>>>>> On Sat, Jul 23, 2016 at 12:17 AM, Kostis Fardelas <dante1234@xxxxxxxxx> wrote:
>>>>>> Hello,
>>>>>> being on the latest Hammer, I think I hit a bug with the more recent
>>>>>> (non-legacy) tunables.
>>>>>>
>>>>>> Being on the legacy tunables for a while, I decided to experiment with
>>>>>> "better" tunables. So first I went from the argonaut profile to bobtail
>>>>>> and then to firefly. However, I decided to make the changes to
>>>>>> chooseleaf_vary_r incrementally (because the remapping from 0 to 5 was
>>>>>> already huge), going from 5 down to the best value (1). So when I reached
>>>>>> chooseleaf_vary_r = 2, I decided to run a simple test before going to
>>>>>> chooseleaf_vary_r = 1: stop an OSD (OSD.14) and let the cluster
>>>>>> recover. But the recovery never completes and a PG remains stuck,
>>>>>> reported as undersized+degraded. No OSD is near full and all pools
>>>>>> have min_size=1.
>>>>>>
>>>>>> ceph osd crush show-tunables -f json-pretty
>>>>>>
>>>>>> {
>>>>>>     "choose_local_tries": 0,
>>>>>>     "choose_local_fallback_tries": 0,
>>>>>>     "choose_total_tries": 50,
>>>>>>     "chooseleaf_descend_once": 1,
>>>>>>     "chooseleaf_vary_r": 2,
>>>>>>     "straw_calc_version": 1,
>>>>>>     "allowed_bucket_algs": 22,
>>>>>>     "profile": "unknown",
>>>>>>     "optimal_tunables": 0,
>>>>>>     "legacy_tunables": 0,
>>>>>>     "require_feature_tunables": 1,
>>>>>>     "require_feature_tunables2": 1,
>>>>>>     "require_feature_tunables3": 1,
>>>>>>     "has_v2_rules": 0,
>>>>>>     "has_v3_rules": 0,
>>>>>>     "has_v4_buckets": 0
>>>>>> }
>>>>>>
>>>>>> The really strange thing is that the OSDs of the stuck PG belong to
>>>>>> nodes other than the one I decided to stop (osd.14).
>>>>>>
>>>>>> # ceph pg dump_stuck
>>>>>> ok
>>>>>> pg_stat  state                       up     up_primary  acting  acting_primary
>>>>>> 179.38   active+undersized+degraded  [2,8]  2           [2,8]   2
>>>>>
>>>>> Can you share a query of this pg?
>>>>>
>>>>> What size (not min size) is this pool (assuming it's 2)?
>>>>>
>>>>>>
>>>>>>
>>>>>> ID WEIGHT   TYPE NAME                    UP/DOWN REWEIGHT PRIMARY-AFFINITY
>>>>>> -1 11.19995 root default
>>>>>> -3 11.19995     rack unknownrack
>>>>>> -2  0.39999         host staging-rd0-03
>>>>>> 14  0.20000             osd.14                up  1.00000          1.00000
>>>>>> 15  0.20000             osd.15                up  1.00000          1.00000
>>>>>> -8  5.19998         host staging-rd0-01
>>>>>>  6  0.59999             osd.6                 up  1.00000          1.00000
>>>>>>  7  0.59999             osd.7                 up  1.00000          1.00000
>>>>>>  8  1.00000             osd.8                 up  1.00000          1.00000
>>>>>>  9  1.00000             osd.9                 up  1.00000          1.00000
>>>>>> 10  1.00000             osd.10                up  1.00000          1.00000
>>>>>> 11  1.00000             osd.11                up  1.00000          1.00000
>>>>>> -7  5.19998         host staging-rd0-00
>>>>>>  0  0.59999             osd.0                 up  1.00000          1.00000
>>>>>>  1  0.59999             osd.1                 up  1.00000          1.00000
>>>>>>  2  1.00000             osd.2                 up  1.00000          1.00000
>>>>>>  3  1.00000             osd.3                 up  1.00000          1.00000
>>>>>>  4  1.00000             osd.4                 up  1.00000          1.00000
>>>>>>  5  1.00000             osd.5                 up  1.00000          1.00000
>>>>>> -4  0.39999         host staging-rd0-02
>>>>>> 12  0.20000             osd.12                up  1.00000          1.00000
>>>>>> 13  0.20000             osd.13                up  1.00000          1.00000
>>>>>>
>>>>>>
>>>>>> Have you experienced something similar?
>>>>>>
>>>>>> Regards,
>>>>>> Kostis
>>>>>
>>>>>
>>>>> --
>>>>> Cheers,
>>>>> Brad
>
>
> --
> Cheers,
> Brad
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
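
For reference, a minimal sketch of the commands that typically sit behind the
steps discussed in this thread (growing pg_num/pgp_num and switching tunables
profiles). 'scbench' is the 3-replica pool from the dumps above; the target
value and the profile name are only examples:

    # pgp_num has to follow pg_num before data actually rebalances
    ceph osd pool set scbench pg_num 512
    ceph osd pool set scbench pgp_num 512

    # switch the whole cluster to a named tunables profile
    ceph osd crush tunables firefly      # or: argonaut | bobtail | optimal
    ceph osd crush show-tunables

    # check what is left stuck afterwards
    ceph -s
    ceph pg dump_stuck unclean

As a rough pgcalc-style sanity check for this pool on its own: (16 OSDs x 100
target PGs per OSD) / 3 replicas is about 533, which rounds to the power of
two 512; in practice the other pools sharing the same OSDs also count toward
the per-OSD PG total, so the right per-pool value depends on how the data is
split across pools.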
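
And a sketch of one common way to change an individual tunable such as
chooseleaf_vary_r step by step, as described in the first message of the
thread quoted above; the file names are placeholders, and injecting an edited
crushmap triggers remapping:

    ceph osd getcrushmap -o crushmap.bin         # export the in-use crushmap
    crushtool -d crushmap.bin -o crushmap.txt    # decompile it to text
    # edit crushmap.txt, e.g. set:  tunable chooseleaf_vary_r 2
    crushtool -c crushmap.txt -o crushmap.new    # recompile
    ceph osd setcrushmap -i crushmap.new         # inject the edited map

    # per-PG detail for the stuck PG, as requested earlier in the thread
    ceph pg 179.38 query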