Hi Brad,

the cluster recovered to about 0.012% after switching to the firefly tunables
(it got stuck again, with 1 PG remapped), and after that I increased
pg_num/pgp_num from 128 to 256 and then to 512. The status is getting worse:
more PGs are getting stuck in a remapped state, so I don't see a reason to
keep increasing the PGs of the 3-replica pool right now. Do you see any
problem in the pool configuration and crushmap rules below that may lead to
this situation?

root@staging-rd0-00:~# ceph -s
    cluster 2c91375c-6926-4a96-a2b6-f154fbbe70d4
     health HEALTH_WARN
            9 pgs stuck unclean
            recovery 712/4870783 objects degraded (0.015%)
            recovery 1291/4870783 objects misplaced (0.027%)
     monmap e17: 3 mons at {staging-rd0-00=62.217.119.10:6789/0,staging-rd0-01=62.217.119.11:6789/0,staging-rd0-03=62.217.119.13:6789/0}
            election epoch 416, quorum 0,1,2 staging-rd0-00,staging-rd0-01,staging-rd0-03
     osdmap e159764: 16 osds: 16 up, 16 in; 9 remapped pgs
      pgmap v38867868: 10752 pgs, 6 pools, 2529 GB data, 2342 kobjects
            5371 GB used, 35594 GB / 40965 GB avail
            712/4870783 objects degraded (0.015%)
            1291/4870783 objects misplaced (0.027%)
               10743 active+clean
                   9 active+remapped
  client io 6087 B/s rd, 566 kB/s wr, 126 op/s

root@staging-rd0-00:~# ceph osd dump | grep pool
pool 0 'data' replicated size 2 min_size 1 crush_ruleset 3 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119047 crash_replay_interval 45 stripe_width 0
pool 1 'metadata' replicated size 2 min_size 1 crush_ruleset 3 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119048 stripe_width 0
pool 2 'rbd' replicated size 2 min_size 1 crush_ruleset 3 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119049 stripe_width 0
pool 3 'blocks' replicated size 2 min_size 1 crush_ruleset 4 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119050 stripe_width 0
pool 4 'maps' replicated size 2 min_size 1 crush_ruleset 3 object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119051 stripe_width 0
pool 179 'scbench' replicated size 3 min_size 1 crush_ruleset 0 object_hash rjenkins pg_num 512 pgp_num 512 last_change 159762 flags hashpspool stripe_width 0

root@staging-rd0-00:~# ceph osd crush dump
{
    "devices": [
        { "id": 0, "name": "osd.0" },
        { "id": 1, "name": "osd.1" },
        { "id": 2, "name": "osd.2" },
        { "id": 3, "name": "osd.3" },
        { "id": 4, "name": "osd.4" },
        { "id": 5, "name": "osd.5" },
        { "id": 6, "name": "osd.6" },
        { "id": 7, "name": "osd.7" },
        { "id": 8, "name": "osd.8" },
        { "id": 9, "name": "osd.9" },
        { "id": 10, "name": "osd.10" },
        { "id": 11, "name": "osd.11" },
        { "id": 12, "name": "osd.12" },
        { "id": 13, "name": "osd.13" },
        { "id": 14, "name": "osd.14" },
        { "id": 15, "name": "osd.15" }
    ],
    "types": [
        { "type_id": 0, "name": "osd" },
        { "type_id": 1, "name": "host" },
        { "type_id": 2, "name": "rack" },
        { "type_id": 3, "name": "row" },
        { "type_id": 4, "name": "room" },
        { "type_id": 5, "name": "datacenter" },
        { "type_id": 6, "name": "root" }
    ],
    "buckets": [
        { "id": -1, "name": "default", "type_id": 6, "type_name": "root", "weight": 734000,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": -3, "weight": 734000, "pos": 0 } ] },
        { "id": -2, "name": "staging-rd0-03", "type_id": 1, "type_name": "host", "weight": 26214,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": 14, "weight": 13107, "pos": 0 },
                     { "id": 15, "weight": 13107, "pos": 1 } ] },
        { "id": -3, "name": "unknownrack", "type_id": 2, "type_name": "rack", "weight": 734000,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": -2, "weight": 26214, "pos": 0 },
                     { "id": -8, "weight": 340786, "pos": 1 },
                     { "id": -7, "weight": 340786, "pos": 2 },
                     { "id": -4, "weight": 26214, "pos": 3 } ] },
        { "id": -4, "name": "staging-rd0-02", "type_id": 1, "type_name": "host", "weight": 26214,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": 12, "weight": 13107, "pos": 0 },
                     { "id": 13, "weight": 13107, "pos": 1 } ] },
        { "id": -7, "name": "staging-rd0-00", "type_id": 1, "type_name": "host", "weight": 340786,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": 0, "weight": 39321, "pos": 0 },
                     { "id": 1, "weight": 39321, "pos": 1 },
                     { "id": 2, "weight": 65536, "pos": 2 },
                     { "id": 3, "weight": 65536, "pos": 3 },
                     { "id": 4, "weight": 65536, "pos": 4 },
                     { "id": 5, "weight": 65536, "pos": 5 } ] },
        { "id": -8, "name": "staging-rd0-01", "type_id": 1, "type_name": "host", "weight": 340786,
          "alg": "straw", "hash": "rjenkins1",
          "items": [ { "id": 6, "weight": 39321, "pos": 0 },
                     { "id": 7, "weight": 39321, "pos": 1 },
                     { "id": 8, "weight": 65536, "pos": 2 },
                     { "id": 9, "weight": 65536, "pos": 3 },
                     { "id": 10, "weight": 65536, "pos": 4 },
                     { "id": 11, "weight": 65536, "pos": 5 } ] }
    ],
    "rules": [
        { "rule_id": 0, "rule_name": "data", "ruleset": 0, "type": 1, "min_size": 1, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] },
        { "rule_id": 1, "rule_name": "metadata", "ruleset": 1, "type": 1, "min_size": 1, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] },
        { "rule_id": 2, "rule_name": "rbd", "ruleset": 2, "type": 1, "min_size": 1, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] },
        { "rule_id": 3, "rule_name": "sas", "ruleset": 3, "type": 1, "min_size": 2, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] },
        { "rule_id": 4, "rule_name": "sata", "ruleset": 4, "type": 1, "min_size": 2, "max_size": 10,
          "steps": [ { "op": "take", "item": -1, "item_name": "default" },
                     { "op": "chooseleaf_firstn", "num": 0, "type": "host" },
                     { "op": "emit" } ] }
    ],
    "tunables": {
        "choose_local_tries": 0,
        "choose_local_fallback_tries": 0,
        "choose_total_tries": 50,
        "chooseleaf_descend_once": 1,
        "chooseleaf_vary_r": 1,
        "straw_calc_version": 1,
        "allowed_bucket_algs": 22,
        "profile": "unknown",
        "optimal_tunables": 0,
        "legacy_tunables": 0,
        "require_feature_tunables": 1,
        "require_feature_tunables2": 1,
        "require_feature_tunables3": 1,
        "has_v2_rules": 0,
        "has_v3_rules": 0,
        "has_v4_buckets": 0
    }
}

On 26 July 2016 at 02:07, Brad Hubbard <bhubbard@xxxxxxxxxx> wrote:
> On Tue, Jul 26, 2016 at 6:08 AM, Kostis Fardelas <dante1234@xxxxxxxxx> wrote:
>> Following up, I increased pg_num/pgp_num for my 3-replica pool to 128
>
> These pg numbers seem low.
>
> Can you take a look at http://ceph.com/pgcalc/ and verify these values
> are appropriate for your environment and use case?
>
> I'd also take a good look at your crush rules to determine if they are
> contributing to the problem.
>
>> (being in argonaut tunables) and after a small recovery that followed,
>> I switched to the bobtail tunables. Remapping started and got stuck (!)
>> again, without any OSD down this time, with 1 PG active+remapped. I tried
>> restarting that PG's OSDs, with no luck.
>>
>> One thing to notice is that the stuck PGs are always on this 3-replica pool.
>>
>> Finally, I decided to take the hit and switch to the firefly tunables
>> (with chooseleaf_vary_r=1) just for the sake of it. Misplaced objects
>> are at 51% of the cluster right now, so I am going to wait and update
>> our thread with the outcome when the dust settles.
>>
>> All in all, even if the firefly tunables lead to a healthy PG
>> distribution, I am afraid I am going to stick with the argonaut tunables
>> from now on; the experience was far from encouraging, and there is
>> little documentation regarding the pros and cons of tunables profile
>> changes and their impact on a production cluster.
>>
>> Kostis
>>
>> On 24 July 2016 at 14:29, Kostis Fardelas <dante1234@xxxxxxxxx> wrote:
>>> Nice to hear from you Goncalo,
>>> what you propose sounds like an interesting theory; I will test it
>>> tomorrow and let you know. In the meantime, I did the same test with
>>> the bobtail and argonaut tunables:
>>> - with argonaut tunables, the recovery completes to the end
>>> - with bobtail tunables, the situation is worse than with firefly: I
>>>   got even more degraded and misplaced objects, and recovery got stuck
>>>   across 6 PGs
>>>
>>> I also came across a thread with an almost identical case [1], where Sage
>>> recommends switching to the hammer tunables and the straw2 algorithm, but
>>> this is not an option for a lot of people due to kernel requirements.
>>>
>>> [1] https://www.spinics.net/lists/ceph-devel/msg30381.html
>>>
>>>
>>> On 24 July 2016 at 03:44, Goncalo Borges <goncalo.borges@xxxxxxxxxxxxx> wrote:
>>>> Hi Kostis
>>>> This is a wild guess, but one thing I note is that your pool 179 has a very low pg number (100).
>>>>
>>>> Maybe the algorithm behind the new tunable needs a higher pg number to actually proceed with the recovery?
>>>>
>>>> You could try to increase the pgs to 128 (it is always better to use powers of 2) and see if the recovery completes.
>>>>
>>>> Cheers
>>>> G.
>>>> ________________________________________
>>>> From: ceph-users [ceph-users-bounces@xxxxxxxxxxxxxx] on behalf of Kostis Fardelas [dante1234@xxxxxxxxx]
>>>> Sent: 23 July 2016 16:32
>>>> To: Brad Hubbard
>>>> Cc: ceph-users
>>>> Subject: Re: Recovery stuck after adjusting to recent tunables
>>>>
>>>> Hi Brad,
>>>>
>>>> pool 0 'data' replicated size 2 min_size 1 crush_ruleset 3 object_hash
>>>> rjenkins pg_num 2048 pgp_num 2048 last_change 119047
>>>> crash_replay_interval 45 stripe_width 0
>>>> pool 1 'metadata' replicated size 2 min_size 1 crush_ruleset 3
>>>> object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119048
>>>> stripe_width 0
>>>> pool 2 'rbd' replicated size 2 min_size 1 crush_ruleset 3 object_hash
>>>> rjenkins pg_num 2048 pgp_num 2048 last_change 119049 stripe_width 0
>>>> pool 3 'blocks' replicated size 2 min_size 1 crush_ruleset 4
>>>> object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119050
>>>> stripe_width 0
>>>> pool 4 'maps' replicated size 2 min_size 1 crush_ruleset 3 object_hash
>>>> rjenkins pg_num 2048 pgp_num 2048 last_change 119051 stripe_width 0
>>>> pool 179 'scbench' replicated size 3 min_size 1 crush_ruleset 0
>>>> object_hash rjenkins pg_num 100 pgp_num 100 last_change 154034 flags
>>>> hashpspool stripe_width 0
>>>>
>>>> This is the status of 179.38 when the cluster is healthy:
>>>> http://pastebin.ca/3663600
>>>>
>>>> and this is when recovery is stuck:
>>>> http://pastebin.ca/3663601
>>>>
>>>>
>>>> It seems that the PG is replicated with size 3, but the cluster cannot
>>>> create the third replica for some objects whose third OSD (OSD.14) is
>>>> down.
>>>> That was not the case with the argonaut tunables, as I remember.
>>>>
>>>> Regards
>>>>
>>>>
>>>> On 23 July 2016 at 06:16, Brad Hubbard <bhubbard@xxxxxxxxxx> wrote:
>>>>> On Sat, Jul 23, 2016 at 12:17 AM, Kostis Fardelas <dante1234@xxxxxxxxx> wrote:
>>>>>> Hello,
>>>>>> being on the latest Hammer, I think I hit a bug with the more recent
>>>>>> (non-legacy) tunables.
>>>>>>
>>>>>> Being on the legacy tunables for a while, I decided to experiment with
>>>>>> "better" tunables. So first I went from the argonaut profile to bobtail
>>>>>> and then to firefly. However, I decided to make the changes to
>>>>>> chooseleaf_vary_r incrementally (because the remapping from 0 to 5 was
>>>>>> already huge), going from 5 down to the best value (1). So when I reached
>>>>>> chooseleaf_vary_r = 2, I decided to run a simple test before going to
>>>>>> chooseleaf_vary_r = 1: stop an OSD (OSD.14) and let the cluster
>>>>>> recover. But the recovery never completes and a PG remains stuck,
>>>>>> reported as undersized+degraded. No OSD is near full and all pools
>>>>>> have min_size=1.
>>>>>>
>>>>>> ceph osd crush show-tunables -f json-pretty
>>>>>>
>>>>>> {
>>>>>>     "choose_local_tries": 0,
>>>>>>     "choose_local_fallback_tries": 0,
>>>>>>     "choose_total_tries": 50,
>>>>>>     "chooseleaf_descend_once": 1,
>>>>>>     "chooseleaf_vary_r": 2,
>>>>>>     "straw_calc_version": 1,
>>>>>>     "allowed_bucket_algs": 22,
>>>>>>     "profile": "unknown",
>>>>>>     "optimal_tunables": 0,
>>>>>>     "legacy_tunables": 0,
>>>>>>     "require_feature_tunables": 1,
>>>>>>     "require_feature_tunables2": 1,
>>>>>>     "require_feature_tunables3": 1,
>>>>>>     "has_v2_rules": 0,
>>>>>>     "has_v3_rules": 0,
>>>>>>     "has_v4_buckets": 0
>>>>>> }
>>>>>>
>>>>>> The really strange thing is that the OSDs of the stuck PG belong to
>>>>>> nodes other than the one I decided to stop (osd.14).
>>>>>>
>>>>>> # ceph pg dump_stuck
>>>>>> ok
>>>>>> pg_stat  state                       up     up_primary  acting  acting_primary
>>>>>> 179.38   active+undersized+degraded  [2,8]  2           [2,8]   2
>>>>>
>>>>> Can you share a query of this pg?
>>>>>
>>>>> What size (not min size) is this pool (assuming it's 2)?
>>>>>
>>>>>>
>>>>>>
>>>>>> ID WEIGHT   TYPE NAME                    UP/DOWN REWEIGHT PRIMARY-AFFINITY
>>>>>> -1 11.19995 root default
>>>>>> -3 11.19995     rack unknownrack
>>>>>> -2  0.39999         host staging-rd0-03
>>>>>> 14  0.20000             osd.14                up  1.00000          1.00000
>>>>>> 15  0.20000             osd.15                up  1.00000          1.00000
>>>>>> -8  5.19998         host staging-rd0-01
>>>>>>  6  0.59999             osd.6                 up  1.00000          1.00000
>>>>>>  7  0.59999             osd.7                 up  1.00000          1.00000
>>>>>>  8  1.00000             osd.8                 up  1.00000          1.00000
>>>>>>  9  1.00000             osd.9                 up  1.00000          1.00000
>>>>>> 10  1.00000             osd.10                up  1.00000          1.00000
>>>>>> 11  1.00000             osd.11                up  1.00000          1.00000
>>>>>> -7  5.19998         host staging-rd0-00
>>>>>>  0  0.59999             osd.0                 up  1.00000          1.00000
>>>>>>  1  0.59999             osd.1                 up  1.00000          1.00000
>>>>>>  2  1.00000             osd.2                 up  1.00000          1.00000
>>>>>>  3  1.00000             osd.3                 up  1.00000          1.00000
>>>>>>  4  1.00000             osd.4                 up  1.00000          1.00000
>>>>>>  5  1.00000             osd.5                 up  1.00000          1.00000
>>>>>> -4  0.39999         host staging-rd0-02
>>>>>> 12  0.20000             osd.12                up  1.00000          1.00000
>>>>>> 13  0.20000             osd.13                up  1.00000          1.00000
>>>>>>
>>>>>>
>>>>>> Have you experienced something similar?
>>>>>>
>>>>>> Regards,
>>>>>> Kostis
>>>>>
>>>>>
>>>>> --
>>>>> Cheers,
>>>>> Brad
>
>
> --
> Cheers,
> Brad
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
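
For reference, a minimal sketch of the commands that typically sit behind the
steps discussed in this thread (growing pg_num/pgp_num and switching tunables
profiles). 'scbench' is the 3-replica pool from the dumps above; the target
value and the profile name are only examples:

    # pgp_num has to follow pg_num before data actually rebalances
    ceph osd pool set scbench pg_num 512
    ceph osd pool set scbench pgp_num 512

    # switch the whole cluster to a named tunables profile
    ceph osd crush tunables firefly      # or: argonaut | bobtail | optimal
    ceph osd crush show-tunables

    # check what is left stuck afterwards
    ceph -s
    ceph pg dump_stuck unclean

As a rough pgcalc-style sanity check for this pool on its own: (16 OSDs x 100
target PGs per OSD) / 3 replicas is about 533, which rounds to the power of
two 512; in practice the other pools sharing the same OSDs also count toward
the per-OSD PG total, so the right per-pool value depends on how the data is
split across pools.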
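
And a sketch of one common way to change an individual tunable such as
chooseleaf_vary_r step by step, as described in the first message of the
thread quoted above; the file names are placeholders, and injecting an edited
crushmap triggers remapping:

    ceph osd getcrushmap -o crushmap.bin         # export the in-use crushmap
    crushtool -d crushmap.bin -o crushmap.txt    # decompile it to text
    # edit crushmap.txt, e.g. set:  tunable chooseleaf_vary_r 2
    crushtool -c crushmap.txt -o crushmap.new    # recompile
    ceph osd setcrushmap -i crushmap.new         # inject the edited map

    # per-PG detail for the stuck PG, as requested earlier in the thread
    ceph pg 179.38 query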