Hi, I'm trying to understand erasure coded pools and why CRUSH rules seem to work for only part of PGs in EC pools.
Basically what I'm trying to do is to check erasure coded pool recovery behaviour after a single OSD or single HOST failure.
I noticed that in case of HOST failure only part of the PGs get recovered to active+remapped, while other PGs remain in the active+undersized+degraded state. Why??
EC pool profile I use is k=3 , m=2.
Also I'm not really sure what the meaning of all the steps of the crush rule below is (perhaps it is the root cause).
rule ecpool_3_2 {
ruleset 1
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5 # should I maybe try to increase this number of retries? Can I apply the changes to the existing EC crush rule and pool, or do I need to create a new one?
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host # Does it allow choosing more than one OSD from a single HOST, while first trying to get only one OSD per HOST if there are enough HOSTs in the cluster?
step emit
}
ceph version 10.2.9 (jewel)
# INITIAL CLUSTER STATE
root@host01:~# ceph osd tree
ID WEIGHT TYPE NAME UP/DOWN REWEIGHT PRIMARY-AFFINITY
-1 218.18401 root default
-6 218.18401 region MyRegion
-5 218.18401 datacenter MyDC
-4 218.18401 room MyRoom
-3 43.63699 rack Rack01
-2 43.63699 host host01
0 3.63599 osd.0 up 1.00000 1.00000
3 3.63599 osd.3 up 1.00000 1.00000
4 3.63599 osd.4 up 1.00000 1.00000
6 3.63599 osd.6 up 1.00000 1.00000
8 3.63599 osd.8 up 1.00000 1.00000
10 3.63599 osd.10 up 1.00000 1.00000
12 3.63599 osd.12 up 1.00000 1.00000
14 3.63599 osd.14 up 1.00000 1.00000
16 3.63599 osd.16 up 1.00000 1.00000
19 3.63599 osd.19 up 1.00000 1.00000
22 3.63599 osd.22 up 1.00000 1.00000
25 3.63599 osd.25 up 1.00000 1.00000
-8 43.63699 rack Rack02
-7 43.63699 host host02
1 3.63599 osd.1 up 1.00000 1.00000
2 3.63599 osd.2 up 1.00000 1.00000
5 3.63599 osd.5 up 1.00000 1.00000
7 3.63599 osd.7 up 1.00000 1.00000
9 3.63599 osd.9 up 1.00000 1.00000
11 3.63599 osd.11 up 1.00000 1.00000
13 3.63599 osd.13 up 1.00000 1.00000
15 3.63599 osd.15 up 1.00000 1.00000
17 3.63599 osd.17 up 1.00000 1.00000
20 3.63599 osd.20 up 1.00000 1.00000
23 3.63599 osd.23 up 1.00000 1.00000
26 3.63599 osd.26 up 1.00000 1.00000
-10 130.91000 rack Rack03
-9 43.63699 host host03
18 3.63599 osd.18 up 1.00000 1.00000
21 3.63599 osd.21 up 1.00000 1.00000
24 3.63599 osd.24 up 1.00000 1.00000
27 3.63599 osd.27 up 1.00000 1.00000
28 3.63599 osd.28 up 1.00000 1.00000
29 3.63599 osd.29 up 1.00000 1.00000
30 3.63599 osd.30 up 1.00000 1.00000
31 3.63599 osd.31 up 1.00000 1.00000
32 3.63599 osd.32 up 1.00000 1.00000
33 3.63599 osd.33 up 1.00000 1.00000
34 3.63599 osd.34 up 1.00000 1.00000
35 3.63599 osd.35 up 1.00000 1.00000
-11 43.63699 host host04
36 3.63599 osd.36 up 1.00000 1.00000
37 3.63599 osd.37 up 1.00000 1.00000
38 3.63599 osd.38 up 1.00000 1.00000
39 3.63599 osd.39 up 1.00000 1.00000
40 3.63599 osd.40 up 1.00000 1.00000
41 3.63599 osd.41 up 1.00000 1.00000
42 3.63599 osd.42 up 1.00000 1.00000
43 3.63599 osd.43 up 1.00000 1.00000
44 3.63599 osd.44 up 1.00000 1.00000
45 3.63599 osd.45 up 1.00000 1.00000
46 3.63599 osd.46 up 1.00000 1.00000
47 3.63599 osd.47 up 1.00000 1.00000
-12 43.63699 host host05
48 3.63599 osd.48 up 1.00000 1.00000
49 3.63599 osd.49 up 1.00000 1.00000
50 3.63599 osd.50 up 1.00000 1.00000
51 3.63599 osd.51 up 1.00000 1.00000
52 3.63599 osd.52 up 1.00000 1.00000
53 3.63599 osd.53 up 1.00000 1.00000
54 3.63599 osd.54 up 1.00000 1.00000
55 3.63599 osd.55 up 1.00000 1.00000
56 3.63599 osd.56 up 1.00000 1.00000
57 3.63599 osd.57 up 1.00000 1.00000
58 3.63599 osd.58 up 1.00000 1.00000
59 3.63599 osd.59 up 1.00000 1.00000
root@host01:~# ceph -w
cluster a6f73750-1972-47f6-bcf5-a99753be65ad
health HEALTH_OK
monmap e2: 3 mons at {host01=10.212.32.23:6789/0,host02=10.212.32.24:6789/0,host03=10.212.32.25:6789/0}
election epoch 22, quorum 0,1,2 host01,host02,host03
osdmap e527: 60 osds: 60 up, 60 in
flags sortbitwise,require_jewel_osds
pgmap v57164: 3736 pgs, 19 pools, 10343 bytes data, 241 objects
4665 MB used, 218 TB / 218 TB avail
3736 active+clean
2017-11-28 07:38:52.350228 mon.0 [INF] pgmap v57163: 3736 pgs: 3736 active+clean; 10343 bytes data, 4665 MB used, 218 TB / 218 TB avail
...
root@host01:~#
In the 1st scenario I stop a single OSD (id 48, host host05) and after 5 minutes the cluster starts to recover by remapping PGs to another OSD from HOST host05.
In the 2nd scenario, I stop all Ceph services on one HOST host05
# FIND ALL PGs USING OSDs FROM HOST host05
root@host01:~# ceph pg dump pgs_brief |egrep '\[48|,48|\[49|,49|\[50|,50|\[51|,51|\[52|,52|\[53|,53|\[54|,54|\[55|,55|\[56|,56|\[57|,57|\[58|,58|\[59|,59' > PGs_on_HOST_host05
dumped pgs_brief in format plain
root@host01:~# wc -l PGs_on_HOST_host05
2556 PGs_on_HOST_host05
# STOP ALL CEPH SERVICES on HOST host05
root@host05:~# systemctl stop ceph.target
root@host01:~# ceph -w
cluster a6f73750-1972-47f6-bcf5-a99753be65ad
health HEALTH_OK
monmap e2: 3 mons at {host01=10.212.32.23:6789/0,host02=10.212.32.24:6789/0,host03=10.212.32.25:6789/0}
election epoch 22, quorum 0,1,2 host01,host02,host03
osdmap e538: 60 osds: 59 up, 59 in
flags sortbitwise,require_jewel_osds
pgmap v57405: 3736 pgs, 19 pools, 10343 bytes data, 241 objects
4581 MB used, 214 TB / 214 TB avail
3736 active+clean
2017-11-28 08:08:21.349340 mon.0 [INF] pgmap v57405: 3736 pgs: 3736 active+clean; 10343 bytes data, 4581 MB used, 214 TB / 214 TB avail
2017-11-28 08:08:33.082249 mon.0 [INF] osd.57 marked itself down
2017-11-28 08:08:33.082607 mon.0 [INF] osd.49 marked itself down
2017-11-28 08:08:33.082899 mon.0 [INF] osd.59 marked itself down
2017-11-28 08:08:33.083471 mon.0 [INF] osd.56 marked itself down
2017-11-28 08:08:33.084091 mon.0 [INF] osd.58 marked itself down
2017-11-28 08:08:33.084842 mon.0 [INF] osd.53 marked itself down
2017-11-28 08:08:33.085373 mon.0 [INF] osd.50 marked itself down
2017-11-28 08:08:33.085830 mon.0 [INF] osd.54 marked itself down
2017-11-28 08:08:33.086437 mon.0 [INF] osd.55 marked itself down
2017-11-28 08:08:33.086664 mon.0 [INF] osd.52 marked itself down
2017-11-28 08:08:33.086970 mon.0 [INF] osd.51 marked itself down
2017-11-28 08:08:33.246299 mon.0 [INF] osdmap e539: 60 osds: 48 up, 59 in
2017-11-28 08:08:33.253694 mon.0 [INF] pgmap v57406: 3736 pgs: 3736 active+clean; 10343 bytes data, 4581 MB used, 214 TB / 214 TB avail
2017-11-28 08:08:34.333012 mon.0 [INF] osdmap e540: 60 osds: 48 up, 59 in
2017-11-28 08:08:34.348753 mon.0 [INF] pgmap v57407: 3736 pgs: 64 peering, 658 stale+active+clean, 3014 active+clean; 10343 bytes data, 4581 MB used, 214 TB / 214 TB avail
2017-11-28 08:08:35.344372 mon.0 [INF] pgmap v57408: 3736 pgs: 4 active+undersized+degraded, 42 activating+undersized+degraded, 64 peering, 648 stale+active+clean, 2978 active+clean; 10343 bytes data, 4581 MB used, 214 TB / 214 TB avail
2017-11-28 08:08:36.375645 mon.0 [INF] pgmap v57409: 3736 pgs: 268 active+undersized+degraded, 42 activating+undersized+degraded, 64 peering, 578 stale+active+clean, 2784 active+clean; 10343 bytes data, 4584 MB used, 214 TB / 214 TB avail; 24/791 objects degraded (3.034%)
2017-11-28 08:08:37.457164 mon.0 [INF] pgmap v57410: 3736 pgs: 1750 active+undersized+degraded, 42 activating+undersized+degraded, 64 peering, 198 stale+active+clean, 1682 active+clean; 10343 bytes data, 4622 MB used, 214 TB / 214 TB avail; 141/791 objects degraded (17.826%)
2017-11-28 08:08:38.466174 mon.0 [INF] pgmap v57411: 3736 pgs: 2450 active+undersized+degraded, 42 activating+undersized+degraded, 64 peering, 1180 active+clean; 10343 bytes data, 4643 MB used, 214 TB / 214 TB avail; 190/791 objects degraded (24.020%)
2017-11-28 08:08:39.454811 mon.0 [INF] pgmap v57412: 3736 pgs: 2556 active+undersized+degraded, 1180 active+clean; 10343 bytes data, 4645 MB used, 214 TB / 214 TB avail; 193/791 objects degraded (24.399%)
2017-11-28 08:08:45.202295 mon.0 [INF] HEALTH_WARN; 2556 pgs degraded; 2549 pgs stuck unclean; 2556 pgs undersized; recovery 193/791 objects degraded (24.399%); 11/59 in osds are down
.... AFTER 5 MINUTES PG REMAPPING HAS STARTED
2017-11-28 08:12:45.205422 mon.0 [INF] HEALTH_WARN; 2556 pgs degraded; 2556 pgs stuck unclean; 2556 pgs undersized; recovery 193/791 objects degraded (24.399%); 11/59 in osds are down
2017-11-28 08:12:51.570936 mon.0 [INF] pgmap v57446: 3736 pgs: 2556 active+undersized+degraded, 1180 active+clean; 10343 bytes data, 4632 MB used, 214 TB / 214 TB avail; 193/791 objects degraded (24.399%)
2017-11-28 08:13:35.060583 mon.0 [INF] osd.49 out (down for 301.868797)
2017-11-28 08:13:35.060723 mon.0 [INF] osd.50 out (down for 301.868797)
2017-11-28 08:13:35.060753 mon.0 [INF] osd.51 out (down for 301.868797)
2017-11-28 08:13:35.060783 mon.0 [INF] osd.52 out (down for 301.868796)
2017-11-28 08:13:35.060812 mon.0 [INF] osd.53 out (down for 301.868796)
2017-11-28 08:13:35.060842 mon.0 [INF] osd.54 out (down for 301.868796)
2017-11-28 08:13:35.060870 mon.0 [INF] osd.55 out (down for 301.868795)
2017-11-28 08:13:35.060928 mon.0 [INF] osd.56 out (down for 301.868795)
2017-11-28 08:13:35.060958 mon.0 [INF] osd.57 out (down for 301.868795)
2017-11-28 08:13:35.060990 mon.0 [INF] osd.58 out (down for 301.868795)
2017-11-28 08:13:35.061021 mon.0 [INF] osd.59 out (down for 301.868794)
2017-11-28 08:13:35.274737 mon.0 [INF] osdmap e541: 60 osds: 48 up, 48 in
2017-11-28 08:13:35.276185 mon.0 [INF] pgmap v57447: 3736 pgs: 2556 active+undersized+degraded, 1180 active+clean; 10343 bytes data, 3773 MB used, 174 TB / 174 TB avail; 193/791 objects degraded (24.399%)
2017-11-28 08:13:36.330316 mon.0 [INF] osdmap e542: 60 osds: 48 up, 48 in
2017-11-28 08:13:36.334183 mon.0 [INF] pgmap v57448: 3736 pgs: 135 remapped+peering, 2421 active+undersized+degraded, 1180 active+clean; 10343 bytes data, 3775 MB used, 174 TB / 174 TB avail; 174/791 objects degraded (21.997%)
2017-11-28 08:13:37.289319 mon.0 [INF] osdmap e543: 60 osds: 48 up, 48 in
2017-11-28 08:13:37.326379 mon.0 [INF] pgmap v57449: 3736 pgs: 4 active+undersized+remapped, 900 peering, 329 remapped+peering, 1323 active+undersized+degraded, 1180 active+clean; 10343 bytes data, 3784 MB used, 174 TB / 174 TB avail; 69/791 objects degraded (8.723%); 65 B/s, 117 objects/s recovering
2017-11-28 08:13:36.172666 osd.28 [INF] 8.6 starting backfill to osd.22 from (0'0,0'0] MAX to 538'46558
2017-11-28 08:13:36.174172 osd.28 [INF] 8.6 starting backfill to osd.47 from (0'0,0'0] MAX to 538'46558
2017-11-28 08:13:36.184611 osd.19 [INF] 8.5 starting backfill to osd.40 from (0'0,0'0] MAX to 538'52902
2017-11-28 08:13:36.190060 osd.24 [INF] 7.3 starting backfill to osd.8 from (0'0,0'0] MAX to 538'3172
2017-11-28 08:13:36.193337 osd.24 [INF] 7.3 starting backfill to osd.41 from (0'0,0'0] MAX to 538'3172
2017-11-28 08:13:37.517955 osd.21 [INF] 5.144 scrub starts
2017-11-28 08:13:37.518701 osd.21 [INF] 5.144 scrub ok
2017-11-28 08:13:38.235143 mon.0 [INF] osdmap e544: 60 osds: 48 up, 48 in
2017-11-28 08:13:38.250128 mon.0 [INF] pgmap v57450: 3736 pgs: 37 activating, 7 activating+remapped, 4 active+undersized+remapped, 37 active, 902 peering, 87 active+remapped, 313 remapped+peering, 793 active+undersized+degraded, 1556 active+clean; 10343 bytes data, 3789 MB used, 174 TB / 174 TB avail; 40/791 objects degraded (5.057%); 5/791 objects misplaced (0.632%); 629 B/s, 167 objects/s recovering
2017-11-28 08:13:36.157779 osd.18 [INF] 8.3 starting backfill to osd.17 from (0'0,0'0] MAX to 538'34158
2017-11-28 08:13:38.147555 osd.18 [INF] 5.203 deep-scrub starts
2017-11-28 08:13:38.148310 osd.18 [INF] 5.203 deep-scrub ok
2017-11-28 08:13:38.523380 osd.22 [INF] 5.235 scrub starts
2017-11-28 08:13:38.524181 osd.22 [INF] 5.235 scrub ok
2017-11-28 08:13:39.251064 mon.0 [INF] pgmap v57451: 3736 pgs: 37 activating, 7 activating+remapped, 4 active+undersized+remapped, 50 active, 903 peering, 117 active+remapped, 312 remapped+peering, 625 active+undersized+degraded, 1681 active+clean; 10343 bytes data, 3799 MB used, 174 TB / 174 TB avail; 25/791 objects degraded (3.161%); 5/791 objects misplaced (0.632%); 620 B/s, 0 keys/s, 58 objects/s recovering
2017-11-28 08:13:36.110274 osd.4 [INF] 8.0 starting backfill to osd.14 from (0'0,0'0] MAX to 538'49482
2017-11-28 08:13:36.112128 osd.4 [INF] 8.0 starting backfill to osd.23 from (0'0,0'0] MAX to 538'49482
2017-11-28 08:13:36.127248 osd.4 [INF] 8.0 starting backfill to osd.37 from (0'0,0'0] MAX to 538'49482
2017-11-28 08:13:40.250559 mon.0 [INF] pgmap v57452: 3736 pgs: 37 activating, 7 activating+remapped, 4 active+undersized+remapped, 52 active, 903 peering, 123 active+remapped, 311 remapped+peering, 590 active+undersized+degraded, 1709 active+clean; 10343 bytes data, 3803 MB used, 174 TB / 174 TB avail; 25/791 objects degraded (3.161%); 5/791 objects misplaced (0.632%); 77 B/s, 0 keys/s, 14 objects/s recovering
2017-11-28 08:13:36.153569 osd.2 [INF] 8.2 starting backfill to osd.6 from (0'0,0'0] MAX to 538'49646
2017-11-28 08:13:36.164089 osd.2 [INF] 8.2 starting backfill to osd.34 from (0'0,0'0] MAX to 538'49646
2017-11-28 08:13:36.217509 osd.10 [INF] 8.1 starting backfill to osd.0 from (0'0,0'0] MAX to 538'55946
2017-11-28 08:13:36.219512 osd.10 [INF] 8.1 starting backfill to osd.23 from (0'0,0'0] MAX to 538'55946
2017-11-28 08:13:37.806811 osd.10 [INF] 5.318 scrub starts
2017-11-28 08:13:37.807563 osd.10 [INF] 5.318 scrub ok
2017-11-28 08:13:36.235023 osd.45 [INF] 8.4 starting backfill to osd.2 from (0'0,0'0] MAX to 538'65004
2017-11-28 08:13:36.236576 osd.45 [INF] 8.4 starting backfill to osd.8 from (0'0,0'0] MAX to 538'65004
2017-11-28 08:13:39.607783 osd.3 [INF] 5.185 scrub starts
2017-11-28 08:13:39.608687 osd.3 [INF] 5.185 scrub ok
2017-11-28 08:13:41.357592 mon.0 [INF] pgmap v57453: 3736 pgs: 37 activating, 7 activating+remapped, 4 active+undersized+remapped, 75 active, 869 peering, 157 active+remapped, 174 remapped+peering, 540 active+undersized+degraded, 1873 active+clean; 10343 bytes data, 3813 MB used, 174 TB / 174 TB avail; 22/791 objects degraded (2.781%); 5/791 objects misplaced (0.632%); 87 B/s, 23 objects/s recovering
2017-11-28 08:13:42.397617 mon.0 [INF] pgmap v57454: 3736 pgs: 146 active, 3 peering, 338 active+remapped, 540 active+undersized+degraded, 2709 active+clean; 10343 bytes data, 3835 MB used, 174 TB / 174 TB avail; 31/791 objects degraded (3.919%); 14/791 objects misplaced (1.770%); 2765 B/s, 27 keys/s, 56 objects/s recovering
2017-11-28 08:13:37.396991 osd.14 [INF] 5.332 scrub starts
2017-11-28 08:13:37.397496 osd.14 [INF] 5.332 scrub ok
2017-11-28 08:13:42.524505 osd.6 [INF] 3.185 scrub starts
2017-11-28 08:13:42.525389 osd.6 [INF] 3.185 scrub ok
2017-11-28 08:13:43.385342 mon.0 [INF] pgmap v57455: 3736 pgs: 146 active, 338 active+remapped, 540 active+undersized+degraded, 2712 active+clean; 10343 bytes data, 3847 MB used, 174 TB / 174 TB avail; 31/791 objects degraded (3.919%); 14/791 objects misplaced (1.770%); 2768 B/s, 28 keys/s, 33 objects/s recovering
2017-11-28 08:13:43.397979 osd.14 [INF] 8.0 scrub starts
2017-11-28 08:13:43.401167 osd.14 [INF] 8.0 scrub ok
2017-11-28 08:13:44.392089 mon.0 [INF] pgmap v57456: 3736 pgs: 146 active, 338 active+remapped, 540 active+undersized+degraded, 2712 active+clean; 10343 bytes data, 3848 MB used, 174 TB / 174 TB avail; 31/791 objects degraded (3.919%); 14/791 objects misplaced (1.770%)
2017-11-28 08:13:45.206293 mon.0 [INF] HEALTH_WARN; 540 pgs degraded; 540 pgs stuck degraded; 1024 pgs stuck unclean; 540 pgs stuck undersized; 540 pgs undersized; recovery 31/791 objects degraded (3.919%); recovery 14/791 objects misplaced (1.770%)
...
2017-11-28 08:14:10.362591 osd.44 [WRN] 1 slow requests, 1 included below; oldest blocked for > 30.779132 secs
2017-11-28 08:14:10.362600 osd.44 [WRN] slow request 30.779132 seconds old, received at 2017-11-28 08:13:39.583415: osd_op(client.4740.0:153303 4.31099063 (undecoded) ondisk+write+known_if_redirected e541) currently no flag points reached
2017-11-28 08:14:11.579659 mon.0 [INF] pgmap v57474: 3736 pgs: 146 active, 338 active+remapped, 540 active+undersized+degraded, 2712 active+clean; 10343 bytes data, 3852 MB used, 174 TB / 174 TB avail; 31/791 objects degraded (3.919%); 14/791 objects misplaced (1.770%)
2017-11-28 08:14:40.365929 osd.44 [WRN] 1 slow requests, 1 included below; oldest blocked for > 60.782471 secs
2017-11-28 08:14:40.365934 osd.44 [WRN] slow request 60.782471 seconds old, received at 2017-11-28 08:13:39.583415: osd_op(client.4740.0:153303 4.31099063 (undecoded) ondisk+write+known_if_redirected e541) currently no flag points reached
2017-11-28 08:14:45.207183 mon.0 [INF] HEALTH_WARN; 540 pgs degraded; 540 pgs stuck degraded; 1024 pgs stuck unclean; 540 pgs stuck undersized; 540 pgs undersized; 1 requests are blocked > 32 sec; recovery 31/791 objects degraded (3.919%); recovery 14/791 objects misplaced (1.770%)
2017-11-28 08:14:46.657287 mon.0 [INF] pgmap v57478: 3736 pgs: 146 active, 338 active+remapped, 540 active+undersized+degraded, 2712 active+clean; 10343 bytes data, 3852 MB used, 174 TB / 174 TB avail; 31/791 objects degraded (3.919%); 14/791 objects misplaced (1.770%)
2017-11-28 08:15:40.372583 osd.44 [WRN] 1 slow requests, 1 included below; oldest blocked for > 120.789122 secs
2017-11-28 08:15:40.372589 osd.44 [WRN] slow request 120.789122 seconds old, received at 2017-11-28 08:13:39.583415: osd_op(client.4740.0:153303 4.31099063 (undecoded) ondisk+write+known_if_redirected e541) currently no flag points reached
2017-11-28 08:15:56.664417 mon.0 [INF] pgmap v57479: 3736 pgs: 146 active, 338 active+remapped, 540 active+undersized+degraded, 2712 active+clean; 10343 bytes data, 3852 MB used, 174 TB / 174 TB avail; 31/791 objects degraded (3.919%); 14/791 objects misplaced (1.770%)
# NOW CEPH STATUS IS
root@host01:~# ceph status
cluster a6f73750-1972-47f6-bcf5-a99753be65ad
health HEALTH_WARN
540 pgs degraded
540 pgs stuck degraded
1024 pgs stuck unclean
540 pgs stuck undersized
540 pgs undersized
1 requests are blocked > 32 sec
recovery 31/791 objects degraded (3.919%)
recovery 14/791 objects misplaced (1.770%)
monmap e2: 3 mons at {host01=10.212.32.23:6789/0,host02=10.212.32.24:6789/0,host03=10.212.32.25:6789/0}
election epoch 22, quorum 0,1,2 host01,host02,host03
osdmap e544: 60 osds: 48 up, 48 in; 1024 remapped pgs
flags sortbitwise,require_jewel_osds
pgmap v57508: 3736 pgs, 19 pools, 10343 bytes data, 241 objects
3786 MB used, 174 TB / 174 TB avail
31/791 objects degraded (3.919%)
14/791 objects misplaced (1.770%)
2712 active+clean
540 active+undersized+degraded
338 active+remapped
146 active
root@host01:~#
# IT LOOKS LIKE 338 PGs IN ERASURE CODED POOLS HAVE BEEN REMAPPED
# I DON'T GET WHY 540 PGs ARE STILL IN THE active+undersized+degraded STATE
root@host01:~# ceph pg dump pgs_brief |grep 'active+remapped'
dumped pgs_brief in format plain
16.6f active+remapped [43,2147483647,2,31,12] 43 [43,33,2,31,12] 43
16.6e active+remapped [10,5,35,44,2147483647] 10 [10,5,35,44,41] 10
....
root@host01:~# egrep '16.6f|16.6e' PGs_on_HOST_host05
16.6f active+clean [43,33,2,59,12] 43 [43,33,2,59,12] 43
16.6e active+clean [10,5,49,35,41] 10 [10,5,49,35,41] 10
root@host01:~#
root@host01:~# ceph pg dump pgs_brief |grep 'active+undersized+degraded'
dumped pgs_brief in format plain
19.6c active+undersized+degraded [24,20,19,2147483647,46] 24 [24,20,19,2147483647,46] 24
17.6e active+undersized+degraded [19,2147483647,36,31,5] 19 [19,2147483647,36,31,5] 19
...
root@host01:~# egrep '19.6c|17.6e' PGs_on_HOST_host05
19.6c active+clean [24,20,19,58,46] 24 [24,20,19,58,46] 24
17.6e active+clean [19,59,36,31,5] 19 [19,59,36,31,5] 19
root@host01:~#
# POOLS DETAILS
root@host01:~# ceph osd lspools
0 rbd,1 .rgw.root,2 vms,3 images,4 default.rgw.control,5 volumes,6 default.rgw.data.root.old,7 default.rgw.gc,8 default.rgw.log,9 default.rgw.users.uid,10 default.rgw.users.keys,11 default.rgw.users.email,12 default.rgw.buckets.index,13 default.rgw.usage,14 default.rgw.buckets.data.old,15 ecpool_3_2,16 default.rgw.data.root,17 default.rgw.data.root.new01,19 default.rgw.buckets.data,
rbd size: 3 pgp_num: 64
.rgw.root size: 3 pgp_num: 8
vms size: 3 pgp_num: 1024
images size: 3 pgp_num: 512
default.rgw.control size: 3 pgp_num: 8
volumes size: 3 pgp_num: 1024
default.rgw.data.root.old size: 3 pgp_num: 8
default.rgw.gc size: 3 pgp_num: 8
default.rgw.log size: 3 pgp_num: 8
default.rgw.users.uid size: 3 pgp_num: 8
default.rgw.users.keys size: 3 pgp_num: 8
default.rgw.users.email size: 3 pgp_num: 8
default.rgw.buckets.index size: 3 pgp_num: 8
default.rgw.usage size: 3 pgp_num: 8
default.rgw.buckets.data.old size: 3 pgp_num: 8
ecpool_3_2 size: 5 pgp_num: 256
default.rgw.data.root size: 5 pgp_num: 256
default.rgw.data.root.new01 size: 5 pgp_num: 256
default.rgw.buckets.data size: 5 pgp_num: 256
# EC pools use below profile
root@host01:~# ceph osd erasure-code-profile get ec_profile_k_3_m_2
jerasure-per-chunk-alignment=false
k=3
m=2
plugin=jerasure
ruleset-failure-domain=host
ruleset-root=default
technique=reed_sol_van
w=8
root@host01:~#
# PGs that are in active+remapped or active+undersized+degraded state belong to erasure coded pools only
root@host01:~# ceph pg dump pgs_brief |grep 'active+remapped' |cut -d '.' -f1 |sort |uniq
dumped pgs_brief in format plain
15
16
17
19
root@host01:~# ceph pg dump pgs_brief |grep 'active+undersized+degraded' |cut -d '.' -f1 |sort |uniq
dumped pgs_brief in format plain
15
16
17
19
# FINALLY, CRUSH MAP IS
root@host01:~# cat crushmap.txt
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_vary_r 1
tunable straw_calc_version 1
# devices
device 0 osd.0
device 1 osd.1
device 2 osd.2
device 3 osd.3
device 4 osd.4
device 5 osd.5
device 6 osd.6
device 7 osd.7
device 8 osd.8
device 9 osd.9
device 10 osd.10
device 11 osd.11
device 12 osd.12
device 13 osd.13
device 14 osd.14
device 15 osd.15
device 16 osd.16
device 17 osd.17
device 18 osd.18
device 19 osd.19
device 20 osd.20
device 21 osd.21
device 22 osd.22
device 23 osd.23
device 24 osd.24
device 25 osd.25
device 26 osd.26
device 27 osd.27
device 28 osd.28
device 29 osd.29
device 30 osd.30
device 31 osd.31
device 32 osd.32
device 33 osd.33
device 34 osd.34
device 35 osd.35
device 36 osd.36
device 37 osd.37
device 38 osd.38
device 39 osd.39
device 40 osd.40
device 41 osd.41
device 42 osd.42
device 43 osd.43
device 44 osd.44
device 45 osd.45
device 46 osd.46
device 47 osd.47
device 48 osd.48
device 49 osd.49
device 50 osd.50
device 51 osd.51
device 52 osd.52
device 53 osd.53
device 54 osd.54
device 55 osd.55
device 56 osd.56
device 57 osd.57
device 58 osd.58
device 59 osd.59
# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root
# buckets
host host01 {
id -2 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.0 weight 3.636
item osd.3 weight 3.636
item osd.4 weight 3.636
item osd.6 weight 3.636
item osd.8 weight 3.636
item osd.10 weight 3.636
item osd.12 weight 3.636
item osd.14 weight 3.636
item osd.16 weight 3.636
item osd.19 weight 3.636
item osd.22 weight 3.636
item osd.25 weight 3.636
}
rack Rack01 {
id -3 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item host01 weight 43.637
}
host host02 {
id -7 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.1 weight 3.636
item osd.2 weight 3.636
item osd.5 weight 3.636
item osd.7 weight 3.636
item osd.9 weight 3.636
item osd.11 weight 3.636
item osd.13 weight 3.636
item osd.15 weight 3.636
item osd.17 weight 3.636
item osd.20 weight 3.636
item osd.23 weight 3.636
item osd.26 weight 3.636
}
rack Rack02 {
id -8 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item host02 weight 43.637
}
host host03 {
id -9 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.18 weight 3.636
item osd.21 weight 3.636
item osd.24 weight 3.636
item osd.27 weight 3.636
item osd.28 weight 3.636
item osd.29 weight 3.636
item osd.30 weight 3.636
item osd.31 weight 3.636
item osd.32 weight 3.636
item osd.33 weight 3.636
item osd.34 weight 3.636
item osd.35 weight 3.636
}
host host04 {
id -11 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.36 weight 3.636
item osd.37 weight 3.636
item osd.38 weight 3.636
item osd.39 weight 3.636
item osd.40 weight 3.636
item osd.41 weight 3.636
item osd.42 weight 3.636
item osd.43 weight 3.636
item osd.44 weight 3.636
item osd.45 weight 3.636
item osd.46 weight 3.636
item osd.47 weight 3.636
}
host host05 {
id -12 # do not change unnecessarily
# weight 43.637
alg straw
hash 0 # rjenkins1
item osd.48 weight 3.636
item osd.49 weight 3.636
item osd.50 weight 3.636
item osd.51 weight 3.636
item osd.52 weight 3.636
item osd.53 weight 3.636
item osd.54 weight 3.636
item osd.55 weight 3.636
item osd.56 weight 3.636
item osd.57 weight 3.636
item osd.58 weight 3.636
item osd.59 weight 3.636
}
rack Rack03 {
id -10 # do not change unnecessarily
# weight 130.910
alg straw
hash 0 # rjenkins1
item host03 weight 43.637
item host04 weight 43.637
item host05 weight 43.637
}
room MyRoom {
id -4 # do not change unnecessarily
# weight 218.184
alg straw
hash 0 # rjenkins1
item Rack01 weight 43.637
item Rack02 weight 43.637
item Rack03 weight 130.910
}
datacenter MyDC {
id -5 # do not change unnecessarily
# weight 218.184
alg straw
hash 0 # rjenkins1
item MyRoom weight 218.184
}
region MyRegion {
id -6 # do not change unnecessarily
# weight 218.184
alg straw
hash 0 # rjenkins1
item MyDC weight 218.184
}
root default {
id -1 # do not change unnecessarily
# weight 218.184
alg straw
hash 0 # rjenkins1
item MyRegion weight 218.184
}
# rules
rule replicated_ruleset {
ruleset 0
type replicated
min_size 1
max_size 10
step take default
step chooseleaf firstn 0 type host
step emit
}
rule ecpool_3_2 {
ruleset 1
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}
rule default.rgw.data.root.new {
ruleset 2
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}
rule default.rgw.data.root.new01 {
ruleset 3
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}
rule default.rgw.buckets.data.new {
ruleset 4
type erasure
min_size 3
max_size 5
step set_chooseleaf_tries 5
step set_choose_tries 100
step take default
step chooseleaf indep 0 type host
step emit
}
# end crush map
root@host01:~#
Jakub
_______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com