This is the output of "ceph osd df".
Thanks a lot!
Massimo
[root@ceph-mon-01 ~]# ceph osd df
ID CLASS WEIGHT REWEIGHT SIZE USE AVAIL %USE VAR PGS
30 hdd 5.45609 1.00000 5587G 1875G 3711G 33.57 0.65 140
31 hdd 5.45609 1.00000 5587G 3951G 1635G 70.72 1.38 144
32 hdd 5.45609 1.00000 5587G 3426G 2160G 61.33 1.19 127
33 hdd 5.45609 1.00000 5587G 3548G 2038G 63.51 1.24 167
34 hdd 5.45609 1.00000 5587G 1847G 3739G 33.06 0.64 121
35 hdd 5.45609 1.00000 5587G 2496G 3090G 44.68 0.87 161
36 hdd 5.45609 1.00000 5587G 3038G 2548G 54.38 1.06 153
37 hdd 5.45609 1.00000 5587G 2834G 2752G 50.73 0.99 122
38 hdd 5.45609 1.00000 5587G 2781G 2805G 49.79 0.97 124
39 hdd 5.45609 1.00000 5587G 3362G 2224G 60.18 1.17 141
40 hdd 5.45609 1.00000 5587G 2738G 2848G 49.02 0.95 139
41 hdd 5.45609 1.00000 5587G 2924G 2662G 52.35 1.02 129
42 hdd 5.45609 1.00000 5587G 2195G 3391G 39.29 0.77 116
43 hdd 5.45609 1.00000 5587G 2654G 2932G 47.51 0.93 132
44 hdd 5.45609 1.00000 5587G 3180G 2406G 56.93 1.11 125
45 hdd 5.45609 1.00000 5587G 2727G 2859G 48.82 0.95 152
46 hdd 5.45609 1.00000 5587G 2844G 2742G 50.91 0.99 153
47 hdd 5.45609 1.00000 5587G 2611G 2975G 46.74 0.91 127
48 hdd 5.45609 1.00000 5587G 3575G 2011G 63.99 1.25 139
49 hdd 5.45609 1.00000 5587G 1876G 3710G 33.59 0.65 121
10 hdd 5.45609 1.00000 5587G 2884G 2702G 51.64 1.01 128
11 hdd 5.45609 1.00000 5587G 3401G 2185G 60.89 1.19 130
12 hdd 5.45609 1.00000 5587G 4023G 1563G 72.01 1.40 153
13 hdd 5.45609 1.00000 5587G 1303G 4283G 23.34 0.45 131
14 hdd 5.45609 1.00000 5587G 2792G 2794G 49.97 0.97 135
15 hdd 5.45609 1.00000 5587G 1765G 3821G 31.61 0.62 123
16 hdd 5.45609 1.00000 5587G 3958G 1628G 70.86 1.38 152
17 hdd 5.45609 1.00000 5587G 4362G 1224G 78.09 1.52 139
18 hdd 5.45609 1.00000 5587G 2766G 2820G 49.51 0.96 144
19 hdd 5.45609 1.00000 5587G 3427G 2159G 61.34 1.19 131
20 hdd 5.45609 1.00000 5587G 3226G 2360G 57.75 1.12 162
21 hdd 5.45609 1.00000 5587G 2247G 3339G 40.22 0.78 146
22 hdd 5.45609 1.00000 5587G 2128G 3458G 38.10 0.74 124
23 hdd 5.45609 1.00000 5587G 2749G 2837G 49.21 0.96 133
24 hdd 5.45609 1.00000 5587G 3979G 1607G 71.24 1.39 148
25 hdd 5.45609 1.00000 5587G 2179G 3407G 39.02 0.76 121
26 hdd 5.45609 1.00000 5587G 3860G 1726G 69.09 1.35 151
27 hdd 5.45609 1.00000 5587G 2161G 3425G 38.68 0.75 137
28 hdd 5.45609 1.00000 5587G 3898G 1688G 69.78 1.36 141
29 hdd 5.45609 1.00000 5587G 2355G 3231G 42.15 0.82 121
0 hdd 5.45609 1.00000 5587G 3294G 2292G 58.97 1.15 127
1 hdd 5.45609 1.00000 5587G 2515G 3071G 45.02 0.88 132
2 hdd 5.45609 1.00000 5587G 3300G 2286G 59.07 1.15 144
3 hdd 5.45609 1.00000 5587G 2943G 2643G 52.68 1.03 151
4 hdd 5.45609 1.00000 5587G 2641G 2945G 47.29 0.92 114
5 hdd 5.45609 1.00000 5587G 2786G 2801G 49.87 0.97 131
6 hdd 5.45609 1.00000 5587G 2564G 3022G 45.90 0.89 121
7 hdd 5.45609 1.00000 5587G 1923G 3663G 34.43 0.67 143
8 hdd 5.45609 1.00000 5587G 2625G 2961G 46.99 0.91 130
9 hdd 5.45609 1.00000 5587G 2921G 2665G 52.30 1.02 140
TOTAL 272T 140T 132T 51.36
MIN/MAX VAR: 0.45/1.52 STDDEV: 12.09
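(For reference: VAR is each OSD's utilization relative to the cluster
average of 51.36%, so the MIN/MAX VAR of 0.45/1.52 corresponds to the
least-full OSD at about 23% (0.45 * 51.36; osd.13) and the fullest at about
78% (1.52 * 51.36; osd.17), even though all OSDs have the same weight; the
per-OSD PG counts range from 114 to 167.)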
On Mon, Jan 14, 2019 at 3:22 PM Wido den Hollander <wido@xxxxxxxx> wrote:
On 1/14/19 3:18 PM, Massimo Sgaravatto wrote:
> Thanks for the prompt reply
>
> Indeed I have different racks with different weights.
> Below is the "ceph osd tree" output:
>
Can you also show the output of 'ceph osd df'?
The number of PGs might be on the low side, which would also cause this imbalance.
If you do not have enough PGs, CRUSH can't distribute the data properly either.
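(As a rough sanity check, and assuming 3x replication, which this thread
does not state: a common rule of thumb is on the order of 100 PGs per OSD
after replication, i.e. roughly 50 OSDs * 100 / 3, or about 1700 PGs summed
over all pools, usually rounded to a power of two. The size and pg_num of
each pool can be checked with 'ceph osd pool ls detail'.)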
Wido
> [root@ceph-mon-01 ~]# ceph osd tree
> ID CLASS WEIGHT TYPE NAME STATUS REWEIGHT PRI-AFF
> -1 272.80426 root default
> -7 109.12170 rack Rack11-PianoAlto
> -8 54.56085 host ceph-osd-04
> 30 hdd 5.45609 osd.30 up 1.00000 1.00000
> 31 hdd 5.45609 osd.31 up 1.00000 1.00000
> 32 hdd 5.45609 osd.32 up 1.00000 1.00000
> 33 hdd 5.45609 osd.33 up 1.00000 1.00000
> 34 hdd 5.45609 osd.34 up 1.00000 1.00000
> 35 hdd 5.45609 osd.35 up 1.00000 1.00000
> 36 hdd 5.45609 osd.36 up 1.00000 1.00000
> 37 hdd 5.45609 osd.37 up 1.00000 1.00000
> 38 hdd 5.45609 osd.38 up 1.00000 1.00000
> 39 hdd 5.45609 osd.39 up 1.00000 1.00000
> -9 54.56085 host ceph-osd-05
> 40 hdd 5.45609 osd.40 up 1.00000 1.00000
> 41 hdd 5.45609 osd.41 up 1.00000 1.00000
> 42 hdd 5.45609 osd.42 up 1.00000 1.00000
> 43 hdd 5.45609 osd.43 up 1.00000 1.00000
> 44 hdd 5.45609 osd.44 up 1.00000 1.00000
> 45 hdd 5.45609 osd.45 up 1.00000 1.00000
> 46 hdd 5.45609 osd.46 up 1.00000 1.00000
> 47 hdd 5.45609 osd.47 up 1.00000 1.00000
> 48 hdd 5.45609 osd.48 up 1.00000 1.00000
> 49 hdd 5.45609 osd.49 up 1.00000 1.00000
> -6 109.12170 rack Rack15-PianoAlto
> -3 54.56085 host ceph-osd-02
> 10 hdd 5.45609 osd.10 up 1.00000 1.00000
> 11 hdd 5.45609 osd.11 up 1.00000 1.00000
> 12 hdd 5.45609 osd.12 up 1.00000 1.00000
> 13 hdd 5.45609 osd.13 up 1.00000 1.00000
> 14 hdd 5.45609 osd.14 up 1.00000 1.00000
> 15 hdd 5.45609 osd.15 up 1.00000 1.00000
> 16 hdd 5.45609 osd.16 up 1.00000 1.00000
> 17 hdd 5.45609 osd.17 up 1.00000 1.00000
> 18 hdd 5.45609 osd.18 up 1.00000 1.00000
> 19 hdd 5.45609 osd.19 up 1.00000 1.00000
> -4 54.56085 host ceph-osd-03
> 20 hdd 5.45609 osd.20 up 1.00000 1.00000
> 21 hdd 5.45609 osd.21 up 1.00000 1.00000
> 22 hdd 5.45609 osd.22 up 1.00000 1.00000
> 23 hdd 5.45609 osd.23 up 1.00000 1.00000
> 24 hdd 5.45609 osd.24 up 1.00000 1.00000
> 25 hdd 5.45609 osd.25 up 1.00000 1.00000
> 26 hdd 5.45609 osd.26 up 1.00000 1.00000
> 27 hdd 5.45609 osd.27 up 1.00000 1.00000
> 28 hdd 5.45609 osd.28 up 1.00000 1.00000
> 29 hdd 5.45609 osd.29 up 1.00000 1.00000
> -5 54.56085 rack Rack17-PianoAlto
> -2 54.56085 host ceph-osd-01
> 0 hdd 5.45609 osd.0 up 1.00000 1.00000
> 1 hdd 5.45609 osd.1 up 1.00000 1.00000
> 2 hdd 5.45609 osd.2 up 1.00000 1.00000
> 3 hdd 5.45609 osd.3 up 1.00000 1.00000
> 4 hdd 5.45609 osd.4 up 1.00000 1.00000
> 5 hdd 5.45609 osd.5 up 1.00000 1.00000
> 6 hdd 5.45609 osd.6 up 1.00000 1.00000
> 7 hdd 5.45609 osd.7 up 1.00000 1.00000
> 8 hdd 5.45609 osd.8 up 1.00000 1.00000
> 9 hdd 5.45609 osd.9 up 1.00000 1.00000
> [root@ceph-mon-01 ~]#
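>
> (To put numbers on the rack weights: Rack11-PianoAlto and Rack15-PianoAlto
> each weigh ~109.12 (two hosts, 20 OSDs), while Rack17-PianoAlto weighs
> ~54.56 (one host, 10 OSDs). If the crush rule replicated rack-wise with
> size 3, each rack would hold one full copy of the data and the 10 OSDs in
> Rack17 would fill roughly twice as fast; whether that applies here depends
> on the crush rule, which is not shown in this thread.)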
>
> On Mon, Jan 14, 2019 at 3:13 PM Dan van der Ster <dan@xxxxxxxxxxxxxx> wrote:
>
> On Mon, Jan 14, 2019 at 3:06 PM Massimo Sgaravatto
> <massimo.sgaravatto@xxxxxxxxx> wrote:
> >
> > I have a ceph luminous cluster running on CentOS7 nodes.
> > This cluster has 50 OSDs, all with the same size and all with the
> same weight.
> >
> > Since I noticed quite an "unfair" usage of the OSD nodes (some used at
> 30%, some at 70%), I tried to activate the balancer.
> >
> > But the balancer doesn't start, I guess because of this problem:
> >
> > [root@ceph-mon-01 ~]# ceph osd crush weight-set create-compat
> > Error EPERM: crush map contains one or more bucket(s) that are not
> straw2
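> >
> > (To check which buckets still use the old algorithm, 'ceph osd crush dump'
> > prints every bucket together with its "alg" field, so something like
> >   ceph osd crush dump | grep alg
> > gives a quick per-bucket overview of straw vs straw2.)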
> >
> >
> > So I issued the command to convert from straw to straw2 (all the
> clients are running luminous):
> >
> >
> > [root@ceph-mon-01 ~]# ceph osd crush set-all-straw-buckets-to-straw2
> > Error EINVAL: new crush map requires client version hammer but
> require_min_compat_client is firefly
> > [root@ceph-mon-01 ~]# ceph osd set-require-min-compat-client jewel
> > set require_min_compat_client to jewel
> > [root@ceph-mon-01 ~]# ceph osd crush set-all-straw-buckets-to-straw2
> > [root@ceph-mon-01 ~]#
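> >
> > (Side note: 'ceph features' lists the releases and feature bits of the
> > currently connected clients, which is a quick way to verify that nothing
> > older than jewel is still connected before raising
> > require_min_compat_client; setting it to luminous instead of jewel would
> > additionally allow the balancer's upmap mode, assuming all clients really
> > are luminous.)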
> >
> >
> > After having issued the command, the cluster went into WARNING state
> because ~12% of the objects were misplaced.
> >
> > Is this normal?
> > I read somewhere that the migration from straw to straw2 should
> trigger a data migration only if the OSDs have different sizes,
> which is not the case here.
>
> The relevant sizes to compare are those of the crush buckets across which
> you are replicating.
> Are you replicating host-wise or rack-wise?
> Do you have hosts/racks with a different crush weight (e.g. a different
> size)?
> Maybe share your `ceph osd tree`.
>
> Cheers, dan
>
>
>
> >
> >
> > The cluster is still recovering, but what is worrying me is that
> it looks like data is being moved to the most-used OSDs and
> the MAX_AVAIL value is decreasing quite quickly.
> >
> > I hope that the recovery can finish without causing problems: then
> I will immediately activate the balancer.
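> >
> > (For reference, a minimal balancer sequence on luminous would be roughly:
> >   ceph mgr module enable balancer
> >   ceph balancer mode crush-compat
> >   ceph balancer on
> >   ceph balancer status
> > with 'upmap' as an alternative mode once require_min_compat_client is set
> > to luminous; the choice of mode is an assumption here, not something the
> > thread itself settles.)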
> >
> > But, if some OSDs are getting too full, is it safe to decrease
> their weights while the cluster is still recovering?
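> >
> > (For illustration only: a temporary override such as
> >   ceph osd reweight 17 0.95
> > lowers just the reweight value of the fullest OSD (osd.17 in the 'ceph osd
> > df' output above) without touching its crush weight, and can be undone
> > later with 'ceph osd reweight 17 1.0'.)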
> >
> > Thanks a lot for your help
> > Of course I can provide other info, if needed
> >
> >
> > Cheers, Massimo
> >
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com