Re: osd balancing question

Hello Christian,
Sorry, my mistake; it's Infernalis we're running (9.2.1).

Our tree looks like this:

root@ecprdbcph01-opens:~# ceph osd tree
ID WEIGHT   TYPE NAME                  UP/DOWN REWEIGHT PRIMARY-AFFINITY
-1 51.26370 root default
-2  8.49396     host ecprdbcph03-opens
 3  0.86800         osd.3                   up  1.00000          1.00000
 4  0.86800         osd.4                   up  1.00000          1.00000
 6  0.86800         osd.6                   up  1.00000          1.00000
 8  0.86800         osd.8                   up  1.00000          1.00000
10  0.86800         osd.10                  up  1.00000          1.00000
12  0.79999         osd.12                  up  1.00000          1.00000
14  0.75000         osd.14                  up  1.00000          1.00000
16  0.86800         osd.16                  up  1.00000          1.00000
18  0.86800         osd.18                  up  1.00000          1.00000
29  0.86800         osd.29                  up  1.00000          1.00000
-3  8.40794     host ecprdbcph01-opens
 5  0.84999         osd.5                   up  0.87172          1.00000
 7  0.86800         osd.7                   up  1.00000          1.00000
 9  0.86800         osd.9                   up  0.85579          1.00000
11  0.86800         osd.11                  up  1.00000          1.00000
13  0.84999         osd.13                  up  1.00000          1.00000
15  0.86800         osd.15                  up  1.00000          1.00000
17  0.79999         osd.17                  up  1.00000          1.00000
19  0.86800         osd.19                  up  1.00000          1.00000
 1  0.70000         osd.1                   up  1.00000          1.00000
 2  0.86800         osd.2                   up  1.00000          1.00000
-4  8.45793     host ecprdbcph02-opens
20  0.84999         osd.20                  up  0.85753          1.00000
25  0.86800         osd.25                  up  1.00000          1.00000
26  0.86800         osd.26                  up  1.00000          1.00000
21  0.79999         osd.21                  up  0.86604          1.00000
22  0.86800         osd.22                  up  1.00000          1.00000
23  0.86800         osd.23                  up  1.00000          1.00000
24  0.79999         osd.24                  up  1.00000          1.00000
27  0.86800         osd.27                  up  0.79852          1.00000
28  0.86800         osd.28                  up  0.89223          1.00000
 0  0.79999         osd.0                   up  0.84404          1.00000
-5  8.67996     host ecprdbcph06-opens
30  0.86800         osd.30                  up  0.90486          1.00000
32  0.86800         osd.32                  up  0.83070          1.00000
34  0.86800         osd.34                  up  1.00000          1.00000
41  0.86800         osd.41                  up  0.83226          1.00000
44  0.86800         osd.44                  up  1.00000          1.00000
47  0.86800         osd.47                  up  1.00000          1.00000
49  0.86800         osd.49                  up  1.00000          1.00000
51  0.86800         osd.51                  up  1.00000          1.00000
53  0.86800         osd.53                  up  1.00000          1.00000
55  0.86800         osd.55                  up  1.00000          1.00000
-6  8.54395     host ecprdbcph04-opens
31  0.86800         osd.31                  up  0.85638          1.00000
36  0.86800         osd.36                  up  1.00000          1.00000
38  0.86800         osd.38                  up  1.00000          1.00000
40  0.79999         osd.40                  up  0.89485          1.00000
42  0.86800         osd.42                  up  0.86018          1.00000
45  0.79999         osd.45                  up  1.00000          1.00000
48  0.86800         osd.48                  up  0.88635          1.00000
50  0.86800         osd.50                  up  1.00000          1.00000
52  0.86800         osd.52                  up  1.00000          1.00000
54  0.86800         osd.54                  up  1.00000          1.00000
-7  8.67996     host ecprdbcph05-opens
33  0.86800         osd.33                  up  1.00000          1.00000
35  0.86800         osd.35                  up  1.00000          1.00000
37  0.86800         osd.37                  up  1.00000          1.00000
39  0.86800         osd.39                  up  1.00000          1.00000
43  0.86800         osd.43                  up  1.00000          1.00000
46  0.86800         osd.46                  up  0.85481          1.00000
56  0.86800         osd.56                  up  1.00000          1.00000
57  0.86800         osd.57                  up  0.88829          1.00000
58  0.86800         osd.58                  up  1.00000          1.00000
59  0.86800         osd.59                  up  0.80495          1.00000
root@ecprdbcph01-opens:~#

We have an ongoing capacity issue, as you can see below (although we're only using less than 80%):


root@ecprdbcph01-opens:/var/lib/ceph/osd/ceph-11/current# ceph df
GLOBAL:
    SIZE       AVAIL      RAW USED     %RAW USED
    53329G     11219G       42110G         78.96


osd.12 is near full at 85%
osd.16 is near full at 85%
osd.17 is near full at 87%
osd.19 is near full at 85%
osd.22 is near full at 87%
osd.24 is near full at 87%
osd.29 is near full at 85%
osd.33 is near full at 86%
osd.39 is near full at 85%
osd.42 is near full at 87%
osd.45 is near full at 87%
osd.47 is near full at 87%
osd.49 is near full at 88%
osd.58 is near full at 87%



I'm trying to decrease the weight as you've suggested, but it looks like we're running into some trouble:

ceph osd crush reweight osd.11 0.98

tail -f ceph-osd.11.log


2017-01-03 07:38:41.952538 7f9a5c7e1700  0 -- 10.63.4.1:6808/3301342 >> 10.63.4.19:6827/2264381 pipe(0x7f9ad3df4000 sd=442 :6808 s=0 pgs=0 cs=0 l=0 c=0x7f9ac2530000).accept connect_seq 34 vs existing 33 state standby
2017-01-03 07:41:46.566313 7f9a73871700  0 -- 10.63.4.1:6808/3301342 >> 10.63.4.1:6830/3303583 pipe(0x7f9ac80d5000 sd=376 :6808 s=0 pgs=0 cs=0 l=0 c=0x7f9ac2530160).accept connect_seq 4 vs existing 4 state standby
2017-01-03 07:41:46.566370 7f9a73871700  0 -- 10.63.4.1:6808/3301342 >> 10.63.4.1:6830/3303583 pipe(0x7f9ac80d5000 sd=376 :6808 s=0 pgs=0 cs=0 l=0 c=0x7f9ac2530160).accept connect_seq 5 vs existing 4 state standby
2017-01-03 07:41:46.585562 7f9a631d9700  0 -- 10.63.4.1:6808/3301342 >> 10.63.4.1:6824/3303035 pipe(0x7f9ab9940000 sd=283 :6808 s=0 pgs=0 cs=0 l=0 c=0x7f9ac2532ec0).accept connect_seq 5 vs existing 5 state standby
2017-01-03 07:41:46.585608 7f9a631d9700  0 -- 10.63.4.1:6808/3301342 >> 10.63.4.1:6824/3303035 pipe(0x7f9ab9940000 sd=283 :6808 s=0 pgs=0 cs=0 l=0 c=0x7f9ac2532ec0).accept connect_seq 6 vs existing 5 state standby
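
For completeness, these are the (standard) commands I'm using to confirm the reweight took effect and to watch the data movement -- nothing here is specific to our setup, and the grep patterns are just my own convenience:

ceph osd tree | grep -w 'osd.11'    # the WEIGHT column should show the new CRUSH weight
ceph -s                             # watch recovery/backfill progress
ceph health detail | grep -i full   # re-list the nearfull OSDs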

In general, I've also tried to use reweight-by-utilization, but it doesn't seem to work so well.
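
(For reference, this is how I invoked it -- the threshold is my own guess; if omitted, the default is 120, i.e. only OSDs above 120% of the average utilization are adjusted:)

ceph osd reweight-by-utilization 110   # lower the override reweight of OSDs above 110% of average utilization
ceph osd tree                          # the REWEIGHT column shows the resulting override values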


Is there any known bug with our version? Will a restart of the OSDs solve this issue? (It was mentioned in one of the forum threads, but that was related to Firefly.)

Many thanks.







Yair Magnezi
Storage & Data Protection TL   // Kenshoo
Office +972 7 32862423   // Mobile +972 50 575-2955
__________________________________________



On Tue, Jan 3, 2017 at 1:41 PM, Christian Balzer <chibi@xxxxxxx> wrote:

Hello,

On Tue, 3 Jan 2017 13:08:50 +0200 Yair Magnezi wrote:

> Hello cephers
> We're running firefly  ( 9.2.1 )

One of these two is wrong; you're either running Firefly (0.8.x, old and
unsupported) or Infernalis (9.2.x, non-LTS and thus also unsupported).


> I'm trying to rebalance our cluster's OSDs, and for some reason it looks
> like the rebalance is going the wrong way:

A "ceph osd tree" would be helpful for starters.

> What I'm trying to do is reduce the load on osd.14 (ceph osd
> crush reweight osd.14 0.75), but what I see is that the backfill process is
> moving PGs to osd.29, which is also 86% full.
> I wonder why CRUSH doesn't map to the less occupied OSDs (3, 4, 6,
> for example).
> Any input is much appreciated.
>

CRUSH isn't particularly deterministic from a human perspective, and often
data movements will involve steps that are not anticipated.
CRUSH also does NOT know or consider the utilization of OSDs, only their
weights.

If you're seeing extreme imbalances, RAISE the weight of the least
utilized OSDs first (and in very small increments, until you get a
feel for things).
Do this in a way that keeps the weights of the hosts more or less the same
in the end.

Christian

>
>
> 2017-01-03 05:59:20.877705 7f3e6a0d6700  0 log_channel(cluster) log
> [INF] : *2.2cb
> starting backfill to osd.29 from* (0'0,0'0] MAX to 131306'8029954
> 2017-01-03 05:59:20.877841 7f3e670d0700  0 log_channel(cluster) log [INF] :
> 2.30d starting backfill to osd.10 from (0'0,0'0] MAX to 131306'8721158
> 2017-01-03 05:59:31.374323 7f3e356b0700  0 -- 10.63.4.3:6826/3125306 >>
> 10.63.4.5:6821/3162046 pipe(0x7f3e9d513000 sd=322 :6826 s=0 pgs=0 cs=0 l=0
> c=0x7f3ea72b5de0).accept connect_seq 1605 vs existing 1605 state standby
> 2017-01-03 05:59:31.374440 7f3e356b0700  0 -- 10.63.4.3:6826/3125306 >>
> 10.63.4.5:6821/3162046 pipe(0x7f3e9d513000 sd=322 :6826 s=0 pgs=0 cs=0 l=0
> c=0x7f3ea72b5de0).accept connect_seq 1606 vs existing 1605 state standby
> ^C
> root@ecprdbcph03-opens:/var/log/ceph# df -h
> Filesystem                           Size  Used Avail Use% Mounted on
> udev                                  32G  4.0K   32G   1% /dev
> tmpfs                                6.3G  1.4M  6.3G   1% /run
> /dev/dm-1                            106G  4.1G   96G   5% /
> none                                 4.0K     0  4.0K   0% /sys/fs/cgroup
> none                                 5.0M     0  5.0M   0% /run/lock
> none                                  32G     0   32G   0% /run/shm
> none                                 100M     0  100M   0% /run/user
> /dev/sdk2                            465M   50M  391M  12% /boot
> /dev/sdk1                            512M  3.4M  509M   1% /boot/efi
> ec-mapr-prd:/mapr/ec-mapr-prd/homes  262T  143T  119T  55% /export/home
> /dev/sde1                            889G  640G  250G  72%
> /var/lib/ceph/osd/ceph-3
> /dev/sdf1                            889G  656G  234G  74%
> /var/lib/ceph/osd/ceph-4
> /dev/sdg1                            889G  583G  307G  66%
> /var/lib/ceph/osd/ceph-6
> /dev/sda1                            889G  559G  331G  63%
> /var/lib/ceph/osd/ceph-8
> /dev/sdb1                            889G  651G  239G  74%
> /var/lib/ceph/osd/ceph-10
> /dev/sdc1                            889G  751G  139G  85%
> /var/lib/ceph/osd/ceph-12
> /dev/sdh1                            889G  759G  131G  86%
> /var/lib/ceph/osd/ceph-14
> /dev/sdi1                            889G  763G  127G  86%
> /var/lib/ceph/osd/ceph-16
> /dev/sdj1                            889G  732G  158G  83%
> /var/lib/ceph/osd/ceph-18
> /dev/sdd1                            889G  756G  134G  86%
> /var/lib/ceph/osd/ceph-29
> root@ecprdbcph03-opens:/var/log/ceph#
>
> Thanks
>
>
>
> *Yair Magnezi *
>
>
>
> *Storage & Data Protection TL   // Kenshoo*
>


--
Christian Balzer        Network/Systems Engineer
chibi@xxxxxxx           Global OnLine Japan/Rakuten Communications
http://www.gol.com/


This e-mail, as well as any attached document, may contain material which is confidential and privileged and may include trademark, copyright and other intellectual property rights that are proprietary to Kenshoo Ltd,  its subsidiaries or affiliates ("Kenshoo"). This e-mail and its attachments may be read, copied and used only by the addressee for the purpose(s) for which it was disclosed herein. If you have received it in error, please destroy the message and any attachment, and contact us immediately. If you are not the intended recipient, be aware that any review, reliance, disclosure, copying, distribution or use of the contents of this message without Kenshoo's express permission is strictly prohibited.
_______________________________________________
ceph-users mailing list
ceph-users@xxxxxxxxxxxxxx
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
