Hi all,
after encountering a warning about one of my OSDs running out of space, I tried to get a better understanding of how data distribution works.
I'm running a Hammer Ceph cluster, v0.94.7.
I did some tests with crushtool, trying to figure out how to achieve an even data distribution across the OSDs.
Let's take this simple CRUSH map:
# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable straw_calc_version 1
tunable chooseleaf_vary_r 1
# devices
# ceph-osd-001
device 0 osd.0 # sata-p
device 1 osd.1 # sata-p
device 3 osd.3 # sata-p
device 4 osd.4 # sata-p
device 5 osd.5 # sata-p
device 7 osd.7 # sata-p
device 9 osd.9 # sata-p
device 10 osd.10 # sata-p
device 11 osd.11 # sata-p
device 13 osd.13 # sata-p
# ceph-osd-002
device 14 osd.14 # sata-p
device 15 osd.15 # sata-p
device 16 osd.16 # sata-p
device 18 osd.18 # sata-p
device 19 osd.19 # sata-p
device 21 osd.21 # sata-p
device 23 osd.23 # sata-p
device 24 osd.24 # sata-p
device 25 osd.25 # sata-p
device 26 osd.26 # sata-p
# ceph-osd-003
device 28 osd.28 # sata-p
device 29 osd.29 # sata-p
device 30 osd.30 # sata-p
device 31 osd.31 # sata-p
device 32 osd.32 # sata-p
device 33 osd.33 # sata-p
device 34 osd.34 # sata-p
device 35 osd.35 # sata-p
device 36 osd.36 # sata-p
device 41 osd.41 # sata-p
# types
type 0 osd
type 1 server
type 3 datacenter
# buckets
### CEPH-OSD-003 ###
server ceph-osd-003-sata-p {
id -12
alg straw
hash 0 # rjenkins1
item osd.28 weight 1.000
item osd.29 weight 1.000
item osd.30 weight 1.000
item osd.31 weight 1.000
item osd.32 weight 1.000
item osd.33 weight 1.000
item osd.34 weight 1.000
item osd.35 weight 1.000
item osd.36 weight 1.000
item osd.41 weight 1.000
}
### CEPH-OSD-002 ###
server ceph-osd-002-sata-p {
id -9
alg straw
hash 0 # rjenkins1
item osd.14 weight 1.000
item osd.15 weight 1.000
item osd.16 weight 1.000
item osd.18 weight 1.000
item osd.19 weight 1.000
item osd.21 weight 1.000
item osd.23 weight 1.000
item osd.24 weight 1.000
item osd.25 weight 1.000
item osd.26 weight 1.000
}
### CEPH-OSD-001 ###
server ceph-osd-001-sata-p {
id -5
alg straw
hash 0 # rjenkins1
item osd.0 weight 1.000
item osd.1 weight 1.000
item osd.3 weight 1.000
item osd.4 weight 1.000
item osd.5 weight 1.000
item osd.7 weight 1.000
item osd.9 weight 1.000
item osd.10 weight 1.000
item osd.11 weight 1.000
item osd.13 weight 1.000
}
# DATACENTER
datacenter dc1 {
id -1
alg straw
hash 0 # rjenkins1
item ceph-osd-001-sata-p weight 10.000
item ceph-osd-002-sata-p weight 10.000
item ceph-osd-003-sata-p weight 10.000
}
# rules
rule sata-p {
ruleset 0
type replicated
min_size 2
max_size 10
step take dc1
step chooseleaf firstn 0 type server
step emit
}
# end crush map
Basically it's 30 OSDs spread across 3 servers. A single rule exists, the classic replica-3 (one replica per server).
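Here is the crushtool test I ran. For reference, the "expected" column it prints is just the total number of placements divided evenly across the equally weighted devices; a minimal sketch of that arithmetic, assuming only the parameters of the run below:

# Where the "expected : 102.400009" in the output below comes from
# (the printed value is just 102.4 plus floating-point noise).
inputs = 1024          # x = 0..1023
replicas = 3           # --num-rep 3
devices = 30           # 3 servers x 10 equally weighted OSDs

print(inputs * replicas / devices)   # 102.4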
cephadm@cephadm01:/etc/ceph/$ crushtool -i crushprova.c --test --show-utilization --num-rep 3 --tree --max-x 1023
ID WEIGHT TYPE NAME
-1 30.00000 datacenter dc1
-5 10.00000 server ceph-osd-001-sata-p
0 1.00000 osd.0
1 1.00000 osd.1
3 1.00000 osd.3
4 1.00000 osd.4
5 1.00000 osd.5
7 1.00000 osd.7
9 1.00000 osd.9
10 1.00000 osd.10
11 1.00000 osd.11
13 1.00000 osd.13
-9 10.00000 server ceph-osd-002-sata-p
14 1.00000 osd.14
15 1.00000 osd.15
16 1.00000 osd.16
18 1.00000 osd.18
19 1.00000 osd.19
21 1.00000 osd.21
23 1.00000 osd.23
24 1.00000 osd.24
25 1.00000 osd.25
26 1.00000 osd.26
-12 10.00000 server ceph-osd-003-sata-p
28 1.00000 osd.28
29 1.00000 osd.29
30 1.00000 osd.30
31 1.00000 osd.31
32 1.00000 osd.32
33 1.00000 osd.33
34 1.00000 osd.34
35 1.00000 osd.35
36 1.00000 osd.36
41 1.00000 osd.41
rule 0 (sata-p), x = 0..1023, numrep = 3..3
rule 0 (sata-p) num_rep 3 result size == 3: 1024/1024
device 0: stored : 95 expected : 102.400009
device 1: stored : 95 expected : 102.400009
device 3: stored : 104 expected : 102.400009
device 4: stored : 95 expected : 102.400009
device 5: stored : 110 expected : 102.400009
device 7: stored : 111 expected : 102.400009
device 9: stored : 106 expected : 102.400009
device 10: stored : 97 expected : 102.400009
device 11: stored : 105 expected : 102.400009
device 13: stored : 106 expected : 102.400009
device 14: stored : 107 expected : 102.400009
device 15: stored : 107 expected : 102.400009
device 16: stored : 101 expected : 102.400009
device 18: stored : 93 expected : 102.400009
device 19: stored : 102 expected : 102.400009
device 21: stored : 112 expected : 102.400009
device 23: stored : 115 expected : 102.400009
device 24: stored : 95 expected : 102.400009
device 25: stored : 98 expected : 102.400009
device 26: stored : 94 expected : 102.400009
device 28: stored : 92 expected : 102.400009
device 29: stored : 87 expected : 102.400009
device 30: stored : 109 expected : 102.400009
device 31: stored : 102 expected : 102.400009
device 32: stored : 116 expected : 102.400009
device 33: stored : 100 expected : 102.400009
device 34: stored : 137 expected : 102.400009
device 35: stored : 86 expected : 102.400009
device 36: stored : 99 expected : 102.400009
device 41: stored : 96 expected : 102.400009
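To get a feel for how uneven this actually is, here is a minimal sketch that summarizes the "stored" column of the output above (it assumes the utilization output was saved to a file called util.txt; the filename is arbitrary):

import re
import statistics

# Collect the per-device "stored" counts from the crushtool --show-utilization output.
with open("util.txt") as f:
    stored = [int(m.group(1)) for m in re.finditer(r"stored\s*:\s*(\d+)", f.read())]

mean = statistics.mean(stored)
spread = statistics.pstdev(stored)

print(f"devices: {len(stored)}")
print(f"mean: {mean:.1f}  stddev: {spread:.1f}")
print(f"min: {min(stored)}  max: {max(stored)}")
print(f"worst deviation from mean: {max(abs(x - mean) for x in stored) / mean:.1%}")

Against the numbers above this prints 30 devices, a mean of 102.4, a standard deviation of about 10, a minimum of 86 and a maximum of 137, i.e. the fullest OSD (osd.34) stores roughly a third more than the average.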
My real CRUSH map is a little more complicated (I have multiple disk types on the same hardware) but the result is the same.
I don't know how to interpret these numbers, or what I can do to fix the imbalance...
Thoughts?