Hi,
I had a functional and operational Ceph cluster (version 0.94.5) with 3 nodes (each acting as MON and OSD), and everything was fine.
I added a 4th OSD node (same configuration as the 3 others) and now the cluster status is HEALTH_WARN (active+remapped).
    cluster e821c68f-995c-41a9-9c46-dbbd0a28b8c7
     health HEALTH_WARN
            256 pgs stuck unclean
            recovery 279/1245 objects degraded (22.410%)
            recovery 415/1245 objects misplaced (33.333%)
            pool rbd pg_num 128 > pgp_num 64
            pool data pg_num 128 > pgp_num 100
     monmap e1: 3 mons at {ceph-osd-1=10.200.1.11:6789/0,ceph-osd-2=10.200.1.12:6789/0,ceph-osd-3=10.200.1.13:6789/0}
            election epoch 4, quorum 0,1,2 ceph-osd-1,ceph-osd-2,ceph-osd-3
     osdmap e57: 8 osds: 8 up, 8 in; 256 remapped pgs
      pgmap v948: 256 pgs, 2 pools, 1566 MB data, 415 objects
            14929 MB used, 38237 MB / 55717 MB avail
            279/1245 objects degraded (22.410%)
            415/1245 objects misplaced (33.333%)
                 256 active+remapped
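For the two "pg_num > pgp_num" warnings, my understanding is that pgp_num just has to be raised to match pg_num (which itself triggers some extra rebalancing), along these lines:

  ceph osd pool set rbd pgp_num 128
  ceph osd pool set data pgp_num 128

It's the 256 stuck unclean PGs that I can't explain.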
OSD Tree
root@ceph-osd-1:~# ceph osd tree
ID WEIGHT  TYPE NAME                      UP/DOWN REWEIGHT PRIMARY-AFFINITY
-8 4.00000 root default
-7 4.00000     region eu-west-1
-5 1.00000         datacenter eu-west-1a
-2 1.00000             host ceph-osd-1
 0 1.00000                 osd.0                  up  1.00000          1.00000
 1 1.00000                 osd.1                  up  1.00000          1.00000
-4 1.00000             host ceph-osd-3
 4 1.00000                 osd.4                  up  1.00000          1.00000
 5 1.00000                 osd.5                  up  1.00000          1.00000
-6 1.00000         datacenter eu-west-1b
-3 1.00000             host ceph-osd-2
 2 1.00000                 osd.2                  up  1.00000          1.00000
 3 1.00000                 osd.3                  up  1.00000          1.00000
-9 1.00000             host ceph-osd-4
 6 1.00000                 osd.6                  up  1.00000          1.00000
 7 1.00000                 osd.7                  up  1.00000          1.00000
root@ceph-osd-1:~#
I'm using this CRUSH map:
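(This is the JSON form, as printed by "ceph osd crush dump".)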
{
    "devices": [
        {
            "id": 0,
            "name": "osd.0"
        },
        {
            "id": 1,
            "name": "osd.1"
        },
        {
            "id": 2,
            "name": "osd.2"
        },
        {
            "id": 3,
            "name": "osd.3"
        },
        {
            "id": 4,
            "name": "osd.4"
        },
        {
            "id": 5,
            "name": "osd.5"
        },
        {
            "id": 6,
            "name": "osd.6"
        },
        {
            "id": 7,
            "name": "osd.7"
        }
    ],
    "types": [
        {
            "type_id": 0,
            "name": "osd"
        },
        {
            "type_id": 1,
            "name": "host"
        },
        {
            "type_id": 2,
            "name": "chassis"
        },
        {
            "type_id": 3,
            "name": "rack"
        },
        {
            "type_id": 4,
            "name": "row"
        },
        {
            "type_id": 5,
            "name": "pdu"
        },
        {
            "type_id": 6,
            "name": "pod"
        },
        {
            "type_id": 7,
            "name": "room"
        },
        {
            "type_id": 8,
            "name": "datacenter"
        },
        {
            "type_id": 9,
            "name": "region"
        },
        {
            "type_id": 10,
            "name": "root"
        }
    ],
    "buckets": [
        {
            "id": -2,
            "name": "ceph-osd-1",
            "type_id": 1,
            "type_name": "host",
            "weight": 131072,
            "alg": "straw",
            "hash": "rjenkins1",
            "items": [
                {
                    "id": 0,
                    "weight": 65536,
                    "pos": 0
                },
                {
                    "id": 1,
                    "weight": 65536,
                    "pos": 1
                }
            ]
        },
        {
            "id": -3,
            "name": "ceph-osd-2",
            "type_id": 1,
            "type_name": "host",
            "weight": 131072,
            "alg": "straw",
            "hash": "rjenkins1",
            "items": [
                {
                    "id": 2,
                    "weight": 65536,
                    "pos": 0
                },
                {
                    "id": 3,
                    "weight": 65536,
                    "pos": 1
                }
            ]
        },
        {
            "id": -4,
            "name": "ceph-osd-3",
            "type_id": 1,
            "type_name": "host",
            "weight": 131072,
            "alg": "straw",
            "hash": "rjenkins1",
            "items": [
                {
                    "id": 4,
                    "weight": 65536,
                    "pos": 0
                },
                {
                    "id": 5,
                    "weight": 65536,
                    "pos": 1
                }
            ]
        },
        {
            "id": -5,
            "name": "eu-west-1a",
            "type_id": 8,
            "type_name": "datacenter",
            "weight": 131072,
            "alg": "straw",
            "hash": "rjenkins1",
            "items": [
                {
                    "id": -2,
                    "weight": 65536,
                    "pos": 0
                },
                {
                    "id": -4,
                    "weight": 65536,
                    "pos": 1
                }
            ]
        },
        {
            "id": -6,
            "name": "eu-west-1b",
            "type_id": 8,
            "type_name": "datacenter",
            "weight": 131072,
            "alg": "straw",
            "hash": "rjenkins1",
            "items": [
                {
                    "id": -3,
                    "weight": 65536,
                    "pos": 0
                },
                {
                    "id": -9,
                    "weight": 65536,
                    "pos": 1
                }
            ]
        },
        {
            "id": -7,
            "name": "eu-west-1",
            "type_id": 9,
            "type_name": "region",
            "weight": 131072,
            "alg": "straw",
            "hash": "rjenkins1",
            "items": [
                {
                    "id": -5,
                    "weight": 65536,
                    "pos": 0
                },
                {
                    "id": -6,
                    "weight": 65536,
                    "pos": 1
                }
            ]
        },
        {
            "id": -8,
            "name": "default",
            "type_id": 10,
            "type_name": "root",
            "weight": 262144,
            "alg": "straw",
            "hash": "rjenkins1",
            "items": [
                {
                    "id": -7,
                    "weight": 262144,
                    "pos": 0
                }
            ]
        },
        {
            "id": -9,
            "name": "ceph-osd-4",
            "type_id": 1,
            "type_name": "host",
            "weight": 131072,
            "alg": "straw",
            "hash": "rjenkins1",
            "items": [
                {
                    "id": 6,
                    "weight": 65536,
                    "pos": 0
                },
                {
                    "id": 7,
                    "weight": 65536,
                    "pos": 1
                }
            ]
        }
    ],
    "rules": [
        {
            "rule_id": 0,
            "rule_name": "replicated_ruleset",
            "ruleset": 0,
            "type": 1,
            "min_size": 1,
            "max_size": 10,
            "steps": [
                {
                    "op": "take",
                    "item": -8,
                    "item_name": "default"
                },
                {
                    "op": "choose_firstn",
                    "num": 0,
                    "type": "datacenter"
                },
                {
                    "op": "chooseleaf_firstn",
                    "num": 1,
                    "type": "host"
                },
                {
                    "op": "emit"
                }
            ]
        }
    ],
    "tunables": {
        "choose_local_tries": 0,
        "choose_local_fallback_tries": 0,
        "choose_total_tries": 50,
        "chooseleaf_descend_once": 1,
        "chooseleaf_vary_r": 1,
        "straw_calc_version": 1,
        "allowed_bucket_algs": 54,
        "profile": "hammer",
        "optimal_tunables": 0,
        "legacy_tunables": 0,
        "require_feature_tunables": 1,
        "require_feature_tunables2": 1,
        "require_feature_tunables3": 1,
        "has_v2_rules": 0,
        "has_v3_rules": 0,
        "has_v4_buckets": 0
    }
}
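In case it's useful, I believe the rule can also be tested offline with crushtool, along these lines (the file name is just an example, and I'm assuming a replicated pool of size 3):

  ceph osd getcrushmap -o crush.bin
  crushtool -i crush.bin --test --rule 0 --num-rep 3 --show-statistics
  crushtool -i crush.bin --test --rule 0 --num-rep 3 --show-bad-mappings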
I read a thread (http://lists.ceph.com/pipermail/ceph-users-ceph.com/2013-November/006017.html) from this mailing list and tried everything suggested there (setting the tunables to optimal, adding more PGs, using the same weight everywhere), but I still have this issue.
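Concretely, that was roughly the following (the pg_num value and the OSD name are just examples):

  ceph osd crush tunables optimal
  ceph osd pool set rbd pg_num 256
  ceph osd pool set rbd pgp_num 256
  ceph osd crush reweight osd.6 1.0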
Do you have any ideas on how to fix this situation?