This happened after some OSDs failed and I recreated them. I ran "ceph osd rm osd.4" to remove osd.4 and osd.6. But when I used ceph-deploy to install a new OSD with "ceph-deploy osd --zap-disk --fs-type btrfs create ceph0x-vm:sdb", ceph-deploy reported that the new OSD was ready, yet the OSD could not start. It reported a ceph-disk failure mentioning /var/lib/ceph/bootstrap-osd/ceph.keyring and an auth error, and I have checked that the ceph.keyring is the same as on the other live OSDs. I ran ceph-deploy twice: the first run created osd.4, which failed but is displayed in the osd tree; then the same happened with osd.6. The next ceph-deploy osd run created osd.10, and this OSD started successfully, but osd.4 and osd.6 are displayed as down in the osd tree. When I run ceph osd reweight-by-utilization, each single run leaves more PGs in active+remapped. Ceph cannot recover by itself, and the CRUSH map tunables are already optimal. I do not know how to solve it. root at ceph-admin:~# ceph osd crush dump { "devices": [ { "id": 0, "name": "osd.0"}, { "id": 1, "name": "osd.1"}, { "id": 2, "name": "osd.2"}, { "id": 3, "name": "osd.3"}, { "id": 4, "name": "device4"}, { "id": 5, "name": "osd.5"}, { "id": 6, "name": "device6"}, { "id": 7, "name": "osd.7"}, { "id": 8, "name": "osd.8"}, { "id": 9, "name": "osd.9"}, { "id": 10, "name": "osd.10"}], "types": [ { "type_id": 0, "name": "osd"}, { "type_id": 1, "name": "host"}, { "type_id": 2, "name": "chassis"}, { "type_id": 3, "name": "rack"}, { "type_id": 4, "name": "row"}, { "type_id": 5, "name": "pdu"}, { "type_id": 6, "name": "pod"}, { "type_id": 7, "name": "room"}, { "type_id": 8, "name": "datacenter"}, { "type_id": 9, "name": "region"}, { "type_id": 10, "name": "root"}], "buckets": [ { "id": -1, "name": "default", "type_id": 10, "type_name": "root", "weight": 302773, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": -2, "weight": 5898, "pos": 0}, { "id": -3, "weight": 5898, "pos": 1}, { "id": -4, "weight": 5898, "pos": 2}, { "id": -5, "weight": 12451, "pos": 3}, { "id": -6, "weight": 13107, "pos": 4}, { "id": -7, "weight": 87162, "pos": 5}, 
{ "id": -8, "weight": 49807, "pos": 6}, { "id": -9, "weight": 116654, "pos": 7}, { "id": -10, "weight": 5898, "pos": 8}]}, { "id": -2, "name": "ceph02-vm", "type_id": 1, "type_name": "host", "weight": 5898, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 0, "weight": 5898, "pos": 0}]}, { "id": -3, "name": "ceph03-vm", "type_id": 1, "type_name": "host", "weight": 5898, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 1, "weight": 5898, "pos": 0}]}, { "id": -4, "name": "ceph01-vm", "type_id": 1, "type_name": "host", "weight": 5898, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 2, "weight": 5898, "pos": 0}]}, { "id": -5, "name": "ceph04-vm", "type_id": 1, "type_name": "host", "weight": 12451, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 8, "weight": 12451, "pos": 0}]}, { "id": -6, "name": "ceph05-vm", "type_id": 1, "type_name": "host", "weight": 13107, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 3, "weight": 13107, "pos": 0}]}, { "id": -7, "name": "ceph06-vm", "type_id": 1, "type_name": "host", "weight": 87162, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 5, "weight": 87162, "pos": 0}]}, { "id": -8, "name": "ceph07-vm", "type_id": 1, "type_name": "host", "weight": 49807, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 9, "weight": 49807, "pos": 0}]}, { "id": -9, "name": "ceph08-vm", "type_id": 1, "type_name": "host", "weight": 116654, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 7, "weight": 116654, "pos": 0}]}, { "id": -10, "name": "ceph09-vm", "type_id": 1, "type_name": "host", "weight": 5898, "alg": "straw", "hash": "rjenkins1", "items": [ { "id": 10, "weight": 5898, "pos": 0}]}], "rules": [ { "rule_id": 0, "rule_name": "replicated_ruleset", "ruleset": 0, "type": 1, "min_size": 1, "max_size": 8, "steps": [ { "op": "take", "item": -1, "item_name": "default"}, { "op": "chooseleaf_firstn", "num": 0, "type": "host"}, { "op": "emit"}]}], "tunables": { "choose_local_tries": 0, 
"choose_local_fallback_tries": 0, "choose_total_tries": 50, "chooseleaf_descend_once": 1, "profile": "firefly", "optimal_tunables": 1, "legacy_tunables": 0, "require_feature_tunables": 1, "require_feature_tunables2": 1}} root at ceph-admin:~# ceph osd tree # id weight type name up/down reweight -1 4.62 root default -2 0.09 host ceph02-vm 0 0.09 osd.0 up 0.2079 -3 0.09 host ceph03-vm 1 0.09 osd.1 up 0.1729 -4 0.09 host ceph01-vm 2 0.09 osd.2 up 0.1192 -5 0.19 host ceph04-vm 8 0.19 osd.8 up 0.1867 -6 0.2 host ceph05-vm 3 0.2 osd.3 up 0.2006 -7 1.33 host ceph06-vm 5 1.33 osd.5 up 1 -8 0.76 host ceph07-vm 9 0.76 osd.9 up 0.6108 -9 1.78 host ceph08-vm 7 1.78 osd.7 up 1 -10 0.09 host ceph09-vm 10 0.09 osd.10 up 1 4 0 osd.4 down 0 6 0 osd.6 down 0 root at ceph-admin:~# ceph -s cluster ae3da4d2-eef0-47cf-a872-24df8f2c8df4 health HEALTH_WARN 85 pgs stuck unclean; recovery 3/17085 objects degraded (0.018%) monmap e12: 2 mons at {ceph01-vm= 192.168.123.251:6789/0,ceph02-vm=192.168.123.252:6789/0}, election epoch 94, quorum 0,1 ceph01-vm,ceph02-vm mdsmap e47: 1/1/1 up {0=ceph01-vm=up:active} osdmap e992: 11 osds: 9 up, 9 in pgmap v34412: 2985 pgs, 21 pools, 22250 MB data, 5695 objects 67696 MB used, 4667 GB / 4740 GB avail 3/17085 objects degraded (0.018%) 85 active+remapped 2900 active+clean root at ceph-admin:~# ceph health detail HEALTH_WARN 85 pgs stuck unclean; recovery 3/17058 objects degraded (0.018%) pg 17.33 is stuck unclean for 36875.635897, current state active+remapped, last acting [5,7,8] pg 25.38 is stuck unclean for 87503.792230, current state active+remapped, last acting [5,7,9] pg 26.3b is stuck unclean for 81404.008815, current state active+remapped, last acting [5,7,9] pg 9.ea is stuck unclean for 36875.593450, current state active+remapped, last acting [5,7,2] pg 0.1f is stuck unclean for 38958.901459, current state active+remapped, last acting [7,5,2] pg 10.14 is stuck unclean for 14509.102941, current state active+remapped, last acting [7,5,8] pg 13.72 is 
stuck unclean for 38960.142441, current state active+remapped, last acting [5,7,3] pg 9.e9 is stuck unclean for 14508.983386, current state active+remapped, last acting [5,7,0] pg 0.1d is stuck unclean for 81404.010614, current state active+remapped, last acting [5,7,0] pg 9.d6 is stuck unclean for 81403.979337, current state active+remapped, last acting [7,5,9] pg 4.78 is stuck unclean for 81403.983261, current state active+remapped, last acting [7,5,9] pg 1.1b is stuck unclean for 38960.179802, current state active+remapped, last acting [5,7,8] pg 26.61 is stuck unclean for 38958.868631, current state active+remapped, last acting [7,5,8] pg 13.14 is stuck unclean for 519010.036121, current state active+remapped, last acting [5,7,8] pg 3.1a is stuck unclean for 81404.013669, current state active+remapped, last acting [7,5,1] pg 25.0 is stuck unclean for 19709.409739, current state active+remapped, last acting [5,7,3] pg 9.132 is stuck unclean for 38958.861392, current state active+remapped, last acting [7,5,8] pg 8.1f is stuck unclean for 36875.641918, current state active+remapped, last acting [5,7,2] pg 4.7c is stuck unclean for 38960.144864, current state active+remapped, last acting [5,7,3] pg 7.11 is stuck unclean for 36874.394976, current state active+remapped, last acting [7,5,2] pg 2.17 is stuck unclean for 38960.181574, current state active+remapped, last acting [5,7,1] pg 13.7b is stuck unclean for 81403.985106, current state active+remapped, last acting [7,5,9] pg 16.65 is stuck unclean for 81403.985193, current state active+remapped, last acting [7,5,9] pg 11.18 is stuck unclean for 81404.015584, current state active+remapped, last acting [7,5,1] pg 3.11 is stuck unclean for 38960.182945, current state active+remapped, last acting [5,7,3] pg 14.7c is stuck unclean for 36875.603800, current state active+remapped, last acting [5,7,3] pg 9.6 is stuck unclean for 38958.909013, current state active+remapped, last acting [7,5,8] pg 15.7f is stuck unclean for 
621594.458504, current state active+remapped, last acting [5,7,8] pg 4.a is stuck unclean for 14509.115738, current state active+remapped, last acting [7,5,1] pg 9.7 is stuck unclean for 81404.016455, current state active+remapped, last acting [5,7,9] pg 9.c6 is stuck unclean for 19709.356082, current state active+remapped, last acting [5,7,3] pg 25.15 is stuck unclean for 81404.019076, current state active+remapped, last acting [7,5,9] pg 10.1 is stuck unclean for 36874.400612, current state active+remapped, last acting [7,5,3] pg 26.10 is stuck unclean for 86063.632237, current state active+remapped, last acting [7,5,0] pg 16.79 is stuck unclean for 19707.589587, current state active+remapped, last acting [7,5,2] pg 7.6e is stuck unclean for 38958.874311, current state active+remapped, last acting [7,5,3] pg 9.e is stuck unclean for 81404.019610, current state active+remapped, last acting [5,7,9] pg 10.62 is stuck unclean for 81403.987697, current state active+remapped, last acting [5,7,1] pg 16.77 is stuck unclean for 38960.150397, current state active+remapped, last acting [5,7,3] pg 10.f is stuck unclean for 36874.403510, current state active+remapped, last acting [7,5,3] pg 9.180 is stuck unclean for 683539.557081, current state active+remapped, last acting [5,7,8] pg 7.3 is stuck unclean for 1499960.402871, current state active+remapped, last acting [7,5,3] pg 16.13 is stuck unclean for 36874.404199, current state active+remapped, last acting [7,5,2] pg 4.7 is stuck unclean for 38958.914074, current state active+remapped, last acting [7,5,8] pg 4.60 is stuck unclean for 36874.359939, current state active+remapped, last acting [7,5,3] pg 24.7b is stuck unclean for 38958.876796, current state active+remapped, last acting [7,5,0] pg 4.65 is stuck unclean for 36874.360956, current state active+remapped, last acting [7,5,8] pg 9.12b is stuck unclean for 81403.978360, current state active+remapped, last acting [7,5,9] pg 14.50 is stuck unclean for 38958.878275, 
current state active+remapped, last acting [7,5,0] pg 9.189 is stuck unclean for 86063.631617, current state active+remapped, last acting [5,7,0] pg 25.42 is stuck unclean since forever, current state active+remapped, last acting [5,7,2] pg 12.57 is stuck unclean for 14509.062855, current state active+remapped, last acting [7,5,0] pg 15.57 is stuck unclean for 38960.155634, current state active+remapped, last acting [5,7,3] pg 9.1d3 is stuck unclean for 14509.081383, current state active+remapped, last acting [7,5,2] pg 9.11e is stuck unclean for 36874.342530, current state active+remapped, last acting [7,5,2] pg 7.52 is stuck unclean for 1307123.508719, current state active+remapped, last acting [7,5,9] pg 8.5b is stuck unclean for 36875.615756, current state active+remapped, last acting [5,7,8] pg 3.50 is stuck unclean for 87503.780959, current state active+remapped, last acting [5,7,3] pg 28.4d is stuck unclean for 38960.158456, current state active+remapped, last acting [5,7,8] pg 9.1da is stuck unclean for 36874.407315, current state active+remapped, last acting [7,5,8] pg 17.5e is stuck unclean for 81403.996288, current state active+remapped, last acting [7,5,9] pg 12.41 is stuck unclean for 473213.328811, current state active+remapped, last acting [7,5,0] pg 12.44 is stuck unclean for 81403.998234, current state active+remapped, last acting [7,5,9] pg 17.59 is stuck unclean for 36874.371587, current state active+remapped, last acting [7,5,3] pg 9.162 is stuck unclean for 81404.025027, current state active+remapped, last acting [5,7,9] pg 7.40 is stuck unclean for 1326816.767489, current state active+remapped, last acting [5,7,0] pg 16.57 is stuck unclean for 38958.886731, current state active+remapped, last acting [7,5,1] pg 15.49 is stuck unclean for 36874.372875, current state active+remapped, last acting [7,5,3] pg 9.108 is stuck unclean for 81403.978180, current state active+remapped, last acting [5,7,9] pg 12.32 is stuck unclean for 81403.999704, 
current state active+remapped, last acting [5,7,9] pg 26.27 is stuck unclean for 86063.613158, current state active+remapped, last acting [5,7,1] pg 27.22 is stuck unclean for 86063.618921, current state active+remapped, last acting [7,5,0] pg 9.1b5 is stuck unclean for 81404.026012, current state active+remapped, last acting [7,5,9] pg 11.33 is stuck unclean for 14509.036315, current state active+remapped, last acting [5,7,1] pg 14.3a is stuck unclean for 87503.851817, current state active+remapped, last acting [7,5,1] pg 7.35 is stuck unclean for 14509.089478, current state active+remapped, last acting [7,5,1] pg 0.2d is stuck unclean for 19707.617077, current state active+remapped, last acting [7,5,2] pg 17.3d is stuck unclean for 38958.896770, current state active+remapped, last acting [7,5,0] pg 26.36 is stuck unclean for 87502.949930, current state active+remapped, last acting [5,7,3] pg 9.1a6 is stuck unclean for 38958.917534, current state active+remapped, last acting [7,5,8] pg 1.29 is stuck unclean for 19709.402325, current state active+remapped, last acting [5,7,3] pg 12.2b is stuck unclean for 19709.402186, current state active+remapped, last acting [5,7,2] pg 9.1a2 is stuck unclean for 87503.804426, current state active+remapped, last acting [5,7,2] pg 9.8e is stuck unclean for 81403.982489, current state active+remapped, last acting [7,5,9] pg 27.38 is stuck unclean for 86063.620507, current state active+remapped, last acting [5,7,0] recovery 3/17058 objects degraded (0.018%) root at ceph-admin:~# ceph pg 27.38 query { "state": "active+remapped", "epoch": 992, "up": [ 5, 7], "acting": [ 5, 7, 0], "actingbackfill": [ "0", "5", "7"], "info": { "pgid": "27.38", "last_update": "0'0", "last_complete": "0'0", "log_tail": "0'0", "last_user_version": 0, "last_backfill": "MAX", "purged_snaps": "[]", "history": { "epoch_created": 465, "last_epoch_started": 854, "last_epoch_clean": 854, "last_epoch_split": 0, "same_up_since": 852, "same_interval_since": 853, 
"same_primary_since": 465, "last_scrub": "0'0", "last_scrub_stamp": "2014-08-18 03:50:58.482069", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2014-08-12 01:17:51.908690", "last_clean_scrub_stamp": "2014-08-18 03:50:58.482069"}, "stats": { "version": "0'0", "reported_seq": "587", "reported_epoch": "992", "state": "active+remapped", "last_fresh": "2014-08-19 03:04:04.900906", "last_change": "2014-08-18 07:48:22.798332", "last_active": "2014-08-19 03:04:04.900906", "last_clean": "2014-08-18 06:31:12.361215", "last_became_active": "0.000000", "last_unstale": "2014-08-19 03:04:04.900906", "mapping_epoch": 852, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 465, "last_epoch_clean": 854, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "2014-08-18 03:50:58.482069", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2014-08-12 01:17:51.908690", "last_clean_scrub_stamp": "2014-08-18 03:50:58.482069", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": "0", "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0}, "stat_cat_sum": {}, "up": [ 5, 7], "acting": [ 5, 7, 0], "up_primary": 5, "acting_primary": 5}, "empty": 1, "dne": 0, "incomplete": 0, "last_epoch_started": 854, "hit_set_history": { "current_last_update": "0'0", "current_last_stamp": "0.000000", "current_info": { "begin": "0.000000", "end": "0.000000", "version": "0'0"}, "history": []}}, "peer_info": [ { "peer": "0", "pgid": "27.38", "last_update": "0'0", "last_complete": "0'0", "log_tail": "0'0", 
"last_user_version": 0, "last_backfill": "MAX", "purged_snaps": "[]", "history": { "epoch_created": 465, "last_epoch_started": 854, "last_epoch_clean": 854, "last_epoch_split": 0, "same_up_since": 852, "same_interval_since": 853, "same_primary_since": 465, "last_scrub": "0'0", "last_scrub_stamp": "2014-08-18 03:50:58.482069", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2014-08-12 01:17:51.908690", "last_clean_scrub_stamp": "2014-08-18 03:50:58.482069"}, "stats": { "version": "0'0", "reported_seq": "388", "reported_epoch": "803", "state": "peering", "last_fresh": "2014-08-18 06:29:20.770740", "last_change": "2014-08-18 06:29:19.709158", "last_active": "2014-08-18 06:07:12.189878", "last_clean": "2014-08-18 06:07:12.189878", "last_became_active": "0.000000", "last_unstale": "2014-08-18 06:29:20.770740", "mapping_epoch": 852, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 465, "last_epoch_clean": 707, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "2014-08-18 03:50:58.482069", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2014-08-12 01:17:51.908690", "last_clean_scrub_stamp": "2014-08-18 03:50:58.482069", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": "0", "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0}, "stat_cat_sum": {}, "up": [ 5, 7], "acting": [ 5, 7, 0], "up_primary": 5, "acting_primary": 5}, "empty": 1, "dne": 0, "incomplete": 0, "last_epoch_started": 854, "hit_set_history": { "current_last_update": "0'0", "current_last_stamp": 
"0.000000", "current_info": { "begin": "0.000000", "end": "0.000000", "version": "0'0"}, "history": []}}, { "peer": "7", "pgid": "27.38", "last_update": "0'0", "last_complete": "0'0", "log_tail": "0'0", "last_user_version": 0, "last_backfill": "MAX", "purged_snaps": "[]", "history": { "epoch_created": 465, "last_epoch_started": 854, "last_epoch_clean": 854, "last_epoch_split": 0, "same_up_since": 852, "same_interval_since": 853, "same_primary_since": 465, "last_scrub": "0'0", "last_scrub_stamp": "2014-08-18 03:50:58.482069", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2014-08-12 01:17:51.908690", "last_clean_scrub_stamp": "2014-08-18 03:50:58.482069"}, "stats": { "version": "0'0", "reported_seq": "388", "reported_epoch": "803", "state": "peering", "last_fresh": "2014-08-18 06:29:20.770740", "last_change": "2014-08-18 06:29:19.709158", "last_active": "2014-08-18 06:07:12.189878", "last_clean": "2014-08-18 06:07:12.189878", "last_became_active": "0.000000", "last_unstale": "2014-08-18 06:29:20.770740", "mapping_epoch": 852, "log_start": "0'0", "ondisk_log_start": "0'0", "created": 465, "last_epoch_clean": 707, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "0'0", "last_scrub_stamp": "2014-08-18 03:50:58.482069", "last_deep_scrub": "0'0", "last_deep_scrub_stamp": "2014-08-12 01:17:51.908690", "last_clean_scrub_stamp": "2014-08-18 03:50:58.482069", "log_size": 0, "ondisk_log_size": 0, "stats_invalid": "0", "stat_sum": { "num_bytes": 0, "num_objects": 0, "num_object_clones": 0, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_unfound": 0, "num_objects_dirty": 0, "num_whiteouts": 0, "num_read": 0, "num_read_kb": 0, "num_write": 0, "num_write_kb": 0, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 0, "num_bytes_recovered": 0, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0}, "stat_cat_sum": {}, "up": [ 5, 7], 
"acting": [ 5, 7, 0], "up_primary": 5, "acting_primary": 5}, "empty": 1, "dne": 0, "incomplete": 0, "last_epoch_started": 854, "hit_set_history": { "current_last_update": "0'0", "current_last_stamp": "0.000000", "current_info": { "begin": "0.000000", "end": "0.000000", "version": "0'0"}, "history": []}}], "recovery_state": [ { "name": "Started\/Primary\/Active", "enter_time": "2014-08-18 07:48:22.681837", "might_have_unfound": [], "recovery_progress": { "backfill_targets": [], "waiting_on_backfill": [], "last_backfill_started": "0\/\/0\/\/-1", "backfill_info": { "begin": "0\/\/0\/\/-1", "end": "0\/\/0\/\/-1", "objects": []}, "peer_backfill_info": [], "backfills_in_flight": [], "recovering": [], "pg_backend": { "pull_from_peer": [], "pushing": []}}, "scrub": { "scrubber.epoch_start": "562", "scrubber.active": 0, "scrubber.block_writes": 0, "scrubber.finalizing": 0, "scrubber.waiting_on": 0, "scrubber.waiting_on_whom": []}}, { "name": "Started", "enter_time": "2014-08-18 07:48:21.508853"}], "agent_state": {}} -------------- next part -------------- An HTML attachment was scrubbed... URL: <http://lists.ceph.com/pipermail/ceph-users-ceph.com/attachments/20140819/c6f0e834/attachment.htm>