Hello, I've lost a complete server and an additional disk on another server overnight. I was able to fix everything except one PG that stays incomplete. I wasn't able to rescue the single failed disk. I already tried to get the cluster running again by marking the failed OSD as lost. Some questions I haven't found answers to so far: - How do I get the cluster back to a healthy state? - Do I have to do a "ceph pg force_create_pg"? I guess I would lose the complete PG. - Can I check which RBD images are affected by that PG loss? Thanks for your help. The current status is: sudo ceph health detail --------------------------------------------------------------------- pg 3.8 is stuck inactive for 56202.938866, current state incomplete, last acting [13,25,5] pg 3.8 is stuck unclean for 56659.090971, current state incomplete, last acting [13,25,5] pg 3.8 is incomplete, acting [13,25,5] 5 ops are blocked > 16777.2 sec 3 ops are blocked > 4194.3 sec 5 ops are blocked > 16777.2 sec on osd.13 3 ops are blocked > 4194.3 sec on osd.13 1 osds have slow requests --------------------------------------------------------------------- sudo ceph pg 3.8 query --------------------------------------------------------------------- { "state": "incomplete", "epoch": 6410, "up": [ 13, 25, 5], "acting": [ 13, 25, 5], "info": { "pgid": "3.8", "last_update": "5069'94208", "last_complete": "5069'94208", "log_tail": "4978'91205", "last_user_version": 94208, "last_backfill": "MAX", "purged_snaps": 
"[1~7b1,7b4~1e,7d3~1d,7f1~1d,80f~1d,82d~1,82f~1,831~1,833~1,835~1,837~1,839~1,83b~1,83d~1,83f~1,841~1,843~1,845~1,847~1,849~1,84b~1,84d~1,84f~1,851~1,853~1,855~1,857~1,859~1,85b~1,85d~1,85f~1,861~1,863~1,865~1,867~1,86c~1,86e~1,870~1,872~1,874~1,876~1,878~1,87a~1,87c~1,87e~1,880~1,882~1,884~1,886~1,888~1,88a~1,88c~1,88e~1,890~1,892~1,894~1,896~1,898~1,89a~1,89c~1,89e~1,8a0~1,8a2~1,8a4~1,8a6~1,8a8~1,8aa~1,8ac~1,8ae~1,8b0~1,8b2~1,8b4~1,8b6~1,8b8~1,8ba~1,8bc~1,8be~1,8c0~1,8c2~1,8c4~1,8c8~1,8ca~1,8cc~1,8ce~1,8d0~1,8d2~1,8d4~1,8d6~1,8d8~1,8da~1,8dc~1,8de~1,8e0~1,8e2~1,8e4~1,8e6~1,8e8~1,8ea~1,8ec~1,8ee~1,8f0~1,8f2~1,8f4~1,8f6~1,8f8~1,8fa~1,8fc~1,8fe~1,900~1,902~1,906~2,909~2,90c~2,90f~2,912~2,915~2,918~2,91b~2,91e~2,921~2,924~2,927~2,92a~2,92d~2,931~1,933~1,935~1,937~1,939~1,93b~1,93d~1,93f~1,941~1,943~1,945~1,947~1,949~1,94b~1]", "history": { "epoch_created": 188, "last_epoch_started": 5185, "last_epoch_clean": 5066, "last_epoch_split": 0, "same_up_since": 6403, "same_interval_since": 6403, "same_primary_since": 6186, "last_scrub": "5052'93452", "last_scrub_stamp": "2014-03-07 15:40:52.774102", "last_deep_scrub": "4908'85840", "last_deep_scrub_stamp": "2014-03-02 15:37:23.975966", "last_clean_scrub_stamp": "2014-03-07 15:40:52.774102"}, "stats": { "version": "5069'94208", "reported_seq": "151527", "reported_epoch": "6410", "state": "incomplete", "last_fresh": "2014-03-08 15:16:54.912909", "last_change": "2014-03-08 15:16:54.912909", "last_active": "2014-03-07 23:57:03.807203", "last_clean": "2014-03-07 23:49:27.655151", "last_became_active": "0.000000", "last_unstale": "2014-03-08 15:16:54.912909", "mapping_epoch": 6395, "log_start": "4978'91205", "ondisk_log_start": "4978'91205", "created": 188, "last_epoch_clean": 5066, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "5052'93452", "last_scrub_stamp": "2014-03-07 15:40:52.774102", "last_deep_scrub": "4908'85840", "last_deep_scrub_stamp": "2014-03-02 15:37:23.975966", "last_clean_scrub_stamp": "2014-03-07 
15:40:52.774102", "log_size": 3003, "ondisk_log_size": 3003, "stats_invalid": "0", "stat_sum": { "num_bytes": 608240640, "num_objects": 186, "num_object_clones": 46, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_unfound": 0, "num_read": 17342, "num_read_kb": 2457859, "num_write": 92022, "num_write_kb": 1244712, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 472, "num_bytes_recovered": 1693462528, "num_keys_recovered": 0}, "stat_cat_sum": {}, "up": [ 13, 25, 5], "acting": [ 13, 25, 5]}, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 5081}, "recovery_state": [ { "name": "Started\/Primary\/Peering", "enter_time": "2014-03-08 15:16:54.912687", "past_intervals": [ { "first": 5060, "last": 5071, "maybe_went_rw": 1, "up": [ 13, 0], "acting": [ 13, 0]}, { "first": 5072, "last": 5074, "maybe_went_rw": 1, "up": [ 13, 5], "acting": [ 13, 5]}, { "first": 5075, "last": 5082, "maybe_went_rw": 1, "up": [ 13, 5], "acting": [ 13, 5, 0]}, { "first": 5083, "last": 5084, "maybe_went_rw": 1, "up": [ 5], "acting": [ 5, 0]}, { "first": 5085, "last": 5112, "maybe_went_rw": 1, "up": [ 5], "acting": [ 0, 5]}, { "first": 5113, "last": 5118, "maybe_went_rw": 0, "up": [ 23, 5], "acting": [ 5]}, { "first": 5119, "last": 5148, "maybe_went_rw": 1, "up": [ 23, 5], "acting": [ 23, 5]}, { "first": 5149, "last": 5169, "maybe_went_rw": 1, "up": [ 23], "acting": [ 23]}, { "first": 5170, "last": 5183, "maybe_went_rw": 1, "up": [ 23, 5], "acting": [ 23, 5]}, { "first": 5184, "last": 5229, "maybe_went_rw": 1, "up": [ 23, 5], "acting": [ 0, 23]}, { "first": 5230, "last": 5245, "maybe_went_rw": 1, "up": [ 23, 5], "acting": [ 23]}, { "first": 5246, "last": 5259, "maybe_went_rw": 1, "up": [ 23], "acting": [ 23]}, { "first": 5260, "last": 5266, "maybe_went_rw": 0, "up": [ 23, 5], "acting": [ 23]}, { "first": 5267, "last": 5287, "maybe_went_rw": 1, "up": [ 23, 5], "acting": [ 23, 5]}, 
{ "first": 5288, "last": 5305, "maybe_went_rw": 1, "up": [ 23], "acting": [ 23]}, { "first": 5306, "last": 5404, "maybe_went_rw": 1, "up": [ 23, 5], "acting": [ 23, 5]}, { "first": 5405, "last": 5417, "maybe_went_rw": 1, "up": [ 5], "acting": [ 5]}, { "first": 5418, "last": 5698, "maybe_went_rw": 1, "up": [ 23, 5], "acting": [ 23, 5]}, { "first": 5699, "last": 5710, "maybe_went_rw": 1, "up": [ 5], "acting": [ 5]}, { "first": 5711, "last": 6051, "maybe_went_rw": 1, "up": [ 23, 5], "acting": [ 23, 5]}, { "first": 6052, "last": 6053, "maybe_went_rw": 1, "up": [ 23, 35, 5], "acting": [ 23, 35, 5]}, { "first": 6054, "last": 6054, "maybe_went_rw": 0, "up": [ 23, 28, 5], "acting": [ 23, 28, 5]}, { "first": 6055, "last": 6057, "maybe_went_rw": 1, "up": [ 23, 26, 5], "acting": [ 23, 26, 5]}, { "first": 6058, "last": 6185, "maybe_went_rw": 1, "up": [ 23, 25, 5], "acting": [ 23, 25, 5]}, { "first": 6186, "last": 6341, "maybe_went_rw": 1, "up": [ 13, 25, 5], "acting": [ 13, 25, 5]}, { "first": 6342, "last": 6344, "maybe_went_rw": 1, "up": [ 13, 25], "acting": [ 13, 25]}, { "first": 6345, "last": 6359, "maybe_went_rw": 1, "up": [ 13, 25, 5], "acting": [ 13, 25, 5]}, { "first": 6360, "last": 6368, "maybe_went_rw": 1, "up": [ 13, 25], "acting": [ 13, 25]}, { "first": 6369, "last": 6394, "maybe_went_rw": 1, "up": [ 13, 25, 5], "acting": [ 13, 25, 5]}, { "first": 6395, "last": 6402, "maybe_went_rw": 1, "up": [ 13, 25], "acting": [ 13, 25]}], "probing_osds": [ 5, 13, 23, 25, 26, 35], "down_osds_we_would_probe": [ 0], "peering_blocked_by": []}, { "name": "Started", "enter_time": "2014-03-08 15:16:54.912635"}]} --------------------------------------------------------------------- -- Mit freundlichen Grüßen Stefan Schwarz UNITED COLO GmbH Sonntagsanger 1 96450 Coburg Germany Tel : ++49 9561 8711 - 45 Fax : ++49 9561 8711 - 46 Mail: S.Schwarz@xxxxxxxxxxxxx URL: http://www.unitedcolo.de Geschäftsführer: Timo Rolle, Registergericht: Amtsgericht Coburg HR 3928, Steuernummer: DE240512157 
_______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com