ceph pg ls | grep 11.182

11.182  10  25  35  0  25  34648064  1306  1306  active+recovery_wait+undersized+degraded  2019-02-04 09:23:26.461468  70238'1306  70673:24924  [64]  64  [64]  64  46843'56759413  2019-01-26 16:31:32.607109  46843'56628962  2019-01-24 08:56:59.228615

root@storage-node-1-l3:~# ceph pg 11.182 query
{
    "state": "active+recovery_wait+undersized+degraded",
    "snap_trimq": "[1~b]",
    "snap_trimq_len": 11,
    "epoch": 70673,
    "up": [ 64 ],
    "acting": [ 64 ],
    "actingbackfill": [ "64" ],
    "info": {
        "pgid": "11.182",
        "last_update": "70238'1306",
        "last_complete": "46843'56787837",
        "log_tail": "0'0",
        "last_user_version": 1301,
        "last_backfill": "MAX",
        "last_backfill_bitwise": 0,
        "purged_snaps": [],
        "history": {
            "epoch_created": 54817,
            "epoch_pool_created": 278,
            "last_epoch_started": 70656,
            "last_interval_started": 70655,
            "last_epoch_clean": 67924,
            "last_interval_clean": 54687,
            "last_epoch_split": 54817,
            "last_epoch_marked_full": 0,
            "same_up_since": 70655,
            "same_interval_since": 70655,
            "same_primary_since": 70655,
            "last_scrub": "46843'56759413",
            "last_scrub_stamp": "2019-01-26 16:31:32.607109",
            "last_deep_scrub": "46843'56628962",
            "last_deep_scrub_stamp": "2019-01-24 08:56:59.228615",
            "last_clean_scrub_stamp": "2019-01-26 16:31:32.607109"
        },
        "stats": {
            "version": "70238'1306",
            "reported_seq": "24940",
            "reported_epoch": "70673",
            "state": "active+recovery_wait+undersized+degraded",
            "last_fresh": "2019-02-04 09:25:56.966952",
            "last_change": "2019-02-04 09:25:56.966952",
            "last_active": "2019-02-04 09:25:56.966952",
            "last_peered": "2019-02-04 09:25:56.966952",
            "last_clean": "0.000000",
            "last_became_active": "2019-02-04 07:57:08.769839",
            "last_became_peered": "2019-02-04 07:57:08.769839",
            "last_unstale": "2019-02-04 09:25:56.966952",
            "last_undegraded": "2019-02-04 07:57:08.762164",
            "last_fullsized": "2019-02-04 07:57:08.761962",
            "mapping_epoch": 70655,
            "log_start": "0'0",
            "ondisk_log_start": "0'0",
            "created": 54817,
            "last_epoch_clean": 67924,
            "parent": "0.0",
            "parent_split_bits": 0,
            "last_scrub": "46843'56759413",
            "last_scrub_stamp": "2019-01-26 16:31:32.607109",
            "last_deep_scrub": "46843'56628962",
            "last_deep_scrub_stamp": "2019-01-24 08:56:59.228615",
            "last_clean_scrub_stamp": "2019-01-26 16:31:32.607109",
            "log_size": 1306,
            "ondisk_log_size": 1306,
            "stats_invalid": false,
            "dirty_stats_invalid": false,
            "omap_stats_invalid": false,
            "hitset_stats_invalid": false,
            "hitset_bytes_stats_invalid": false,
            "pin_stats_invalid": false,
            "snaptrimq_len": 11,
            "stat_sum": {
                "num_bytes": 34648064,
                "num_objects": 10,
                "num_object_clones": 0,
                "num_object_copies": 20,
                "num_objects_missing_on_primary": 25,
                "num_objects_missing": 0,
                "num_objects_degraded": 35,
                "num_objects_misplaced": 0,
                "num_objects_unfound": 25,
                "num_objects_dirty": 10,
                "num_whiteouts": 0,
                "num_read": 1274,
                "num_read_kb": 33808,
                "num_write": 1388,
                "num_write_kb": 42956,
                "num_scrub_errors": 0,
                "num_shallow_scrub_errors": 0,
                "num_deep_scrub_errors": 0,
                "num_objects_recovered": 0,
                "num_bytes_recovered": 0,
                "num_keys_recovered": 0,
                "num_objects_omap": 0,
                "num_objects_hit_set_archive": 0,
                "num_bytes_hit_set_archive": 0,
                "num_flush": 0,
                "num_flush_kb": 0,
                "num_evict": 0,
                "num_evict_kb": 0,
                "num_promote": 0,
                "num_flush_mode_high": 0,
                "num_flush_mode_low": 0,
                "num_evict_mode_some": 0,
                "num_evict_mode_full": 0,
                "num_objects_pinned": 0,
                "num_legacy_snapsets": 0
            },
            "up": [ 64 ],
            "acting": [ 64 ],
            "blocked_by": [],
            "up_primary": 64,
            "acting_primary": 64
        },
        "empty": 0,
        "dne": 0,
        "incomplete": 0,
        "last_epoch_started": 70656,
        "hit_set_history": {
            "current_last_update": "0'0",
"history": [] } }, "peer_info": [], "recovery_state": [ { "name": "Started/Primary/Active", "enter_time": "2019-02-04 07:57:08.762037", "might_have_unfound": [ { "osd": "9", "status": "osd is down" }, { "osd": "29", "status": "osd is down" }, { "osd": "49", "status": "osd is down" }, { "osd": "51", "status": "osd is down" }, { "osd": "63", "status": "osd is down" }, { "osd": "92", "status": "osd is down" } ], "recovery_progress": { "backfill_targets": [], "waiting_on_backfill": [], "last_backfill_started": "MIN", "backfill_info": { "begin": "MIN", "end": "MIN", "objects": [] }, "peer_backfill_info": [], "backfills_in_flight": [], "recovering": [], "pg_backend": { "pull_from_peer": [], "pushing": [] } }, "scrub": { "scrubber.epoch_start": "0", "scrubber.active": false, "scrubber.state": "INACTIVE", "scrubber.start": "MIN", "scrubber.end": "MIN", "scrubber.subset_last_update": "0'0", "scrubber.deep": false, "scrubber.seed": 0, "scrubber.waiting_on": 0, "scrubber.waiting_on_whom": [] } }, { "name": "Started", "enter_time": "2019-02-04 07:57:08.220064" } ], "agent_state": {} } For 11.ac i will try wath you propose and keep you informed but i am a litle bit anxious to lose another healthy osd. Kr ________________________________________ From: Sage Weil <sage@xxxxxxxxxxxx> Sent: 04 February 2019 09:20 To: Philippe Van Hecke Cc: ceph-users@xxxxxxxxxxxxxx; Belnet Services Subject: Re: Luminous cluster in very bad state need some assistance. On Mon, 4 Feb 2019, Philippe Van Hecke wrote: > So i restarted the osd but he stop after some time. But this is an effect on the cluster and cluster is on a partial recovery process. > > please find here log file of osd 49 after this restart > https://filesender.belnet.be/?s=download&token=8c9c39f2-36f6-43f7-bebb-175679d27a22 It's the same PG 11.182 hitting the same assert when it tries to recover to that OSD. I think the problem will go away once there has been some write traffic, but it may be tricky to prevent it from doing any recovery until then. I just noticed you pasted the wrong 'pg ls' result before: > > result of ceph pg ls | grep 11.118 > > > > 11.118 9788 0 0 0 0 40817837568 1584 1584 active+clean 2019-02-01 12:48:41.343228 70238'19811673 70493:34596887 [121,24] 121 [121,24] 121 69295'19811665 2019-02-01 12:48:41.343144 66131'19810044 2019-01-30 11:44:36.006505 What does 11.182 look like? We can try something slighty different. From before it looked like your only 'incomplete' pg was 11.ac (ceph pg ls incomplete), and the needed state is either on osd.49 or osd.63. On osd.49, do ceph-objectstore-tool --op export on that pg, and then find an otherwise healthy OSD (that doesn't have 11.ac), stop it, and ceph-objectstore-tool --op import it there. When you start it up, 11.ac will hopefull peer and recover. (Or, alternatively, osd.63 may have the needed state.) sage _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com