You can run 'ceph pg 0.cfa mark_unfound_lost revert'. (Revert Lost section of http://ceph.com/docs/master/rados/operations/placement-groups/). -Sam On Tue, Aug 13, 2013 at 6:50 AM, Jens-Christian Fischer <jens-christian.fischer@xxxxxxxxx> wrote: > We have a cluster with 10 servers, 64 OSDs and 5 Mons on them. The OSDs are > 3TB disk, formatted with btrfs and the servers are either on Ubuntu 12.10 or > 13.04. > > Recently one of the servers (13.04) stood still (due to problems with btrfs > - something we have seen a few times). I decided to not try to recover the > disks, but reformat them with XFS. I removed the OSDs, reformatted, and > re-created them (they got the same OSD numbers) > > I redid this twice (because I wrongly partioned the disks in the first > place) and I ended up with 2 unfound "pieces" in one pg: > > root@s2:~# ceph health details > HEALTH_WARN 1 pgs degraded; 1 pgs recovering; 1 pgs stuck unclean; recovery > 4448/28915270 degraded (0.015%); 2/9854766 unfound (0.000%) > pg 0.cfa is stuck unclean for 1004252.309704, current state > active+recovering+degraded+remapped, last acting [23,50] > pg 0.cfa is active+recovering+degraded+remapped, acting [23,50], 2 unfound > recovery 4448/28915270 degraded (0.015%); 2/9854766 unfound (0.000%) > > > root@s2:~# ceph pg 0.cfa query > > { "state": "active+recovering+degraded+remapped", > "epoch": 28197, > "up": [ > 23, > 50, > 18], > "acting": [ > 23, > 50], > "info": { "pgid": "0.cfa", > "last_update": "28082'7774", > "last_complete": "23686'7083", > "log_tail": "14360'4061", > "last_backfill": "MAX", > "purged_snaps": "[]", > "history": { "epoch_created": 1, > "last_epoch_started": 28197, > "last_epoch_clean": 24810, > "last_epoch_split": 0, > "same_up_since": 28195, > "same_interval_since": 28196, > "same_primary_since": 26036, > "last_scrub": "20585'6801", > "last_scrub_stamp": "2013-07-28 15:40:53.298786", > "last_deep_scrub": "20585'6801", > "last_deep_scrub_stamp": "2013-07-28 15:40:53.298786", > "last_clean_scrub_stamp": "2013-07-28 15:40:53.298786"}, > "stats": { "version": "28082'7774", > "reported": "28197'41950", > "state": "active+recovering+degraded+remapped", > "last_fresh": "2013-08-13 14:34:33.057271", > "last_change": "2013-08-13 14:34:33.057271", > "last_active": "2013-08-13 14:34:33.057271", > "last_clean": "2013-08-01 23:50:18.414082", > "last_became_active": "2013-05-29 13:10:51.366237", > "last_unstale": "2013-08-13 14:34:33.057271", > "mapping_epoch": 28195, > "log_start": "14360'4061", > "ondisk_log_start": "14360'4061", > "created": 1, > "last_epoch_clean": 24810, > "parent": "0.0", > "parent_split_bits": 0, > "last_scrub": "20585'6801", > "last_scrub_stamp": "2013-07-28 15:40:53.298786", > "last_deep_scrub": "20585'6801", > "last_deep_scrub_stamp": "2013-07-28 15:40:53.298786", > "last_clean_scrub_stamp": "2013-07-28 15:40:53.298786", > "log_size": 0, > "ondisk_log_size": 0, > "stats_invalid": "0", > "stat_sum": { "num_bytes": 145307402, > "num_objects": 2234, > "num_object_clones": 0, > "num_object_copies": 0, > "num_objects_missing_on_primary": 0, > "num_objects_degraded": 0, > "num_objects_unfound": 0, > "num_read": 744, > "num_read_kb": 410184, > "num_write": 7774, > "num_write_kb": 1155438, > "num_scrub_errors": 0, > "num_shallow_scrub_errors": 0, > "num_deep_scrub_errors": 0, > "num_objects_recovered": 3998, > "num_bytes_recovered": 278803622, > "num_keys_recovered": 0}, > "stat_cat_sum": {}, > "up": [ > 23, > 50, > 18], > "acting": [ > 23, > 50]}, > "empty": 0, > "dne": 0, > "incomplete": 0, > "last_epoch_started": 28197}, > "recovery_state": [ > { "name": "Started\/Primary\/Active", > "enter_time": "2013-08-13 14:34:33.026698", > "might_have_unfound": [ > { "osd": 9, > "status": "querying"}, > { "osd": 18, > "status": "querying"}, > { "osd": 50, > "status": "already probed"}], > "recovery_progress": { "backfill_target": 50, > "waiting_on_backfill": 0, > "backfill_pos": "96220cfa\/10000799e82.00000000\/head\/\/0", > "backfill_info": { "begin": "0\/\/0\/\/-1", > "end": "0\/\/0\/\/-1", > "objects": []}, > "peer_backfill_info": { "begin": "0\/\/0\/\/-1", > "end": "0\/\/0\/\/-1", > "objects": []}, > "backfills_in_flight": [], > "pull_from_peer": [], > "pushing": []}, > "scrub": { "scrubber.epoch_start": "0", > "scrubber.active": 0, > "scrubber.block_writes": 0, > "scrubber.finalizing": 0, > "scrubber.waiting_on": 0, > "scrubber.waiting_on_whom": []}}, > { "name": "Started", > "enter_time": "2013-08-13 14:34:32.024282"}]} > > I have tried to mark those two pieces as lost, but ceph wouldn't let me (due > to the fact that it is still in querying state on osd 9 and 18). I have > restarted the OSDs, but I can't force any other status change. > > What next? Take the OSDs (9, 18) out again and rebuilding? > > thanks for your help > Jens-Christian > > > -- > SWITCH > Jens-Christian Fischer, Peta Solutions > Werdstrasse 2, P.O. Box, 8021 Zurich, Switzerland > phone +41 44 268 15 15, direct +41 44 268 15 71 > jens-christian.fischer@xxxxxxxxx > http://www.switch.ch > > http://www.switch.ch/socialmedia > > > _______________________________________________ > ceph-users mailing list > ceph-users@xxxxxxxxxxxxxx > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com > _______________________________________________ ceph-users mailing list ceph-users@xxxxxxxxxxxxxx http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com