I have an issue very similar to the one in this thread: http://article.gmane.org/gmane.comp.file-systems.ceph.user/3197. I have 19 unfound objects that are part of a VM image that I have already recovered from backup. If I query pg 4.30 (the one with the unfound objects), it says it is still querying osd.8, looking for the unfound objects. Because of this, when I run "ceph pg 4.30 mark_unfound_lost revert", it refuses to remove them and fails with: "Error EINVAL: pg has 19 unfound objects but we haven't probed all sources, not marking lost". It has been "querying" osd.8 for almost 2 days now, and there is only 200GB on it, so I don't see why it would take so long. So how can I force it to either stop querying, or revert the unfound objects? Here is how I got into this state. I have only 6 OSDs total, 3 on one host (vashti) and 3 on another (zadok). I set the noout flag so I could reboot zadok. Zadok was down for 2 minutes. When it came back up, ceph began recovering the objects that had not been replicated yet. Before recovery finished, osd.6, on vashti, died (IO errors on disk, whole drive unrecoverable). Since osd.6 had objects that had not yet had a chance to replicate to any OSD on zadok, they were lost. I cannot recover anything further from osd.6.
Here is the output of "ceph pg 4.30 query": { "state": "active+recovering+degraded+remapped", "epoch": 20364, "up": [ 2, 0], "acting": [ 1, 2], "info": { "pgid": "4.30", "last_update": "20364'10377395", "last_complete": "0'0", "log_tail": "20161'10325373", "last_user_version": 10377395, "last_backfill": "MAX", "purged_snaps": "[1~7,10~4]", "history": { "epoch_created": 386, "last_epoch_started": 20323, "last_epoch_clean": 20161, "last_epoch_split": 0, "same_up_since": 20322, "same_interval_since": 20322, "same_primary_since": 20311, "last_scrub": "20118'10315975", "last_scrub_stamp": "2014-04-29 11:54:57.358096", "last_deep_scrub": "20050'10061396", "last_deep_scrub_stamp": "2014-04-24 11:39:40.313745", "last_clean_scrub_stamp": "2014-04-29 11:54:57.358096"}, "stats": { "version": "20364'10377395", "reported_seq": "17957416", "reported_epoch": "20364", "state": "active+recovering+degraded+remapped", "last_fresh": "2014-05-01 10:00:51.210564", "last_change": "2014-05-01 09:03:31.708198", "last_active": "2014-05-01 10:00:51.210564", "last_clean": "2014-04-29 16:14:12.127562", "last_became_active": "0.000000", "last_unstale": "2014-05-01 10:00:51.210564", "mapping_epoch": 20317, "log_start": "20161'10325373", "ondisk_log_start": "20161'10325373", "created": 386, "last_epoch_clean": 20161, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "20118'10315975", "last_scrub_stamp": "2014-04-29 11:54:57.358096", "last_deep_scrub": "20050'10061396", "last_deep_scrub_stamp": "2014-04-24 11:39:40.313745", "last_clean_scrub_stamp": "2014-04-29 11:54:57.358096", "log_size": 52022, "ondisk_log_size": 52022, "stats_invalid": "0", "stat_sum": { "num_bytes": 9078859264, "num_objects": 2598, "num_object_clones": 360, "num_object_copies": 0, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_unfound": 0, "num_read": 703887, "num_read_kb": 164523202, "num_write": 8785487, "num_write_kb": 69327327, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, 
"num_deep_scrub_errors": 0, "num_objects_recovered": 24428, "num_bytes_recovered": 93261249024, "num_keys_recovered": 0}, "stat_cat_sum": {}, "up": [ 2, 0], "acting": [ 1, 2]}, "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 20323}, "recovery_state": [ { "name": "Started\/Primary\/Active", "enter_time": "2014-05-01 09:03:30.557244", "might_have_unfound": [ { "osd": 0, "status": "already probed"}, { "osd": 2, "status": "already probed"}, { "osd": 6, "status": "osd is down"}, { "osd": 8, "status": "querying"}], "recovery_progress": { "backfill_target": 2, "waiting_on_backfill": 0, "last_backfill_started": "0\/\/0\/\/-1", "backfill_info": { "begin": "0\/\/0\/\/-1", "end": "0\/\/0\/\/-1", "objects": []}, "peer_backfill_info": { "begin": "0\/\/0\/\/-1", "end": "0\/\/0\/\/-1", "objects": []}, "backfills_in_flight": [], "recovering": [], "pg_backend": { "pull_from_peer": [], "pushing": []}}, "scrub": { "scrubber.epoch_start": "0", "scrubber.active": 0, "scrubber.block_writes": 0, "scrubber.finalizing": 0, "scrubber.waiting_on": 0, "scrubber.waiting_on_whom": []}}, { "name": "Started", "enter_time": "2014-05-01 09:03:29.347540"}]} Thanks. Kevin