cannot revert lost objects

khoran@xxxxxxxxxxxxxxxxxxxx (kevin horan) · Thu, 01 May 2014 10:11:19 -0700

     I have an issue very similar to the one described in this thread:
http://article.gmane.org/gmane.comp.file-systems.ceph.user/3197. I have
19 unfound objects that are part of a VM image that I have already
recovered from backup. If I query pg 4.30 (the one with the unfound
objects), it says it is still querying osd.8, looking for the unfound
objects. Because of this, when I run:

# ceph pg 4.30 mark_unfound_lost revert
Error EINVAL: pg has 19 unfound objects but we haven't probed all 
sources, not marking lost

The command refuses to revert them. The PG has been "querying" osd.8 for
almost 2 days now, and there is only 200 GB on that OSD, so I don't see
why it would take so long. So how can I force it to either stop querying
or revert the unfound objects?

Here is how I got into this state. I have only 6 OSDs total, 3 on one
host (vashti) and 3 on another (zadok). I set the noout flag so I could
reboot zadok. Zadok was down for 2 minutes. When it came back up, Ceph
began recovering the objects that had not been replicated yet. Before
recovery finished, osd.6, on vashti, died (IO errors on disk, whole drive
unrecoverable). Since osd.6 had objects that had not yet had a chance
to replicate to any OSD on zadok, they were lost. I cannot recover
anything further from osd.6.

Here is the output of "ceph pg 4.30 query":

{ "state": "active+recovering+degraded+remapped",
   "epoch": 20364,
   "up": [
         2,
         0],
   "acting": [
         1,
         2],
   "info": { "pgid": "4.30",
       "last_update": "20364'10377395",
       "last_complete": "0'0",
       "log_tail": "20161'10325373",
       "last_user_version": 10377395,
       "last_backfill": "MAX",
       "purged_snaps": "[1~7,10~4]",
       "history": { "epoch_created": 386,
           "last_epoch_started": 20323,
           "last_epoch_clean": 20161,
           "last_epoch_split": 0,
           "same_up_since": 20322,
           "same_interval_since": 20322,
           "same_primary_since": 20311,
           "last_scrub": "20118'10315975",
           "last_scrub_stamp": "2014-04-29 11:54:57.358096",
           "last_deep_scrub": "20050'10061396",
           "last_deep_scrub_stamp": "2014-04-24 11:39:40.313745",
           "last_clean_scrub_stamp": "2014-04-29 11:54:57.358096"},
       "stats": { "version": "20364'10377395",
           "reported_seq": "17957416",
           "reported_epoch": "20364",
           "state": "active+recovering+degraded+remapped",
           "last_fresh": "2014-05-01 10:00:51.210564",
           "last_change": "2014-05-01 09:03:31.708198",
           "last_active": "2014-05-01 10:00:51.210564",
           "last_clean": "2014-04-29 16:14:12.127562",
           "last_became_active": "0.000000",
           "last_unstale": "2014-05-01 10:00:51.210564",
           "mapping_epoch": 20317,
           "log_start": "20161'10325373",
           "ondisk_log_start": "20161'10325373",
           "created": 386,
           "last_epoch_clean": 20161,
           "parent": "0.0",
           "parent_split_bits": 0,
           "last_scrub": "20118'10315975",
           "last_scrub_stamp": "2014-04-29 11:54:57.358096",
           "last_deep_scrub": "20050'10061396",
           "last_deep_scrub_stamp": "2014-04-24 11:39:40.313745",
           "last_clean_scrub_stamp": "2014-04-29 11:54:57.358096",
           "log_size": 52022,
           "ondisk_log_size": 52022,
           "stats_invalid": "0",
           "stat_sum": { "num_bytes": 9078859264,
               "num_objects": 2598,
               "num_object_clones": 360,
               "num_object_copies": 0,
               "num_objects_missing_on_primary": 0,
               "num_objects_degraded": 0,
               "num_objects_unfound": 0,
               "num_read": 703887,
               "num_read_kb": 164523202,
               "num_write": 8785487,
               "num_write_kb": 69327327,
               "num_scrub_errors": 0,
               "num_shallow_scrub_errors": 0,
               "num_deep_scrub_errors": 0,
               "num_objects_recovered": 24428,
               "num_bytes_recovered": 93261249024,
               "num_keys_recovered": 0},
           "stat_cat_sum": {},
           "up": [
                 2,
                 0],
           "acting": [
                 1,
                 2]},
       "empty": 0,
       "dne": 0,
       "incomplete": 0,
       "last_epoch_started": 20323},
   "recovery_state": [
         { "name": "Started\/Primary\/Active",
           "enter_time": "2014-05-01 09:03:30.557244",
           "might_have_unfound": [
                 { "osd": 0,
                   "status": "already probed"},
                 { "osd": 2,
                   "status": "already probed"},
                 { "osd": 6,
                   "status": "osd is down"},
                 { "osd": 8,
                   "status": "querying"}],
           "recovery_progress": { "backfill_target": 2,
               "waiting_on_backfill": 0,
               "last_backfill_started": "0\/\/0\/\/-1",
               "backfill_info": { "begin": "0\/\/0\/\/-1",
                   "end": "0\/\/0\/\/-1",
                   "objects": []},
               "peer_backfill_info": { "begin": "0\/\/0\/\/-1",
                   "end": "0\/\/0\/\/-1",
                   "objects": []},
               "backfills_in_flight": [],
               "recovering": [],
               "pg_backend": { "pull_from_peer": [],
                   "pushing": []}},
           "scrub": { "scrubber.epoch_start": "0",
               "scrubber.active": 0,
               "scrubber.block_writes": 0,
               "scrubber.finalizing": 0,
               "scrubber.waiting_on": 0,
               "scrubber.waiting_on_whom": []}},
         { "name": "Started",
           "enter_time": "2014-05-01 09:03:29.347540"}]}

Thanks.
Kevin