Fixing mark_unfound_lost revert failure

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Craig,

I'll try that, thanks for the hint :-)

Cheers

On 03/09/2014 19:53, Craig Lewis wrote:
> The only way I've been able to solve this is to recreate the OSDs that Ceph wants to probe.  It doesn't have to have anything on it, it's probably better if it doesn't.  Even ceph osd lost 2 won't help; Ceph won't mark the data lost until it's exhausted all possibilities.
> 
> Any reason you can't get OSD.2 back up?  Last time I had this problem, I had an OSD that would crash 30 seconds after start.  I formatted the disk, then re-created it with the same OSD ID.  The probe completed within a few minutes.
> 
> 
> 
> 
> On Tue, Sep 2, 2014 at 8:38 AM, Loic Dachary <loic at dachary.org <mailto:loic at dachary.org>> wrote:
> 
>     FYI it is a known issue : http://tracker.ceph.com/issues/6109
> 
>     On 01/09/2014 00:02, Loic Dachary wrote:
>     > Hi Ceph,
>     >
>     > In a mixed dumpling / emperor cluster, because osd 2 has been removed but is still in
>     >
>     >           "might_have_unfound": [
>     >                 { "osd": 2,
>     >                   "status": "osd is down"},
>     >                 { "osd": 6,
>     >                   "status": "already probed"}],
>     >
>     > and because of that mark_unfound_lost fails with
>     >
>     > # ceph pg 4.46 mark_unfound_lost revert
>     > Error EINVAL: pg has 1 objects but we haven't probed all sources, not marking lost
>     >
>     > What would be the recommended way to fix this ?
>     >
>     > FWIW the missing object is an XFS read error
>     >
>     > # cp '/var/lib/ceph/osd/ceph-2/current/4.46_head/DIR_6/DIR_C/DIR_D/rbd\udata.9ad9d26b8b4567.00000000000007b1__head_0BC0BDC6__4' .
>     > cp: reading `/var/lib/ceph/osd/ceph-2/current/4.46_head/DIR_6/DIR_C/DIR_D/rbd\\udata.9ad9d26b8b4567.00000000000007b1__head_0BC0BDC6__4': Input/output error
>     >
>     > that is not caught by xfs_repair and I expect the older version of the object on the remaining OSD to be OK.
>     >
>     > Cheers
>     >
>     > osd 6 is running
>     > # ceph --version
>     > ceph version 0.72.2 (a913ded2ff138aefb8cb84d347d72164099cfd60)
>     >
>     > and
>     > osd 1 is running
>     > # ceph --version
>     > ceph version 0.67.4 (ad85b8bfafea6232d64cb7ba76a8b6e8252fa0c7)
>     >
>     > # ceph pg 4.46 mark_unfound_lost revert
>     > Error EINVAL: pg has 1 objects but we haven't probed all sources, not marking lost
>     > # ceph pg 4.46 list_missing
>     > { "offset": { "oid": "",
>     >       "key": "",
>     >       "snapid": 0,
>     >       "hash": 0,
>     >       "max": 0,
>     >       "pool": -1,
>     >       "namespace": ""},
>     >   "num_missing": 1,
>     >   "num_unfound": 1,
>     >   "objects": [
>     >         { "oid": { "oid": "rbd_data.9ad9d26b8b4567.00000000000007b1",
>     >               "key": "",
>     >               "snapid": -2,
>     >               "hash": 197180870,
>     >               "max": 0,
>     >               "pool": 4,
>     >               "namespace": ""},
>     >           "need": "328685'1233912",
>     >           "have": "328683'1233904",
>     >           "locations": []}],
>     >   "more": 0}
>     > # ceph pg 4.46 query
>     > { "state": "active+recovering+degraded+remapped",
>     >   "epoch": 346424,
>     >   "up": [
>     >         6,
>     >         1],
>     >   "acting": [
>     >         1,
>     >         6],
>     >   "info": { "pgid": "4.46",
>     >       "last_update": "346424'1288927",
>     >       "last_complete": "0'0",
>     >       "log_tail": "328683'1233911",
>     >       "last_backfill": "MAX",
>     >       "purged_snaps": "[1~3]",
>     >       "history": { "epoch_created": 195,
>     >           "last_epoch_started": 346424,
>     >           "last_epoch_clean": 328685,
>     >           "last_epoch_split": 0,
>     >           "same_up_since": 346423,
>     >           "same_interval_since": 346423,
>     >           "same_primary_since": 346423,
>     >           "last_scrub": "328664'1230185",
>     >           "last_scrub_stamp": "2014-08-23 09:34:01.524854",
>     >           "last_deep_scrub": "328604'1208887",
>     >           "last_deep_scrub_stamp": "2014-08-20 09:33:19.073523",
>     >           "last_clean_scrub_stamp": "2014-08-23 09:34:01.524854"},
>     >       "stats": { "version": "346424'1288927",
>     >           "reported_seq": "2553234",
>     >           "reported_epoch": "346424",
>     >           "state": "active+recovering+degraded+remapped",
>     >           "last_fresh": "2014-08-31 23:47:49.866548",
>     >           "last_change": "2014-08-31 23:44:22.571492",
>     >           "last_active": "2014-08-31 23:47:49.866548",
>     >           "last_clean": "2014-08-23 22:27:23.391412",
>     >           "last_became_active": "0.000000",
>     >           "last_unstale": "2014-08-31 23:47:49.866548",
>     >           "mapping_epoch": 346421,
>     >           "log_start": "328683'1233911",
>     >           "ondisk_log_start": "328683'1233911",
>     >           "created": 195,
>     >           "last_epoch_clean": 328685,
>     >           "parent": "0.0",
>     >           "parent_split_bits": 0,
>     >           "last_scrub": "328664'1230185",
>     >           "last_scrub_stamp": "2014-08-23 09:34:01.524854",
>     >           "last_deep_scrub": "328604'1208887",
>     >           "last_deep_scrub_stamp": "2014-08-20 09:33:19.073523",
>     >           "last_clean_scrub_stamp": "2014-08-23 09:34:01.524854",
>     >           "log_size": 55016,
>     >           "ondisk_log_size": 55016,
>     >           "stats_invalid": "0",
>     >           "stat_sum": { "num_bytes": 12584300544,
>     >               "num_objects": 3035,
>     >               "num_object_clones": 1,
>     >               "num_object_copies": 0,
>     >               "num_objects_missing_on_primary": 0,
>     >               "num_objects_degraded": 0,
>     >               "num_objects_unfound": 0,
>     >               "num_read": 51123,
>     >               "num_read_kb": 1525186,
>     >               "num_write": 1288927,
>     >               "num_write_kb": 19076876,
>     >               "num_scrub_errors": 0,
>     >               "num_shallow_scrub_errors": 0,
>     >               "num_deep_scrub_errors": 0,
>     >               "num_objects_recovered": 20047,
>     >               "num_bytes_recovered": 78532055040,
>     >               "num_keys_recovered": 0},
>     >           "stat_cat_sum": {},
>     >           "up": [
>     >                 6,
>     >                 1],
>     >           "acting": [
>     >                 1,
>     >                 6]},
>     >       "empty": 0,
>     >       "dne": 0,
>     >       "incomplete": 0,
>     >       "last_epoch_started": 346424},
>     >   "recovery_state": [
>     >         { "name": "Started\/Primary\/Active",
>     >           "enter_time": "2014-08-31 23:44:22.435483",
>     >           "might_have_unfound": [
>     >                 { "osd": 2,
>     >                   "status": "osd is down"},
>     >                 { "osd": 6,
>     >                   "status": "already probed"}],
>     >           "recovery_progress": { "backfill_target": 6,
>     >               "waiting_on_backfill": 0,
>     >               "backfill_pos": "0\/\/0\/\/-1",
>     >               "backfill_info": { "begin": "0\/\/0\/\/-1",
>     >                   "end": "0\/\/0\/\/-1",
>     >                   "objects": []},
>     >               "peer_backfill_info": { "begin": "0\/\/0\/\/-1",
>     >                   "end": "0\/\/0\/\/-1",
>     >                   "objects": []},
>     >               "backfills_in_flight": [],
>     >               "pull_from_peer": [],
>     >               "pushing": []},
>     >           "scrub": { "scrubber.epoch_start": "0",
>     >               "scrubber.active": 0,
>     >               "scrubber.block_writes": 0,
>     >               "scrubber.finalizing": 0,
>     >               "scrubber.waiting_on": 0,
>     >               "scrubber.waiting_on_whom": []}},
>     >         { "name": "Started",
>     >           "enter_time": "2014-08-31 23:44:21.177460"}]}
>     >
> 
>     --
>     Loïc Dachary, Artisan Logiciel Libre
> 
> 
>     _______________________________________________
>     ceph-users mailing list
>     ceph-users at lists.ceph.com <mailto:ceph-users at lists.ceph.com>
>     http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
> 
> 

-- 
Loïc Dachary, Artisan Logiciel Libre

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 263 bytes
Desc: OpenPGP digital signature
URL: <http://lists.ceph.com/pipermail/ceph-users-ceph.com/attachments/20140903/b815a215/attachment.pgp>


[Index of Archives]     [Information on CEPH]     [Linux Filesystem Development]     [Ceph Development]     [Ceph Large]     [Ceph Dev]     [Linux USB Development]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [xfs]


  Powered by Linux