Thanks in advance for your help and time.
Ceph 0.67.4.
It's been a long battle, and I don't really know what I'm doing (first Ceph cluster... ever), so here goes.
I've been testing with btrfs and have had lots of problems with it, but Ceph stayed resilient and kept working (I still think it's great).
Everything was running smoothly until one day... here is the play-by-play:
- had osd.0 (btrfs) and osd.1 (btrfs)
- added a new osd.2 (xfs)
- during the rebalance, rebooted osd.0
- osd.1 self-imploded :( (btrfs totally corrupted; it's gone, all gone)
- osd.0 came back online (it took 20 minutes to come back up; btrfs runs terribly for me)
- 4 PGs stuck; did lots of googling
- tried "ceph pg {pg-id} mark_unfound_lost revert"; it says there are no missing objects
- tried "ceph osd lost {OSD ID}"
- rebuilt osd.1 on XFS
- rebalanced...
- current setup at this point: osd.0 (btrfs), osd.1 (xfs), osd.2 (xfs); the PGs are still stuck
- rebooted osd.1 and osd.2 (read somewhere that can help)
- shut down osd.0, planning to move it to XFS (I'm done with btrfs)
- pg query shows osd.0 in "down_osds_we_would_probe", so I tried marking osd.0 as lost
- the PGs are still stuck (the exact commands I ran are summarized right below)
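For reference, here is roughly what I ran against one of the stuck PGs (using 4.1ca from the pg query quoted below as the example, and osd.0 as the lost OSD); the last two lines are only my guess at how to dig up the osdmap epoch you asked about, so please tell me if there is a better way to check:

    ceph pg 4.1ca query                        # full output quoted below
    ceph pg 4.1ca mark_unfound_lost revert     # replies that there are no missing/unfound objects
    ceph osd lost 0 --yes-i-really-mean-it     # accepted, but the PG stays down+incomplete

    # my guess at answering the osdmap-epoch question:
    ceph osd dump | grep '^osd\.0'             # current map
    ceph osd dump 7972 | grep '^osd\.0'        # an older epoch taken from the pg query intervals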
From: "Gregory Farnum" <greg@xxxxxxxxxxx>
To: "Bryan Morris" <bryan@xxxxxxxxxxxxxx>
Cc: ceph-users@xxxxxxxxxxxxxx
Sent: Tuesday, January 7, 2014 3:50:55 PM
Subject: Re: 4 PGs stuck inactive
Oh, sorry, you did do that. Hrm.
What osdmap epoch did your lost node (0, I assume) disappear in? What
version of Ceph are you running? That pg stat isn't making a lot of
sense to me.
Software Engineer #42 @ http://inktank.com | http://ceph.com
On Tue, Jan 7, 2014 at 2:45 PM, Gregory Farnum <greg@xxxxxxxxxxx> wrote:
> Assuming the one who lost its filesystem is totally gone, mark it
> lost. That will tell the OSDs to give up on whatever data it might
> have had and you should be good to go (modulo whatever data you might
> have lost from only having it on the dead OSD during the reboot).
> -Greg
> Software Engineer #42 @ http://inktank.com | http://ceph.com
>
>
> On Tue, Jan 7, 2014 at 1:32 PM, Bryan Morris <bryan@xxxxxxxxxxxxxx> wrote:
>>
>> I have 4 PGs stuck inactive.....
>>
>> I have tried everything I can find online, and could use some help.
>>
>> I had 3 OSDs... I was in the process of rebooting one when another totally crashed
>> and corrupted its filesystem (BAD).
>>
>> So now I have 4 incomplete PGs.
>>
>> Things I've tried:
>>
>> Rebooting all OSDs.
>> Running 'ceph pg {pg-id} mark_unfound_lost revert'
>> Running 'ceph osd lost {OSD ID}'
>>
>>
>> Here is a PG query from one of the 4 PGs
>>
>>
>>
>> [[{ "state": "down+incomplete",
>> "epoch": 7994,
>> "up": [
>> 1,
>> 2],
>> "acting": [
>> 1,
>> 2],
>> "info": { "pgid": "4.1ca",
>> "last_update": "0'0",
>> "last_complete": "0'0",
>> "log_tail": "0'0",
>> "last_backfill": "MAX",
>> "purged_snaps": "[]",
>> "history": { "epoch_created": 75,
>> "last_epoch_started": 6719,
>> "last_epoch_clean": 6710,
>> "last_epoch_split": 0,
>> "same_up_since": 7973,
>> "same_interval_since": 7973,
>> "same_primary_since": 7969,
>> "last_scrub": "6239'25314",
>> "last_scrub_stamp": "2014-01-05 18:48:01.122717",
>> "last_deep_scrub": "6235'25313",
>> "last_deep_scrub_stamp": "2014-01-03 18:47:48.390112",
>> "last_clean_scrub_stamp": "2014-01-05 18:48:01.122717"},
>> "stats": { "version": "0'0",
>> "reported_seq": "1259",
>> "reported_epoch": "7994",
>> "state": "down+incomplete",
>> "last_fresh": "2014-01-07 14:04:36.269587",
>> "last_change": "2014-01-07 14:04:36.269587",
>> "last_active": "0.000000",
>> "last_clean": "0.000000",
>> "last_became_active": "0.000000",
>> "last_unstale": "2014-01-07 14:04:36.269587",
>> "mapping_epoch": 7971,
>> "log_start": "0'0",
>> "ondisk_log_start": "0'0",
>> "created": 75,
>> "last_epoch_clean": 6710,
>> "parent": "0.0",
>> "parent_split_bits": 0,
>> "last_scrub": "6239'25314",
>> "last_scrub_stamp": "2014-01-05 18:48:01.122717",
>> "last_deep_scrub": "6235'25313",
>> "last_deep_scrub_stamp": "2014-01-03 18:47:48.390112",
>> "last_clean_scrub_stamp": "2014-01-05 18:48:01.122717",
>> "log_size": 0,
>> "ondisk_log_size": 0,
>> "stats_invalid": "0",
>> "stat_sum": { "num_bytes": 0,
>> "num_objects": 0,
>> "num_object_clones": 0,
>> "num_object_copies": 0,
>> "num_objects_missing_on_primary": 0,
>> "num_objects_degraded": 0,
>> "num_objects_unfound": 0,
>> "num_read": 0,
>> "num_read_kb": 0,
>> "num_write": 0,
>> "num_write_kb": 0,
>> "num_scrub_errors": 0,
>> "num_shallow_scrub_errors": 0,
>> "num_deep_scrub_errors": 0,
>> "num_objects_recovered": 0,
>> "num_bytes_recovered": 0,
>> "num_keys_recovered": 0},
>> "stat_cat_sum": {},
>> "up": [
>> 1,
>> 2],
>> "acting": [
>> 1,
>> 2]},
>> "empty": 1,
>> "dne": 0,
>> "incomplete": 0,
>> "last_epoch_started": 0},
>> "recovery_state": [
>> { "name": "Started\/Primary\/Peering",
>> "enter_time": "2014-01-07 14:04:36.269509",
>> "past_intervals": [
>> { "first": 5221,
>> "last": 5226,
>> "maybe_went_rw": 1,
>> "up": [
>> 0,
>> 1],
>> "acting": [
>> 0,
>> 1]},
>> { "first": 5227,
>> "last": 5439,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 5440,
>> "last": 5441,
>> "maybe_went_rw": 0,
>> "up": [
>> 1],
>> "acting": [
>> 1]},
>> { "first": 5442,
>> "last": 6388,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 6389,
>> "last": 6395,
>> "maybe_went_rw": 1,
>> "up": [
>> 0],
>> "acting": [
>> 0]},
>> { "first": 6396,
>> "last": 6708,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 6709,
>> "last": 6717,
>> "maybe_went_rw": 1,
>> "up": [
>> 1],
>> "acting": [
>> 1]},
>> { "first": 6718,
>> "last": 6721,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 2],
>> "acting": [
>> 1,
>> 2]},
>> { "first": 6722,
>> "last": 6724,
>> "maybe_went_rw": 1,
>> "up": [
>> 2],
>> "acting": [
>> 2]},
>> { "first": 6725,
>> "last": 6737,
>> "maybe_went_rw": 1,
>> "up": [
>> 0,
>> 2],
>> "acting": [
>> 0,
>> 2]},
>> { "first": 6738,
>> "last": 7946,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7947,
>> "last": 7948,
>> "maybe_went_rw": 1,
>> "up": [
>> 1],
>> "acting": [
>> 1]},
>> { "first": 7949,
>> "last": 7950,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7951,
>> "last": 7952,
>> "maybe_went_rw": 1,
>> "up": [
>> 0],
>> "acting": [
>> 0]},
>> { "first": 7953,
>> "last": 7954,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7955,
>> "last": 7956,
>> "maybe_went_rw": 1,
>> "up": [
>> 1],
>> "acting": [
>> 1]},
>> { "first": 7957,
>> "last": 7958,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 2],
>> "acting": [
>> 1,
>> 2]},
>> { "first": 7959,
>> "last": 7966,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7967,
>> "last": 7968,
>> "maybe_went_rw": 1,
>> "up": [
>> 0],
>> "acting": [
>> 0]},
>> { "first": 7969,
>> "last": 7970,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7971,
>> "last": 7972,
>> "maybe_went_rw": 1,
>> "up": [
>> 1],
>> "acting": [
>> 1]}],
>> "probing_osds": [
>> 1,
>> 2],
>> "down_osds_we_would_probe": [
>> 0],
>> "peering_blocked_by": []},
>> { "name": "Started",
>> "enter_time": "2014-01-07 14:04:36.269462"}]}
>>
>>
>>
>> CEPH OSD TREE
>>
>> ceph osd tree
>> # id   weight  type name      up/down  reweight
>> -1     4.5     pool default
>> -4     1.8        host ceph2
>> 1      1.8           osd.1    up       1
>> -6     2.7        host ceph3
>> 2      2.7           osd.2    up       1
>>
>>
>>
>> _______________________________________________
>> ceph-users mailing list
>> ceph-users@xxxxxxxxxxxxxx
>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>>
To: "Bryan Morris" <bryan@xxxxxxxxxxxxxx>
Cc: ceph-users@xxxxxxxxxxxxxx
Sent: Tuesday, January 7, 2014 3:50:55 PM
Subject: Re: 4 PGs stuck inactive
version of Ceph are you running? That pg stat isn't making a lot of
sense to me.
Software Engineer #42 @ http://inktank.com | http://ceph.com
On Tue, Jan 7, 2014 at 2:45 PM, Gregory Farnum <greg@xxxxxxxxxxx> wrote:
> Assuming the one who lost its filesystem is totally gone, mark it
> lost. That will tell the OSDs to give up on whatever data it might
> have had and you should be good to go (modulo whatever data you might
> have lost from only having it on the dead OSD during the reboot).
> -Greg
> Software Engineer #42 @ http://inktank.com | http://ceph.com
>
>
> On Tue, Jan 7, 2014 at 1:32 PM, Bryan Morris <bryan@xxxxxxxxxxxxxx> wrote:
>>
>> I have 4 PGs stuck inactive.....
>>
>> I have tried everything I can find online, and could use some help.
>>
>> I had 3 OSDs... was in process of rebooting one and another totally crashed
>> and corrupted its filesystem (BAD).
>>
>> So now I have 4 incomplete PGs.
>>
>> Things I've tried:
>>
>> Rebooting all OSDs.
>> Running 'ceph pg {pg-id} mark_unfound_lost revert'
>> Running 'ceph osd lost {OSD ID}'
>>
>>
>> Here is a PG query from one of the 4 PGs
>>
>>
>>
>> [[{ "state": "down+incomplete",
>> "epoch": 7994,
>> "up": [
>> 1,
>> 2],
>> "acting": [
>> 1,
>> 2],
>> "info": { "pgid": "4.1ca",
>> "last_update": "0'0",
>> "last_complete": "0'0",
>> "log_tail": "0'0",
>> "last_backfill": "MAX",
>> "purged_snaps": "[]",
>> "history": { "epoch_created": 75,
>> "last_epoch_started": 6719,
>> "last_epoch_clean": 6710,
>> "last_epoch_split": 0,
>> "same_up_since": 7973,
>> "same_interval_since": 7973,
>> "same_primary_since": 7969,
>> "last_scrub": "6239'25314",
>> "last_scrub_stamp": "2014-01-05 18:48:01.122717",
>> "last_deep_scrub": "6235'25313",
>> "last_deep_scrub_stamp": "2014-01-03 18:47:48.390112",
>> "last_clean_scrub_stamp": "2014-01-05 18:48:01.122717"},
>> "stats": { "version": "0'0",
>> "reported_seq": "1259",
>> "reported_epoch": "7994",
>> "state": "down+incomplete",
>> "last_fresh": "2014-01-07 14:04:36.269587",
>> "last_change": "2014-01-07 14:04:36.269587",
>> "last_active": "0.000000",
>> "last_clean": "0.000000",
>> "last_became_active": "0.000000",
>> "last_unstale": "2014-01-07 14:04:36.269587",
>> "mapping_epoch": 7971,
>> "log_start": "0'0",
>> "ondisk_log_start": "0'0",
>> "created": 75,
>> "last_epoch_clean": 6710,
>> "parent": "0.0",
>> "parent_split_bits": 0,
>> "last_scrub": "6239'25314",
>> "last_scrub_stamp": "2014-01-05 18:48:01.122717",
>> "last_deep_scrub": "6235'25313",
>> "last_deep_scrub_stamp": "2014-01-03 18:47:48.390112",
>> "last_clean_scrub_stamp": "2014-01-05 18:48:01.122717",
>> "log_size": 0,
>> "ondisk_log_size": 0,
>> "stats_invalid": "0",
>> "stat_sum": { "num_bytes": 0,
>> "num_objects": 0,
>> "num_object_clones": 0,
>> "num_object_copies": 0,
>> "num_objects_missing_on_primary": 0,
>> "num_objects_degraded": 0,
>> "num_objects_unfound": 0,
>> "num_read": 0,
>> "num_read_kb": 0,
>> "num_write": 0,
>> "num_write_kb": 0,
>> "num_scrub_errors": 0,
>> "num_shallow_scrub_errors": 0,
>> "num_deep_scrub_errors": 0,
>> "num_objects_recovered": 0,
>> "num_bytes_recovered": 0,
>> "num_keys_recovered": 0},
>> "stat_cat_sum": {},
>> "up": [
>> 1,
>> 2],
>> "acting": [
>> 1,
>> 2]},
>> "empty": 1,
>> "dne": 0,
>> "incomplete": 0,
>> "last_epoch_started": 0},
>> "recovery_state": [
>> { "name": "Started\/Primary\/Peering",
>> "enter_time": "2014-01-07 14:04:36.269509",
>> "past_intervals": [
>> { "first": 5221,
>> "last": 5226,
>> "maybe_went_rw": 1,
>> "up": [
>> 0,
>> 1],
>> "acting": [
>> 0,
>> 1]},
>> { "first": 5227,
>> "last": 5439,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 5440,
>> "last": 5441,
>> "maybe_went_rw": 0,
>> "up": [
>> 1],
>> "acting": [
>> 1]},
>> { "first": 5442,
>> "last": 6388,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 6389,
>> "last": 6395,
>> "maybe_went_rw": 1,
>> "up": [
>> 0],
>> "acting": [
>> 0]},
>> { "first": 6396,
>> "last": 6708,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 6709,
>> "last": 6717,
>> "maybe_went_rw": 1,
>> "up": [
>> 1],
>> "acting": [
>> 1]},
>> { "first": 6718,
>> "last": 6721,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 2],
>> "acting": [
>> 1,
>> 2]},
>> { "first": 6722,
>> "last": 6724,
>> "maybe_went_rw": 1,
>> "up": [
>> 2],
>> "acting": [
>> 2]},
>> { "first": 6725,
>> "last": 6737,
>> "maybe_went_rw": 1,
>> "up": [
>> 0,
>> 2],
>> "acting": [
>> 0,
>> 2]},
>> { "first": 6738,
>> "last": 7946,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7947,
>> "last": 7948,
>> "maybe_went_rw": 1,
>> "up": [
>> 1],
>> "acting": [
>> 1]},
>> { "first": 7949,
>> "last": 7950,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7951,
>> "last": 7952,
>> "maybe_went_rw": 1,
>> "up": [
>> 0],
>> "acting": [
>> 0]},
>> { "first": 7953,
>> "last": 7954,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7955,
>> "last": 7956,
>> "maybe_went_rw": 1,
>> "up": [
>> 1],
>> "acting": [
>> 1]},
>> { "first": 7957,
>> "last": 7958,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 2],
>> "acting": [
>> 1,
>> 2]},
>> { "first": 7959,
>> "last": 7966,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7967,
>> "last": 7968,
>> "maybe_went_rw": 1,
>> "up": [
>> 0],
>> "acting": [
>> 0]},
>> { "first": 7969,
>> "last": 7970,
>> "maybe_went_rw": 1,
>> "up": [
>> 1,
>> 0],
>> "acting": [
>> 1,
>> 0]},
>> { "first": 7971,
>> "last": 7972,
>> "maybe_went_rw": 1,
>> "up": [
>> 1],
>> "acting": [
>> 1]}],
>> "probing_osds": [
>> 1,
>> 2],
>> "down_osds_we_would_probe": [
>> 0],
>> "peering_blocked_by": []},
>> { "name": "Started",
>> "enter_time": "2014-01-07 14:04:36.269462"}]}
>>
>>
>>
>> CEPH OSD TREE
>>
>> ceph osd tree
>> # id weight type name up/down reweight
>> -1 4.5 pool default
>> -4 1.8 host ceph2
>> 1 1.8 osd.1 up 1
>> -6 2.7 host ceph3
>> 2 2.7 osd.2 up 1
>>
>>
>>
>> _______________________________________________
>> ceph-users mailing list
>> ceph-users@xxxxxxxxxxxxxx
>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>>