Farther back in the log file, before it starts reporting the missing objects, I see these messages:
osd.33 pg_epoch: 256084 pg[8.484(unlocked)] enter Initial
osd.33 pg_epoch: 256084 pg[8.484(unlocked)] enter NotTrimming
It looks like it is processing placement group "8.484" on osd.33. But when I run "ceph pg 8.484 query", it looks like that placement group is on two other OSDs, [13,44], and is in a healthy state. Is this just a lingering copy of the PG that needs to be manually removed?
# ceph pg 8.484 query
{ "state": "active+clean",
  "snap_trimq": "[]",
  "epoch": 256850,
  "up": [ 13, 44 ],
  "acting": [ 13, 44 ],
  "actingbackfill": [ "13", "44" ],
  "info": { "pgid": "8.484", "last_update": "256816'16486", "last_complete": "256816'16486", "log_tail": "254158'13420", "last_user_version": 16486, "last_backfill": "MAX", "purged_snaps": "[1~3a,44~1,46~1,4d~a,58~4,73~30,ba~11]",
    "history": { "epoch_created": 250794, "last_epoch_started": 256849, "last_epoch_clean": 256849, "last_epoch_split": 0, "same_up_since": 256848, "same_interval_since": 256848, "same_primary_since": 256848, "last_scrub": "254220'13929", "last_scrub_stamp": "2015-10-09 20:18:15.856071", "last_deep_scrub": "251785'8629", "last_deep_scrub_stamp": "2015-10-04 10:33:37.209878", "last_clean_scrub_stamp": "2015-10-09 20:18:15.856071" },
    "stats": { "version": "256816'16486", "reported_seq": "67588", "reported_epoch": "256850", "state": "active+clean", "last_fresh": "2015-10-11 17:09:46.972382", "last_change": "2015-10-11 16:30:32.646205", "last_active": "2015-10-11 17:09:46.972382", "last_peered": "2015-10-11 17:09:46.972382", "last_clean": "2015-10-11 17:09:46.972382", "last_became_active": "0.000000", "last_became_peered": "0.000000", "last_unstale": "2015-10-11 17:09:46.972382", "last_undegraded": "2015-10-11 17:09:46.972382", "last_fullsized": "2015-10-11 17:09:46.972382", "mapping_epoch": 256846, "log_start": "254158'13420", "ondisk_log_start": "254158'13420", "created": 250794, "last_epoch_clean": 256849, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "254220'13929", "last_scrub_stamp": "2015-10-09 20:18:15.856071", "last_deep_scrub": "251785'8629", "last_deep_scrub_stamp": "2015-10-04 10:33:37.209878", "last_clean_scrub_stamp": "2015-10-09 20:18:15.856071", "log_size": 3066, "ondisk_log_size": 3066, "stats_invalid": "0",
      "stat_sum": { "num_bytes": 7075272704, "num_objects": 1709, "num_object_clones": 582, "num_object_copies": 3418, "num_objects_missing_on_primary": 0, "num_objects_degraded": 0, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 1709, "num_whiteouts": 0, "num_read": 92000, "num_read_kb": 17658096, "num_write": 23198, "num_write_kb": 8868482, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 3928, "num_bytes_recovered": 16447209472, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0 },
      "up": [ 13, 44 ], "acting": [ 13, 44 ], "blocked_by": [], "up_primary": 13, "acting_primary": 13 },
    "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 256849,
    "hit_set_history": { "current_last_update": "0'0", "current_last_stamp": "0.000000", "current_info": { "begin": "0.000000", "end": "0.000000", "version": "0'0" }, "history": [] } },
  "peer_info": [ { "peer": "44", "pgid": "8.484", "last_update": "256816'16486", "last_complete": "256816'16486", "log_tail": "254158'13420", "last_user_version": 16486, "last_backfill": "MAX", "purged_snaps": "[1~3a,44~1,46~1,4d~a,58~4,73~30,ba~11]",
    "history": { "epoch_created": 250794, "last_epoch_started": 256849, "last_epoch_clean": 256849, "last_epoch_split": 0, "same_up_since": 256848, "same_interval_since": 256848, "same_primary_since": 256848, "last_scrub": "254220'13929", "last_scrub_stamp": "2015-10-09 20:18:15.856071", "last_deep_scrub": "251785'8629", "last_deep_scrub_stamp": "2015-10-04 10:33:37.209878", "last_clean_scrub_stamp": "2015-10-09 20:18:15.856071" },
    "stats": { "version": "256816'16486", "reported_seq": "67580", "reported_epoch": "256848", "state": "active+undersized+degraded", "last_fresh": "2015-10-11 16:30:10.656475", "last_change": "2015-10-11 16:30:10.655808", "last_active": "2015-10-11 16:30:10.656475", "last_peered": "2015-10-11 16:30:10.656475", "last_clean": "2015-10-11 11:21:32.645164", "last_became_active": "0.000000", "last_became_peered": "0.000000", "last_unstale": "2015-10-11 16:30:10.656475", "last_undegraded": "2015-10-11 16:30:06.790836", "last_fullsized": "2015-10-11 16:30:06.790836", "mapping_epoch": 256846, "log_start": "254158'13420", "ondisk_log_start": "254158'13420", "created": 250794, "last_epoch_clean": 256847, "parent": "0.0", "parent_split_bits": 0, "last_scrub": "254220'13929", "last_scrub_stamp": "2015-10-09 20:18:15.856071", "last_deep_scrub": "251785'8629", "last_deep_scrub_stamp": "2015-10-04 10:33:37.209878", "last_clean_scrub_stamp": "2015-10-09 20:18:15.856071", "log_size": 3066, "ondisk_log_size": 3066, "stats_invalid": "0",
      "stat_sum": { "num_bytes": 7075272704, "num_objects": 1709, "num_object_clones": 582, "num_object_copies": 3418, "num_objects_missing_on_primary": 0, "num_objects_degraded": 1709, "num_objects_misplaced": 0, "num_objects_unfound": 0, "num_objects_dirty": 1709, "num_whiteouts": 0, "num_read": 92000, "num_read_kb": 17658096, "num_write": 23198, "num_write_kb": 8868482, "num_scrub_errors": 0, "num_shallow_scrub_errors": 0, "num_deep_scrub_errors": 0, "num_objects_recovered": 3928, "num_bytes_recovered": 16447209472, "num_keys_recovered": 0, "num_objects_omap": 0, "num_objects_hit_set_archive": 0, "num_bytes_hit_set_archive": 0 },
      "up": [ 13, 44 ], "acting": [ 13, 44 ], "blocked_by": [], "up_primary": 13, "acting_primary": 13 },
    "empty": 0, "dne": 0, "incomplete": 0, "last_epoch_started": 256849,
    "hit_set_history": { "current_last_update": "0'0", "current_last_stamp": "0.000000", "current_info": { "begin": "0.000000", "end": "0.000000", "version": "0'0" }, "history": [] } } ],
  "recovery_state": [ { "name": "Started\/Primary\/Active", "enter_time": "2015-10-11 16:30:32.551400", "might_have_unfound": [],
    "recovery_progress": { "backfill_targets": [], "waiting_on_backfill": [], "last_backfill_started": "0\/\/0\/\/-1", "backfill_info": { "begin": "0\/\/0\/\/-1", "end": "0\/\/0\/\/-1", "objects": [] }, "peer_backfill_info": [], "backfills_in_flight": [], "recovering": [], "pg_backend": { "pull_from_peer": [], "pushing": [] } },
    "scrub": { "scrubber.epoch_start": "0", "scrubber.active": 0, "scrubber.waiting_on": 0, "scrubber.waiting_on_whom": [] } },
    { "name": "Started", "enter_time": "2015-10-11 16:30:28.201561" } ],
  "agent_state": {} }
On 2015-10-11 12:19 pm, Chris Taylor wrote:
I have an OSD that went down while the cluster was recovering from another OSD being reweighted. The cluster appears to be stuck in recovery, since the number of degraded and misplaced objects is not decreasing.
It is a three-node cluster in production and the pool size is 2. Ceph version 0.94.3.
Here is a snip of the failing OSD log. The full log file was uploaded with ceph-post-file "ceph-post-file: dfcf6dff-11cb-49b0-81b8-60bf8ff898eb".
2015-10-11 10:45:06.182615 7f9270567900 20 read_log 254342'14799 (251922'9664) delete d1aa1484/rb.0.ac3386.238e1f29.0000000bb0ad/44//8 by unknown.0.0:0 2015-10-11 00:40:34.981049
2015-10-11 10:45:06.182629 7f9270567900 20 read_log 254342'14800 (251922'9665) modify d1aa1484/rb.0.ac3386.238e1f29.0000000bb0ad/head//8 by unknown.0.0:0 2015-10-11 00:40:34.981049
2015-10-11 10:45:06.182661 7f9270567900 20 read_log 6 divergent_priors
2015-10-11 10:45:06.184076 7f9270567900 10 read_log checking for missing items over interval (0'0,254342'14800]
2015-10-11 10:45:11.861683 7f9270567900 15 read_log missing 251925'9669,e9ea1484/rb.0.ac3386.238e1f29.000000187097/head//8
2015-10-11 10:45:11.861767 7f9270567900 15 read_log missing 251925'9668,e9ea1484/rb.0.ac3386.238e1f29.000000187097/44//8
2015-10-11 10:45:11.861823 7f9270567900 15 read_log missing 251925'9667,c4ea1484/rb.0.ac3386.238e1f29.00000022717d/head//8
2015-10-11 10:45:11.861877 7f9270567900 15 read_log missing 251925'9666,c4ea1484/rb.0.ac3386.238e1f29.00000022717d/68//8
2015-10-11 10:45:11.924425 7f9270567900 -1 osd/PGLog.cc: In function 'static void PGLog::read_log(ObjectStore*, coll_t, coll_t, ghobject_t, const pg_info_t&, std::map<eversion_t, hobject_t>&, PGLog::IndexedLog&, pg_missing_t&, std::ostringstream&, std::set<std::basic_string<char> >*)' thread 7f9270567900 time 2015-10-11 10:45:11.861976
osd/PGLog.cc: 962: FAILED assert(oi.version == i->first)
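For reference, the PGs that the failed OSD (osd.33) still holds on disk can be listed with Hammer's ceph-objectstore-tool while the OSD is stopped. The paths below assume the default filestore layout, so treat this as a sketch rather than exact commands:

# Run on the host with the failed OSD, with the OSD process stopped.
# Lists every PG present in that OSD's object store; the PG whose log
# triggers the read_log assert above should appear in this list.
ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-33 \
    --journal-path /var/lib/ceph/osd/ceph-33/journal \
    --op list-pgs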
    cluster d960d672-e035-413d-ba39-8341f4131760
     health HEALTH_WARN
            54 pgs backfill
            373 pgs degraded
            1 pgs recovering
            336 pgs recovery_wait
            373 pgs stuck degraded
            391 pgs stuck unclean
            43 pgs stuck undersized
            43 pgs undersized
            recovery 88034/14758314 objects degraded (0.597%)
            recovery 280423/14758314 objects misplaced (1.900%)
            recovery 28/7330234 unfound (0.000%)
     monmap e1: 3 mons at {ceph-mon1=10.20.0.11:6789/0,ceph-mon2=10.20.0.12:6789/0,ceph-mon3=10.20.0.13:6789/0}
            election epoch 6010, quorum 0,1,2 ceph-mon1,ceph-mon2,ceph-mon3
     osdmap e256816: 46 osds: 45 up, 45 in; 65 remapped pgs
      pgmap v19715504: 5184 pgs, 4 pools, 28323 GB data, 7158 kobjects
            57018 GB used, 23027 GB / 80045 GB avail
            88034/14758314 objects degraded (0.597%)
            280423/14758314 objects misplaced (1.900%)
            28/7330234 unfound (0.000%)
                4790 active+clean
                 326 active+recovery_wait+degraded
                  36 active+undersized+degraded+remapped+wait_backfill
                  18 active+remapped+wait_backfill
                   6 active+recovery_wait+undersized+degraded+remapped
                   4 active+recovery_wait+degraded+remapped
                   3 active+clean+scrubbing+deep
                   1 active+recovering+undersized+degraded+remapped
  client io 11627 kB/s rd, 36433 B/s wr, 10 op/s
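The 28 unfound objects are the part that worries me most. The usual way to see which PGs they belong to (standard commands; 8.484 below is just an example pgid, not necessarily where the unfound objects live):

# Show which PGs are reporting unfound objects:
ceph health detail | grep unfound

# For a given PG, list the missing/unfound objects and which OSDs the
# primary thinks might still have copies:
ceph pg 8.484 list_missing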
Any help would be greatly appreciated!
Thanks,
Chris