I'm not seeing much in the tracker for scan_range. There's http://tracker.ceph.com/issues/13853 which got closed as Can't Reproduce, and that references http://tracker.ceph.com/issues/10150 which introduced the exemption for r==-ENOENT. Honestly when I see errors on a read my first guess is usually that the disk state is corrupted somehow. Have you tried examining the object it's crashing on through the filesystem and using the objectstore tool? -Greg On Fri, Sep 7, 2018 at 8:15 AM, Willem Jan Withagen <wjw@xxxxxxxxxxx> wrote: > Hi, > > Looking for somebody with a good memory. > > I have a hammer OSD 0.94.10 that keeps crashing in the bottom assert. > Can somebody give me an indication what the problem there is, and perhaps > suggest a way to work around it. (preferably without building a new/fixed > osd.) > > Thanx, > --WjW > > Last parts of the crash: > -11> 2018-09-07 16:38:38.465425 7f409b042700 1 -- 10.128.4.4:6802/3457 > <== osd.48 10.128.4.9:6810/4048 174925602 ==== > osd_repop(client.6334140.0:241128738 11.0 > 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421) > v1 ==== 929+0+4874 (3087930260 0 3406714671) 0x1a632000 con 0x5597e40 > -10> 2018-09-07 16:38:38.465480 7f40ba331700 5 -- op tracker -- seq: > 1556168118, time: 2018-09-07 16:38:38.465480, event: reached_pg, op: > osd_repop(client.6334140.0:241128738 11.0 > 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421) > -9> 2018-09-07 16:38:38.465501 7f40ba331700 5 -- op tracker -- seq: > 1556168118, time: 2018-09-07 16:38:38.465501, event: started, op: > osd_repop(client.6334140.0:241128738 11.0 > 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421) > -8> 2018-09-07 16:38:38.465559 7f40ba331700 5 write_log with: dirty_to: > 0'0, dirty_from: 4294967295'18446744073709551615, dirty_divergent_priors: > false, divergent_priors: 0, writeout_from: 44254'78496421, trimmed: > -7> 2018-09-07 16:38:38.465610 7f40ba331700 5 -- op tracker -- seq: > 
1556168118, time: 2018-09-07 16:38:38.465610, event: > commit_queued_for_journal_write, op: osd_repop(client.6334140.0:241128738 > 11.0 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v > 44254'78496421) > -6> 2018-09-07 16:38:38.465671 7f40ce2d5700 5 -- op tracker -- seq: > 1556168118, time: 2018-09-07 16:38:38.465670, event: > write_thread_in_journal_buffer, op: osd_repop(client.6334140.0:241128738 > 11.0 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v > 44254'78496421) > -5> 2018-09-07 16:38:38.465850 7f40cdad4700 5 -- op tracker -- seq: > 1556168118, time: 2018-09-07 16:38:38.465850, event: > journaled_completion_queued, op: osd_repop(client.6334140.0:241128738 11.0 > 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421) > -4> 2018-09-07 16:38:38.465874 7f40cb2cf700 5 -- op tracker -- seq: > 1556168118, time: 2018-09-07 16:38:38.465873, event: commit_sent, op: > osd_repop(client.6334140.0:241128738 11.0 > 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421) > -3> 2018-09-07 16:38:38.465887 7f40cb2cf700 1 -- 10.128.4.4:6802/3457 > --> 10.128.4.9:6810/4048 -- osd_repop_reply(client.6334140.0:241128738 11.0 > ondisk, result = 0) v1 -- ?+0 0x21e09840 con 0x5597e40 > -2> 2018-09-07 16:38:38.466366 7f40cbad0700 5 -- op tracker -- seq: > 1556168118, time: 2018-09-07 16:38:38.466366, event: sub_op_applied, op: > osd_repop(client.6334140.0:241128738 11.0 > 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421) > -1> 2018-09-07 16:38:38.466396 7f40cbad0700 5 -- op tracker -- seq: > 1556168118, time: 2018-09-07 16:38:38.466396, event: done, op: > osd_repop(client.6334140.0:241128738 11.0 > 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421) > 0> 2018-09-07 16:38:38.468999 7f40b832d700 -1 osd/ReplicatedPG.cc: In > function 'void ReplicatedPG::scan_range(int, int, PG::BackfillInterval*, > ThreadPool::TPHandle&)' thread 7f40b832d700 time 2018-09-07 16:38:38.462751 
> osd/ReplicatedPG.cc: 10115: FAILED assert(r >= 0) > > And the hammer code that goes with it: > void ReplicatedPG::scan_range( > int min, int max, BackfillInterval *bi, > ThreadPool::TPHandle &handle) > { > assert(is_locked()); > dout(10) << "scan_range from " << bi->begin << dendl; > bi->objects.clear(); // for good measure > > vector<hobject_t> ls; > ls.reserve(max); > int r = pgbackend->objects_list_partial(bi->begin, min, max, 0, &ls, > &bi->end); > assert(r >= 0); > dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl; > dout(20) << ls << dendl; > > for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) { > handle.reset_tp_timeout(); > ObjectContextRef obc; > if (is_primary()) > obc = object_contexts.lookup(*p); > if (obc) { > bi->objects[*p] = obc->obs.oi.version; > dout(20) << " " << *p << " " << obc->obs.oi.version << dendl; > } else { > bufferlist bl; > int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl); > > /* If the object does not exist here, it must have been removed > * between the collection_list_partial and here. This can happen > * for the first item in the range, which is usually last_backfill. > */ > if (r == -ENOENT) > continue; > > assert(r >= 0); > object_info_t oi(bl); > bi->objects[*p] = oi.version; > dout(20) << " " << *p << " " << oi.version << dendl; > } > >