Hi,
Looking for somebody with a good memory.
I have a hammer OSD 0.94.10 that keeps crashing in the bottom assert.
Can somebody give me an indication what the problem there is, and
perhaps suggest a way to work around it. (prefably without building a
new/fixed osd.)
Thanx,
--WjW
Last parts of the crash:
-11> 2018-09-07 16:38:38.465425 7f409b042700 1 --
10.128.4.4:6802/3457 <== osd.48 10.128.4.9:6810/4048 174925602 ====
osd_repop(client.6334140.0:241128738 11.0
11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v
44254'78496421) v1 ==== 929+0+4874 (3087930260 0 3406714671) 0x1a632000
con 0x5597e40
-10> 2018-09-07 16:38:38.465480 7f40ba331700 5 -- op tracker --
seq: 1556168118, time: 2018-09-07 16:38:38.465480, event: reached_pg,
op: osd_repop(client.6334140.0:241128738 11.0
11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421)
-9> 2018-09-07 16:38:38.465501 7f40ba331700 5 -- op tracker --
seq: 1556168118, time: 2018-09-07 16:38:38.465501, event: started, op:
osd_repop(client.6334140.0:241128738 11.0
11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421)
-8> 2018-09-07 16:38:38.465559 7f40ba331700 5 write_log with:
dirty_to: 0'0, dirty_from: 4294967295'18446744073709551615,
dirty_divergent_priors: false, divergent_priors: 0, writeout_from:
44254'78496421, trimmed:
-7> 2018-09-07 16:38:38.465610 7f40ba331700 5 -- op tracker --
seq: 1556168118, time: 2018-09-07 16:38:38.465610, event:
commit_queued_for_journal_write, op:
osd_repop(client.6334140.0:241128738 11.0
11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421)
-6> 2018-09-07 16:38:38.465671 7f40ce2d5700 5 -- op tracker --
seq: 1556168118, time: 2018-09-07 16:38:38.465670, event:
write_thread_in_journal_buffer, op: osd_repop(client.6334140.0:241128738
11.0 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v
44254'78496421)
-5> 2018-09-07 16:38:38.465850 7f40cdad4700 5 -- op tracker --
seq: 1556168118, time: 2018-09-07 16:38:38.465850, event:
journaled_completion_queued, op: osd_repop(client.6334140.0:241128738
11.0 11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v
44254'78496421)
-4> 2018-09-07 16:38:38.465874 7f40cb2cf700 5 -- op tracker --
seq: 1556168118, time: 2018-09-07 16:38:38.465873, event: commit_sent,
op: osd_repop(client.6334140.0:241128738 11.0
11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421)
-3> 2018-09-07 16:38:38.465887 7f40cb2cf700 1 --
10.128.4.4:6802/3457 --> 10.128.4.9:6810/4048 --
osd_repop_reply(client.6334140.0:241128738 11.0 ondisk, result = 0) v1
-- ?+0 0x21e09840 con 0x5597e40
-2> 2018-09-07 16:38:38.466366 7f40cbad0700 5 -- op tracker --
seq: 1556168118, time: 2018-09-07 16:38:38.466366, event:
sub_op_applied, op: osd_repop(client.6334140.0:241128738 11.0
11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421)
-1> 2018-09-07 16:38:38.466396 7f40cbad0700 5 -- op tracker --
seq: 1556168118, time: 2018-09-07 16:38:38.466396, event: done, op:
osd_repop(client.6334140.0:241128738 11.0
11/31e1e080/rbd_data.50b1d42e237d92.00000000000001ab/head v 44254'78496421)
0> 2018-09-07 16:38:38.468999 7f40b832d700 -1 osd/ReplicatedPG.cc:
In function 'void ReplicatedPG::scan_range(int, int,
PG::BackfillInterval*, ThreadPool::TPHandle&)' thread 7f40b832d700 time
2018-09-07 16:38:38.462751
osd/ReplicatedPG.cc: 10115: FAILED assert(r >= 0)
And the hammer code that goes with it:
void ReplicatedPG::scan_range(
int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle)
{
assert(is_locked());
dout(10) << "scan_range from " << bi->begin << dendl;
bi->objects.clear(); // for good measure
vector<hobject_t> ls;
ls.reserve(max);
int r = pgbackend->objects_list_partial(bi->begin, min, max, 0, &ls,
&bi->end);
assert(r >= 0);
dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
dout(20) << ls << dendl;
for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
handle.reset_tp_timeout();
ObjectContextRef obc;
if (is_primary())
obc = object_contexts.lookup(*p);
if (obc) {
bi->objects[*p] = obc->obs.oi.version;
dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
} else {
bufferlist bl;
int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
/* If the object does not exist here, it must have been removed
* between the collection_list_partial and here. This can happen
* for the first item in the range, which is usually last_backfill.
*/
if (r == -ENOENT)
continue;
assert(r >= 0);
object_info_t oi(bl);
bi->objects[*p] = oi.version;
dout(20) << " " << *p << " " << oi.version << dendl;
}