Re: [PoC -- do not apply 3/3] test-tree-bitmap: replace ewah with custom rle encoding

Junio C Hamano <gitster@xxxxxxxxx> · Wed, 10 Oct 2018 09:58:51 +0900

Jeff King <peff@xxxxxxxx> writes:

> +static void strbuf_add_varint(struct strbuf *out, uintmax_t val)
> +{
> +	size_t len;
> +	strbuf_grow(out, 16); /* enough for any varint */
> +	len = encode_varint(val, (unsigned char *)out->buf + out->len);
> +	strbuf_setlen(out, out->len + len);
> +}
> +
> +static void bitmap_to_rle(struct strbuf *out, struct bitmap *bitmap)
> +{
> +	int curval = 0; /* count zeroes, then ones, then zeroes, etc */
> +	size_t run = 0;
> +	size_t word;
> +	size_t orig_len = out->len;
> +
> +	for (word = 0; word < bitmap->word_alloc; word++) {
> +		int bit;
> +
> +		for (bit = 0; bit < BITS_IN_EWORD; bit++) {
> +			int val = !!(bitmap->words[word] & (((eword_t)1) << bit));
> +			if (val == curval)
> +				run++;
> +			else {
> +				strbuf_add_varint(out, run);
> +				curval = 1 - curval; /* flip 0/1 */
> +				run = 1;
> +			}
> +		}

OK.  I find it a bit disturbing to see that the loop knows a bit too
much about how "struct bitmap" is implemented, but that is a complaint
against the bitmap API, not this new user of the API.

We do not try to handle the case where bitmap has bits that is not
multiple of BITS_IN_EWORD and instead pretend that size of such a
bitmap can be rounded up, because we ignore trailing 0-bit anyway,
and we know the "struct bitmap" would pad with 0-bit at the tail?

> +	}
> +
> +	/*
> +	 * complete the run, but do not bother with trailing zeroes, unless we
> +	 * failed to write even an initial run of 0's.
> +	 */
> +	if (curval && run)
> +		strbuf_add_varint(out, run);
> +	else if (orig_len == out->len)
> +		strbuf_add_varint(out, 0);
> +
> +	/* signal end-of-input with an empty run */
> +	strbuf_add_varint(out, 0);
> +}

OK.

> +static size_t rle_each_bit(const unsigned char *in, size_t len,
> +			   void (*fn)(size_t, void *), void *data)
> +{
> +	int curval = 0; /* look for zeroes first, then ones, etc */
> +	const unsigned char *cur = in;
> +	const unsigned char *end = in + len;
> +	size_t pos;
> +
> +	/* we always have a first run, even if it's 0 zeroes */
> +	pos = decode_varint(&cur);
> +
> +	/*
> +	 * ugh, varint does not seem to have a way to prevent reading past
> +	 * the end of the buffer. We'll do a length check after each one,
> +	 * so the worst case is bounded.
> +	 */

Sorry about that :-).

> +	if (cur > end) {
> +		error("input underflow in rle");
> +		return len;
> +	}
> +
> +	while (1) {
> +		size_t run = decode_varint(&cur);
> +
> +		if (cur > end) {
> +			error("input underflow in rle");
> +			return len;
> +		}
> +
> +		if (!run)
> +			break; /* empty run signals end */
> +
> +		curval = 1 - curval; /* flip 0/1 */
> +		if (curval) {
> +			/* we have a run of 1's; deliver them */
> +			size_t i;
> +			for (i = 0; i < run; i++)
> +				fn(pos + i, data);
> +		}
> +		pos += run;
> +	}
> +
> +	return cur - in;
> +}

Makes sense.