Re: [PATCH v10 09/12] Add reftable library

Danh Doan <congdanhqx@xxxxxxxxx> · Tue, 28 Apr 2020 21:55:41 +0700

On 2020-04-27 20:13:35+0000, Han-Wen Nienhuys via GitGitGadget <gitgitgadget@xxxxxxxxx> wrote:
[..snip..]
> diff --git a/reftable/LICENSE b/reftable/LICENSE
> new file mode 100644
> index 00000000000..402e0f9356b
> --- /dev/null
> +++ b/reftable/LICENSE
> @@ -0,0 +1,31 @@
> +BSD License
> +
> +Copyright (c) 2020, Google LLC
> +All rights reserved.

Linking against GPLv2 code will make this code GPLv2, right?

> --- /dev/null
> +++ b/reftable/README.md
> @@ -0,0 +1,11 @@
> +
> +The source code in this directory comes from https://github.com/google/reftable.
> +
> +The VERSION file keeps track of the current version of the reftable library.
> +
> +To update the library, do:
> +
> +   sh reftable/update.sh

I agree with Dscho, we should do a git-subtree merge instead.

[..snip..]
> +#include "system.h"
> +
> +void put_be24(byte *out, uint32_t i)

So, we introduce a new type `byte`?

> +{
> +	out[0] = (byte)((i >> 16) & 0xff);
> +	out[1] = (byte)((i >> 8) & 0xff);
> +	out[2] = (byte)((i)&0xff);

At my first glance, I thought that code is:

	out[2] = (byte)( (i)(&0xff) )

It's totally un-parsable.

Anyway, we're gonna accept "expand tab to 4 spaces" in Git, now?

> +}
> +
> +uint32_t get_be24(byte *in)
> +{
> +	return (uint32_t)(in[0]) << 16 | (uint32_t)(in[1]) << 8 |
> +	       (uint32_t)(in[2]);

I don't think we need this much cast. One cast can rule them all.

> +}
> +
> +void put_be16(uint8_t *out, uint16_t i)
> +{
> +	out[0] = (uint8_t)((i >> 8) & 0xff);
> +	out[1] = (uint8_t)((i)&0xff);

This is un-parsable, see above.

> +}
> +
> +int binsearch(int sz, int (*f)(int k, void *args), void *args)

binary search?

I think we want all size to be `size_t`

> +{
> +	int lo = 0;
> +	int hi = sz;
> +
> +	/* invariant: (hi == sz) || f(hi) == true
> +	   (lo == 0 && f(0) == true) || fi(lo) == false
> +	 */

Comment style

> +	while (hi - lo > 1) {
> +		int mid = lo + (hi - lo) / 2;
> +
> +		int val = f(mid, args);
> +		if (val) {
> +			hi = mid;

But this line of code makes me think this function is binary partition
instead.

> +		} else {
> +			lo = mid;
> +		}

bracket style

> +	}
> +

[..snip..]

> diff --git a/reftable/basics.h b/reftable/basics.h
> new file mode 100644
> index 00000000000..6d89eb5d931
> --- /dev/null
> +++ b/reftable/basics.h
> @@ -0,0 +1,53 @@
[..snip..]
> +  find smallest index i in [0, sz) at which f(i) is true, assuming
> +  that f is ascending. Return sz if f(i) is false for all indices.
> +*/

So, this is about partitioning, not searching

> +int binsearch(int sz, int (*f)(int k, void *args), void *args);
> +
> +int block_writer_add(struct block_writer *w, struct record rec)

So, we're gonna pass a large structure into a function now?

> +{
> +	struct slice empty = { 0 };
> +	struct slice last = w->entries % w->restart_interval == 0 ? empty :
> +								    w->last_key;
> +	struct slice out = {
> +		.buf = w->buf + w->next,
> +		.len = w->block_size - w->next,
> +	};
> +
> +	struct slice start = out;
> +
> +	bool restart = false;

Do we want new type: bool?

> +	struct slice key = { 0 };
> +	int n = 0;
> +
> +	record_key(rec, &key);
> +	n = encode_key(&restart, out, last, key, record_val_type(rec));
> +	if (n < 0) {
> +		goto err;
> +	}
> +	slice_consume(&out, n);
> +
> +	n = record_encode(rec, out, w->hash_size);
> +	if (n < 0) {
> +		goto err;
> +	}
> +	slice_consume(&out, n);
> +
> +	if (block_writer_register_restart(w, start.len - out.len, restart,
> +					  key) < 0) {

Eh, naming is hard, and so does this function. It's overly verbose,
and long. I haven't go though all of this patch (because it's too
long), but I guess some of them can be made static and their name can
be shortened.

> +		goto err;
> +	}
> +
> +	slice_clear(&key);
> +	return 0;
> +
> +err:
> +	slice_clear(&key);
> +	return -1;
> +}

[..snip..]
> +		block_source_return_block(block->source, block);
> +		block->data = uncompressed.buf;
> +		block->len = dst_len; /* XXX: 4 bytes missing? */

Even this is a very lengthy patch, some bytes is missing but we
couldn't figured out.

> +		block->source = malloc_block_source();
> +		full_block_size = src_len + block_header_skip;
> +	} else if (full_block_size == 0) {
> +		full_block_size = sz;
> +	} else if (sz < full_block_size && sz < block->len &&
> +		   block->data[sz] != 0) {
> +		/* If the block is smaller than the full block size, it is
> +		   padded (data followed by '\0') or the next block is
> +		   unaligned. */
> +		full_block_size = sz;
> +	}
> +
> +	{

This cute curly braces is new to me

> +		uint16_t restart_count = get_be16(block->data + sz - 2);
> +		uint32_t restart_start = sz - 2 - 3 * restart_count;
> +
> +		byte *restart_bytes = block->data + restart_start;
> +
> +		/* transfer ownership. */
> +		br->block = *block;
> +		block->data = NULL;
> +		block->len = 0;
> +
> +		br->hash_size = hash_size;
> +		br->block_len = restart_start;
> +		br->full_block_size = full_block_size;
> +		br->header_off = header_off;
> +		br->restart_count = restart_count;
> +		br->restart_bytes = restart_bytes;
> +	}
> +
> +	return 0;
> +}
> +
> +static uint32_t block_reader_restart_offset(struct block_reader *br, int i)
> +{
> +	return get_be24(br->restart_bytes + 3 * i);
> +}
> +
> +void block_reader_start(struct block_reader *br, struct block_iter *it)
> +{
> +	it->br = br;
> +	slice_resize(&it->last_key, 0);
> +	it->next_off = br->header_off + 4;
> +}
> +
> +struct restart_find_args {
> +	int error;
> +	struct slice key;
> +	struct block_reader *r;
> +};

I would like to see all structure declaration at top of file.

> +
> +static int restart_key_less(int idx, void *args)

idx? index? Shouldn't it be size_t?

> +{
> +	struct restart_find_args *a = (struct restart_find_args *)args;
> +	uint32_t off = block_reader_restart_offset(a->r, idx);
> +	struct slice in = {
> +		.buf = a->r->block.data + off,
> +		.len = a->r->block_len - off,

C99 designated initialisation.
We aren't ready, yet.

> +void block_iter_copy_from(struct block_iter *dest, struct block_iter *src)
> +{
> +	dest->br = src->br;
> +	dest->next_off = src->next_off;
> +	slice_copy(&dest->last_key, src->last_key);
> +}
> +
> +/* return < 0 for error, 0 for OK, > 0 for EOF. */
> +int block_iter_next(struct block_iter *it, struct record rec)

If this function is used only in this file, it should be static,
otherwise, the comment should go to header file

> diff --git a/reftable/block.h b/reftable/block.h
> new file mode 100644
> index 00000000000..62b2e0fec6d
> +/*
> +  Writes reftable blocks. The block_writer is reused across blocks to minimize
> +  allocation overhead.
> +*/
> +struct block_writer {
> +	byte *buf;
> +	uint32_t block_size;

I suppose this means 2^32 block should be enough for everyone?

> diff --git a/reftable/bytes.c b/reftable/bytes.c
> new file mode 100644
> index 00000000000..e69de29bb2d

Empty?

> diff --git a/reftable/config.h b/reftable/config.h
> new file mode 100644
> index 00000000000..40a8c178f10
> --- /dev/null
> +++ b/reftable/config.h
> @@ -0,0 +1 @@
> +/* empty */

Empty?

> diff --git a/reftable/dump.c b/reftable/dump.c
> new file mode 100644
> index 00000000000..c0065792a4f
> --- /dev/null
> +++ b/reftable/dump.c
> @@ -0,0 +1,97 @@
> +/*
> +Copyright 2020 Google LLC
> +
> +Use of this source code is governed by a BSD-style
> +license that can be found in the LICENSE file or at
> +https://developers.google.com/open-source/licenses/bsd
> +*/
> +
> +#include "system.h"
> +
> +#include "reftable.h"
> +
> +static int dump_table(const char *tablename)
> +{
> +	struct block_source src = { 0 };
> +	int err = block_source_from_file(&src, tablename);
> +	if (err < 0) {
> +		return err;
> +	}
> +
> +	struct reader *r = NULL;

-Wdeclaration-after-statement

> +	err = new_reader(&r, src, tablename);
> +	if (err < 0) {
> +		return err;
> +	}
> +
> +	{
> +		struct iterator it = { 0 };
> +		err = reader_seek_ref(r, &it, "");
> +		if (err < 0) {
> +			return err;
> +		}
> +
> +		struct ref_record ref = { 0 };
> +		while (1) {

In the same series, we see both `while (true)` and `while (1)`

> +int main(int argc, char *argv[])

Do we ship this file?
(I haven't checked change to Makefile, yet)

> +{
> +	int opt;
> +	const char *table = NULL;
> +	while ((opt = getopt(argc, argv, "t:")) != -1) {
> +		switch (opt) {
> +		case 't':
> +			table = xstrdup(optarg);
> +			break;
> +		case '?':

We uses "-h|--help" for usage.

> +			printf("usage: %s [-table tablefile]\n", argv[0]);
> +			return 2;
> +			break;

break after return

> diff --git a/reftable/iter.c b/reftable/iter.c
> new file mode 100644
> index 00000000000..747f3eae256
> --- /dev/null
> +++ b/reftable/iter.c
> @@ -0,0 +1,234 @@
> +/*
> +Copyright 2020 Google LLC
> +
> +Use of this source code is governed by a BSD-style
> +license that can be found in the LICENSE file or at
> +https://developers.google.com/open-source/licenses/bsd
> +*/
> +
> +#include "iter.h"
> +
> +#include "system.h"
> +
> +#include "block.h"
> +#include "constants.h"
> +#include "reader.h"
> +#include "reftable.h"
> +
> +bool iterator_is_null(struct reftable_iterator it)
> +{
> +	return it.ops == NULL;
> +}

Do we want this verbose?
Or it'll be use in some kind of vtable?

> +
> +static int filtering_ref_iterator_next(void *iter_arg, struct record rec)
> +{
> +	struct filtering_ref_iterator *fri =
> +		(struct filtering_ref_iterator *)iter_arg;
> +	struct reftable_ref_record *ref =
> +		(struct reftable_ref_record *)rec.data;

We're passing a pointer, references a member of a variable in local
scope around, this will ask for trouble if callee doesn't aware about
it, and save that pointer somewhere for later access.

I really think we shouldn't merge a big patch like this into Git,
it's too hard to follow and it's harder to reasoning.

> +
> +	while (true) {
> +		int err = reftable_iterator_next_ref(fri->it, ref);
> +		if (err != 0) {
> +			return err;
> +		}
> +
> +		if (fri->double_check) {
> +			struct reftable_iterator it = { 0 };
> +
> +			int err = reftable_reader_seek_ref(fri->r, &it,
> +							   ref->ref_name);
> +			if (err == 0) {
> +				err = reftable_iterator_next_ref(it, ref);
> +			}
> +
> +			reftable_iterator_destroy(&it);
> +
> +			if (err < 0) {
> +				return err;
> +			}
> +
> +			if (err > 0) {
> +				continue;
> +			}
> +		}
> +
> +		if ((ref->target_value != NULL &&
> +		     !memcmp(fri->oid.buf, ref->target_value, fri->oid.len)) ||
> +		    (ref->value != NULL &&
> +		     !memcmp(fri->oid.buf, ref->value, fri->oid.len))) {
> +			return 0;
> +		}
> +	}
> +}
> +
> +}
> +
> diff --git a/reftable/iter.h b/reftable/iter.h
> new file mode 100644
> index 00000000000..408b7f41a74

I stopped at this. It's too tiresome to read this patch.

Overall, I have those general comments, when I have _NOT_ read the
design:

* This patch introduce a total different coding style
* size should be in size_t, not int
* passing large structure around, and passing pointer to some member
  of that structure around, this will ask for trouble.
* Some code are un-parsable,
* I strongly agree with Dscho, we should merge this patch as subtree
  instead, or we should provide API so other party can provide plugin
  ref-backend.

-- 
Danh