Re: [RFC PATCH 10/10] pack-objects: improve partial packfile reuse

Jonathan Tan <jonathantanmy@xxxxxxxxxx> · Thu, 10 Oct 2019 16:59:52 -0700

I'm going to start with pack-bitmap.h, then builtin/pack-objects.c.

>  int reuse_partial_packfile_from_bitmap(struct bitmap_index *,
>  				       struct packed_git **packfile,
> -				       uint32_t *entries, off_t *up_to);
> +				       uint32_t *entries,
> +				       struct bitmap **bitmap);

Makes sense. The existing code determines if the given packfile is
suitable, and if yes, the span of the packfile to use. With this patch,
we resolve to use the given packfile, and want to compute which objects
to use, storing the computation result in an uncompressed bitmap.
(Resolving to use the given packfile is not that much different from
existing behavior, as far as I know, since we currently only consider
one packfile at most anyway.)

So replacing the out param makes sense, although a more descriptive name
than "bitmap" would be nice.

Skipping pack-bitmaps.c for now.

OK, builtin/pack-objects.c.

> +/*
> + * Record the offsets needed in our reused packfile chunks due to
> + * "gaps" where we omitted some objects.
> + */
> +static struct reused_chunk {
> +	off_t start;
> +	off_t offset;
> +} *reused_chunks;
> +static int reused_chunks_nr;
> +static int reused_chunks_alloc;

This makes sense - offsets may be different when we omit objects from
the packfile. I think this can be computed by calculating the number of
zero bits between the current object's index and the nth object prior
(where n is the offset) in the bitmap resulting from
reuse_partial_packfile_from_bitmap() above, thus eliminating the need
for this array, but I haven't tested it.

> +static void write_reused_pack_one(size_t pos, struct hashfile *out,
> +				  struct pack_window **w_curs)
> +{
> +	off_t offset, next, cur;
> +	enum object_type type;
> +	unsigned long size;
>  
> +	offset = reuse_packfile->revindex[pos].offset;
> +	next = reuse_packfile->revindex[pos + 1].offset;
>  
> +	record_reused_object(offset, offset - hashfile_total(out));
>  
> +	cur = offset;
> +	type = unpack_object_header(reuse_packfile, w_curs, &cur, &size);
> +	assert(type >= 0);
>  
> +	if (type == OBJ_OFS_DELTA) {
> +		off_t base_offset;
> +		off_t fixup;
> +
> +		unsigned char header[MAX_PACK_OBJECT_HEADER];
> +		unsigned len;
> +
> +		base_offset = get_delta_base(reuse_packfile, w_curs, &cur, type, offset);
> +		assert(base_offset != 0);
> +
> +		/* Convert to REF_DELTA if we must... */
> +		if (!allow_ofs_delta) {
> +			int base_pos = find_revindex_position(reuse_packfile, base_offset);
> +			const unsigned char *base_sha1 =
> +				nth_packed_object_sha1(reuse_packfile,
> +						       reuse_packfile->revindex[base_pos].nr);
> +
> +			len = encode_in_pack_object_header(header, sizeof(header),
> +							   OBJ_REF_DELTA, size);
> +			hashwrite(out, header, len);
> +			hashwrite(out, base_sha1, 20);
> +			copy_pack_data(out, reuse_packfile, w_curs, cur, next - cur);
> +			return;
> +		}
>  
> +		/* Otherwise see if we need to rewrite the offset... */
> +		fixup = find_reused_offset(offset) -
> +			find_reused_offset(base_offset);
> +		if (fixup) {
> +			unsigned char ofs_header[10];
> +			unsigned i, ofs_len;
> +			off_t ofs = offset - base_offset - fixup;
> +
> +			len = encode_in_pack_object_header(header, sizeof(header),
> +							   OBJ_OFS_DELTA, size);
> +
> +			i = sizeof(ofs_header) - 1;
> +			ofs_header[i] = ofs & 127;
> +			while (ofs >>= 7)
> +				ofs_header[--i] = 128 | (--ofs & 127);
> +
> +			ofs_len = sizeof(ofs_header) - i;
> +
> +			if (0) {
> +				off_t expected_size = cur - offset;
> +
> +				if (len + ofs_len < expected_size) {
> +					unsigned max_pad = (len >= 4) ? 9 : 5;
> +					header[len - 1] |= 0x80;
> +					while (len < max_pad && len + ofs_len < expected_size)
> +						header[len++] = 0x80;
> +					header[len - 1] &= 0x7F;
> +				}
> +			}

This if(0) should be deleted.

> +
> +			hashwrite(out, header, len);
> +			hashwrite(out, ofs_header + sizeof(ofs_header) - ofs_len, ofs_len);
> +			copy_pack_data(out, reuse_packfile, w_curs, cur, next - cur);
> +			return;
> +		}
> +
> +		/* ...otherwise we have no fixup, and can write it verbatim */
> +	}
> +
> +	copy_pack_data(out, reuse_packfile, w_curs, offset, next - offset);
> +}

I didn't look into detail, but this looks like it's writing a single
object. In particular, it can convert OFS into REF, something that the
existing code cannot do.

> +static size_t write_reused_pack_verbatim(struct hashfile *out,
> +					 struct pack_window **w_curs)
> +{
> +	size_t pos = 0;
> +
> +	while (pos < reuse_packfile_bitmap->word_alloc &&
> +			reuse_packfile_bitmap->words[pos] == (eword_t)~0)
> +		pos++;
> +
> +	if (pos) {
> +		off_t to_write;
> +
> +		written = (pos * BITS_IN_EWORD);
> +		to_write = reuse_packfile->revindex[written].offset
> +			- sizeof(struct pack_header);
> +
> +		record_reused_object(sizeof(struct pack_header), 0);
> +		hashflush(out);
> +		copy_pack_data(out, reuse_packfile, w_curs,
> +			sizeof(struct pack_header), to_write);
>  
>  		display_progress(progress_state, written);
>  	}
> +	return pos;
> +}

I presume this optimization is needed for the case where we are using
*all* objects of a packfile, as is typical during a clone.

> +static void write_reused_pack(struct hashfile *f)
> +{
> +	size_t i = 0;
> +	uint32_t offset;
> +	struct pack_window *w_curs = NULL;
> +
> +	if (allow_ofs_delta)
> +		i = write_reused_pack_verbatim(f, &w_curs);
> +
> +	for (; i < reuse_packfile_bitmap->word_alloc; ++i) {
> +		eword_t word = reuse_packfile_bitmap->words[i];
> +		size_t pos = (i * BITS_IN_EWORD);
> +
> +		for (offset = 0; offset < BITS_IN_EWORD; ++offset) {
> +			if ((word >> offset) == 0)
> +				break;
> +
> +			offset += ewah_bit_ctz64(word >> offset);
> +			write_reused_pack_one(pos + offset, f, &w_curs);
> +			display_progress(progress_state, ++written);
> +		}
> +	}
>  
> +	unuse_pack(&w_curs);
>  }

I didn't look into detail, but it looks like a straightforward loop.

> @@ -1002,6 +1132,10 @@ static int have_duplicate_entry(const struct object_id *oid,
>  {
>  	struct object_entry *entry;
>  
> +	if (reuse_packfile_bitmap &&
> +	    bitmap_walk_contains(bitmap_git, reuse_packfile_bitmap, oid))
> +		return 1;

Hmm...why did we previously not need to check the reuse information, but
we do now? I gave the code a cursory glance but couldn't find the
answer.

> @@ -2571,7 +2706,9 @@ static void ll_find_deltas(struct object_entry **list, unsigned list_size,
>  
>  static int obj_is_packed(const struct object_id *oid)
>  {
> -	return !!packlist_find(&to_pack, oid, NULL);
> +	return packlist_find(&to_pack, oid, NULL) ||
> +		(reuse_packfile_bitmap &&
> +		 bitmap_walk_contains(bitmap_git, reuse_packfile_bitmap, oid));

Same question here - why do we need to check the reuse information?

> @@ -3061,7 +3199,6 @@ static void loosen_unused_packed_objects(void)
>  static int pack_options_allow_reuse(void)
>  {
>  	return pack_to_stdout &&
> -	       allow_ofs_delta &&
>  	       !ignore_packed_keep_on_disk &&
>  	       !ignore_packed_keep_in_core &&
>  	       (!local || !have_non_local_packs) &&

Relaxing of the allow_ofs_delta condition - makes sense given (as above)
we can convert OFS to REF.

Overall I think that this makes sense, except for my unanswered
questions (and the presence of reused_chunk).