Re: [PATCH v2 09/23] pseudo-merge: implement support for selecting pseudo-merge commits

Patrick Steinhardt <ps@xxxxxx> · Mon, 6 May 2024 13:53:01 +0200

On Mon, Apr 29, 2024 at 04:43:37PM -0400, Taylor Blau wrote:
[snip]
> +static uint32_t pseudo_merge_group_size(const struct pseudo_merge_group *group,
> +					const struct pseudo_merge_matches *matches,
> +					uint32_t i)
> +{
> +	float C = 0.0f;
> +	uint32_t n;
> +
> +	/*
> +	 * The size of pseudo-merge groups decays according to a power series,
> +	 * which looks like:
> +	 *
> +	 *   f(n) = C * n^-k
> +	 *
> +	 * , where 'n' is the n-th pseudo-merge group, 'f(n)' is its size, 'k'
> +	 * is the decay rate, and 'C' is a scaling value.
> +	 *
> +	 * The value of C depends on the number of groups, decay rate, and total
> +	 * number of commits. It is computed such that if there are M and N
> +	 * total groups and commits, respectively, that:
> +	 *
> +	 *   N = f(0) + f(1) + ... f(M-1)
> +	 *
> +	 * Rearranging to isolate C, we get:
> +	 *
> +	 *   N = \sum_{n=1}^M C / n^k
> +	 *
> +	 *   N / C = \sum_{n=1}^M n^-k
> +	 *
> +	 *   C = N / \sum_{n=1}^M n^-k
> +	 *
> +	 * For example, if we have a decay rate of 'k' being equal to 1.5, 'N'
> +	 * total commits equal to 10,000, and 'M' being equal to 6 groups, then
> +	 * the (rounded) group sizes are:
> +	 *
> +	 *   { 5469, 1934, 1053, 684, 489, 372 }
> +	 *
> +	 * increasing the number of total groups, say to 10, scales the group
> +	 * sizes appropriately:
> +	 *
> +	 *   { 5012, 1772, 964, 626, 448, 341, 271, 221, 186, 158 }
> +	 */
> +	for (n = 0; n < group->max_merges; n++)
> +		C += 1.0f / gitexp(n + 1, group->decay);
> +	C = matches->unstable_nr / C;
> +
> +	return (int)((C / gitexp(i + 1, group->decay)) + 0.5);

Why do we cast the return to `int` when the function returns a
`uint32_t`?

> +}
> +
> +static void init_pseudo_merge_group(struct pseudo_merge_group *group)

Nit: Should't the name rather be `pseudo_merge_group_init()`?

[snip]
> +	} else if (!strcmp(key, "decay")) {
> +		group->decay = git_config_int(var, value, ctx->kvi);
> +		if (group->decay < 0) {
> +			warning(_("%s must be non-negative, using default"), var);
> +			group->decay = DEFAULT_PSEUDO_MERGE_DECAY;
> +		}

The decay is a float, and your decay rate examples mention a rate of
1.5f. It's impossible to specify fractional rates though because we use
`git_config_int()`. Should we introduce a new `git_config_float()`
function to implement this properly?

> +	} else if (!strcmp(key, "samplerate")) {
> +		group->sample_rate = git_config_int(var, value, ctx->kvi);
> +		if (!(0 <= group->sample_rate && group->sample_rate <= 100)) {
> +			warning(_("%s must be between 0 and 100, using default"), var);
> +			group->sample_rate = DEFAULT_PSEUDO_MERGE_SAMPLE_RATE;
> +		}
> +	} else if (!strcmp(key, "threshold")) {
> +		if (git_config_expiry_date(&group->threshold, var, value)) {
> +			strbuf_release(&buf);

Instead of having multiple exit paths where we need to release `buf` we
should likely have a comment exit path.

[snip]
> +static struct commit *push_pseudo_merge(struct pseudo_merge_group *group)
> +{
> +	struct commit *merge;
> +
> +	ALLOC_GROW(group->merges, group->merges_nr + 1, group->merges_alloc);
> +
> +	merge = alloc_commit_node(the_repository);
> +	merge->object.parsed = 1;

Why can we mark the object as parsed here?

> +	merge->object.flags |= BITMAP_PSEUDO_MERGE;
> +
> +	group->merges[group->merges_nr++] = merge;
> +
> +	return merge;
> +}
> +
> +static struct pseudo_merge_commit_idx *pseudo_merge_idx(kh_oid_map_t *pseudo_merge_commits,
> +							const struct object_id *oid)
> +
> +{
> +	struct pseudo_merge_commit_idx *pmc;
> +	khiter_t hash_pos;
> +
> +	hash_pos = kh_get_oid_map(pseudo_merge_commits, *oid);
> +	if (hash_pos == kh_end(pseudo_merge_commits)) {
> +		int hash_ret;
> +		hash_pos = kh_put_oid_map(pseudo_merge_commits, *oid, &hash_ret);
> +		CALLOC_ARRAY(pmc, 1);
> +
> +		kh_value(pseudo_merge_commits, hash_pos) = pmc;
> +	} else {
> +		pmc = kh_value(pseudo_merge_commits, hash_pos);
> +	}
> +
> +	return pmc;
> +}

Can't we simplify this to the following (untested):

static struct pseudo_merge_commit_idx *pseudo_merge_idx(kh_oid_map_t *pseudo_merge_commits,
                                                       const struct object_id *oid)
{
       struct pseudo_merge_commit_idx *pmc;
       khiter_t hash_pos;
       int hash_ret;

       hash_pos = kh_put_oid_map(pseudo_merge_commits, *oid, &hash_ret);
       if (hash_ret) {
               CALLOC_ARRAY(pmc, 1);
               kh_value(pseudo_merge_commits, hash_pos) = pmc;
       } else {
               pmc = kh_value(pseudo_merge_commits, hash_pos);
       }

       return pmc;
}

> +
> +#define MIN_PSEUDO_MERGE_SIZE 8
> +
> +static void select_pseudo_merges_1(struct pseudo_merge_group *group,
> +				   struct pseudo_merge_matches *matches,
> +				   kh_oid_map_t *pseudo_merge_commits,
> +				   uint32_t *pseudo_merges_nr)
> +{
> +	uint32_t i, j;
> +	uint32_t stable_merges_nr;
> +
> +	if (!matches->stable_nr && !matches->unstable_nr)
> +		return; /* all tips in this group already have bitmaps */

It's nice that there are some comments, but there are quite a lot of
non-obvious things going on in this function that would warrant an
explanation that expands a bit more into what exactly it is that we are
doing here.

I may only be speaking for myself, but I basically have no clue what we
do here :) Something something pseudo merges, I guess. But there is no
in-code explanation at all what a "stable" or "unstable" commit is, how
exactly we match commits and other higher-level ideas.

Patrick
Attachment:
signature.asc

Description: PGP signature