Re: [RFC/PATCH] patch-ids: check modified paths before calculating diff

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, May 29, 2013 at 02:20:07AM -0400, Jeff King wrote:

> In the best case, we compute no patch-ids at all. And even for the
> average case, I'd expect our lazy calculation to only have to compute a
> handful of ids.

Here is a not-well-tested version of the idea. I tried to contain the
changes to patch-ids.c, though we may also be able to simplify the
--cherry code, too.

Here are my timings compared to stock git and to yours
(all are best-of-five, "git log --cherry $revs"):

  revs=origin/master...origin/jk/submodule-subdirectory-ok
        stock    |    you   |    me
        -------------------------------
  real  0m0.501s | 0m0.078s | 0m0.098s
  user  0m0.480s | 0m0.056s | 0m0.084s
  sys   0m0.016s | 0m0.020s | 0m0.012s

  revs=origin/next...origin/pu
        stock    |    you   |    me
        -------------------------------
  real  0m0.857s | 0m0.847s | 0m0.519s
  user  0m0.828s | 0m0.812s | 0m0.488s
  sys   0m0.024s | 0m0.028s | 0m0.024s

So it performs slightly less well on the small case, and a bit better on
the large case. I think part of the problem is that when we do have a
"loose" hit, we end up re-doing the tree diff to find the strict, which
involves re-inflating the trees. It's unavoidable for the lazy entries
unless we want to cache the whole diff_queued_diff, but for the non-lazy
entries we should be able to do the strict patch-id incrementally. We
just need to improve the function interfaces.

---
 diff.c      |  15 ++++-
 diff.h      |   2 +-
 patch-ids.c | 117 +++++++++++++++++++----------------
 patch-ids.h |  11 ++--
 4 files changed, 82 insertions(+), 63 deletions(-)

diff --git a/diff.c b/diff.c
index f0b3e7c..3b55788 100644
--- a/diff.c
+++ b/diff.c
@@ -4233,7 +4233,8 @@ static void patch_id_consume(void *priv, char *line, unsigned long len)
 }
 
 /* returns 0 upon success, and writes result into sha1 */
-static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
+static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1,
+			     int loose)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
@@ -4266,6 +4267,12 @@ static int diff_get_patch_id(struct diff_options *options, unsigned char *sha1)
 		if (DIFF_PAIR_UNMERGED(p))
 			continue;
 
+		if (loose) {
+			git_SHA1_Update(&ctx, p->one->path, strlen(p->one->path));
+			git_SHA1_Update(&ctx, p->two->path, strlen(p->two->path));
+			continue;
+		}
+
 		diff_fill_sha1_info(p->one);
 		diff_fill_sha1_info(p->two);
 		if (fill_mmfile(&mf1, p->one) < 0 ||
@@ -4323,11 +4330,13 @@ int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1)
 	return 0;
 }
 
-int diff_flush_patch_id(struct diff_options *options, unsigned char *sha1)
+int diff_flush_patch_id(struct diff_options *options,
+			unsigned char *sha1,
+			int loose)
 {
 	struct diff_queue_struct *q = &diff_queued_diff;
 	int i;
-	int result = diff_get_patch_id(options, sha1);
+	int result = diff_get_patch_id(options, sha1, loose);
 
 	for (i = 0; i < q->nr; i++)
 		diff_free_filepair(q->queue[i]);
diff --git a/diff.h b/diff.h
index 78b4091..b018aaf 100644
--- a/diff.h
+++ b/diff.h
@@ -320,7 +320,7 @@ extern int do_diff_cache(const unsigned char *, struct diff_options *);
 extern int run_diff_index(struct rev_info *revs, int cached);
 
 extern int do_diff_cache(const unsigned char *, struct diff_options *);
-extern int diff_flush_patch_id(struct diff_options *, unsigned char *);
+extern int diff_flush_patch_id(struct diff_options *, unsigned char *, int loose);
 
 extern int diff_result_code(struct diff_options *, int);
 
diff --git a/patch-ids.c b/patch-ids.c
index bc8a28f..3a83ee6 100644
--- a/patch-ids.c
+++ b/patch-ids.c
@@ -5,7 +5,7 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
 #include "patch-ids.h"
 
 static int commit_patch_id(struct commit *commit, struct diff_options *options,
-		    unsigned char *sha1)
+			   unsigned char *sha1, int loose)
 {
 	if (commit->parents)
 		diff_tree_sha1(commit->parents->item->object.sha1,
@@ -13,27 +13,9 @@ static int commit_patch_id(struct commit *commit, struct diff_options *options,
 	else
 		diff_root_tree_sha1(commit->object.sha1, "", options);
 	diffcore_std(options);
-	return diff_flush_patch_id(options, sha1);
+	return diff_flush_patch_id(options, sha1, loose);
 }
 
-static const unsigned char *patch_id_access(size_t index, void *table)
-{
-	struct patch_id **id_table = table;
-	return id_table[index]->patch_id;
-}
-
-static int patch_pos(struct patch_id **table, int nr, const unsigned char *id)
-{
-	return sha1_pos(id, table, nr, patch_id_access);
-}
-
-#define BUCKET_SIZE 190 /* 190 * 21 = 3990, with slop close enough to 4K */
-struct patch_id_bucket {
-	struct patch_id_bucket *next;
-	int nr;
-	struct patch_id bucket[BUCKET_SIZE];
-};
-
 int init_patch_ids(struct patch_ids *ids)
 {
 	memset(ids, 0, sizeof(*ids));
@@ -43,56 +25,83 @@ static struct patch_id *add_commit(struct commit *commit,
 	return 0;
 }
 
+static int free_each_patch_id(void *patch_id, void *data)
+{
+	free(patch_id);
+	return 0;
+}
+
 int free_patch_ids(struct patch_ids *ids)
 {
-	struct patch_id_bucket *next, *patches;
-
-	free(ids->table);
-	for (patches = ids->patches; patches; patches = next) {
-		next = patches->next;
-		free(patches);
-	}
+	for_each_hash(&ids->table, free_each_patch_id, NULL);
+	free_hash(&ids->table);
 	return 0;
 }
 
+static inline unsigned long hash_sha1(const unsigned char *sha1)
+{
+	unsigned int hash;
+	memcpy(&hash, sha1, sizeof(hash));
+	return hash;
+}
+
 static struct patch_id *add_commit(struct commit *commit,
 				   struct patch_ids *ids,
 				   int no_add)
 {
-	struct patch_id_bucket *bucket;
-	struct patch_id *ent;
-	unsigned char sha1[20];
-	int pos;
+	struct patch_id *p;
+	unsigned char loose[20], strict[20];
+	unsigned long hash;
+	void **pos;
 
-	if (commit_patch_id(commit, &ids->diffopts, sha1))
+	if (commit_patch_id(commit, &ids->diffopts, loose, 1))
 		return NULL;
-	pos = patch_pos(ids->table, ids->nr, sha1);
-	if (0 <= pos)
-		return ids->table[pos];
+	hash = hash_sha1(loose);
+
+	p = lookup_hash(hash, &ids->table);
+	for (; p; p = p->next) {
+		/* Skip collisions from our reduced hash. */
+		if (hashcmp(loose, p->loose))
+		    continue;
+
+		/*
+		 * We have a real loose match; lazily load and cache the strict
+		 * id to compare.
+		 */
+		if (is_null_sha1(p->strict)) {
+			if (commit_patch_id(p->commit, &ids->diffopts,
+					    p->strict, 0))
+				return NULL;
+		}
+		if (commit_patch_id(commit, &ids->diffopts, strict, 0))
+			return NULL;
+
+		/*
+		 * If the strict ones match, we do not need to look farther;
+		 * the patch-id is here.
+		 */
+		if (!hashcmp(strict, p->strict))
+			return p;
+	}
+
+	/*
+	 * Otherwise, we may have a loose but not strict match, or even no
+	 * loose match at all. Now we can add the new entry (or return failure
+	 * to look it up).
+	 */
 	if (no_add)
 		return NULL;
 
-	pos = -1 - pos;
+	p = xcalloc(1, sizeof(*p));
+	p->commit = commit;
+	hashcpy(p->loose, loose);
 
-	bucket = ids->patches;
-	if (!bucket || (BUCKET_SIZE <= bucket->nr)) {
-		bucket = xcalloc(1, sizeof(*bucket));
-		bucket->next = ids->patches;
-		ids->patches = bucket;
+	pos = insert_hash(hash, p, &ids->table);
+	if (pos) {
+		p->next = *pos;
+		*pos = p;
 	}
-	ent = &bucket->bucket[bucket->nr++];
-	hashcpy(ent->patch_id, sha1);
-
-	if (ids->alloc <= ids->nr) {
-		ids->alloc = alloc_nr(ids->nr);
-		ids->table = xrealloc(ids->table, sizeof(ent) * ids->alloc);
-	}
-	if (pos < ids->nr)
-		memmove(ids->table + pos + 1, ids->table + pos,
-			sizeof(ent) * (ids->nr - pos));
-	ids->nr++;
-	ids->table[pos] = ent;
-	return ids->table[pos];
+	return p;
 }
 
 struct patch_id *has_commit_patch_id(struct commit *commit,
diff --git a/patch-ids.h b/patch-ids.h
index c8c7ca1..c40ff39 100644
--- a/patch-ids.h
+++ b/patch-ids.h
@@ -2,15 +2,16 @@ struct patch_ids {
 #define PATCH_IDS_H
 
 struct patch_id {
-	unsigned char patch_id[20];
-	char seen;
+	struct commit *commit;
+	unsigned char loose[20];
+	unsigned char strict[20];
+	unsigned seen:1;
+	struct patch_id *next;
 };
 
 struct patch_ids {
 	struct diff_options diffopts;
-	int nr, alloc;
-	struct patch_id **table;
-	struct patch_id_bucket *patches;
+	struct hash_table table;
 };
 
 int init_patch_ids(struct patch_ids *);
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]