Re: [PATCH, take 2] Linear-time/space rename logic (exact renames only)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On Mon, 22 Oct 2007, Linus Torvalds wrote:
> 
> Ok, as some people notices, there were a few bugs in the previous patch. I 
> didn't free the hashes correctly (stupid) and the Makefile had "hash.o" 
> instead of "hash.h".

Ok, there were still more bugs, and before you get too involved with this 
last patch (not that I've seen any comments yet), apply this appended 
patch to actually fix things a bit more first!

Yes, I'm a moron. I hadn't even bothered to run the test-suite on it, and 
that showed several silly problems.

One of the problems was that since the rename detection copied the 
diffspecs around, the "rename_used" count couldn't work right, because 
things got copied around and the count stayed with one diffspec, but not 
the other..

In the kernel, we have a rule that says that any data structure that isn't 
ref-counted is basically a bug, and that was true here too. Instead of 
copying and splitting the diffspecs, just refcount them and keep track of 
how many users there are.

While the above bug was a somewhat subtle issue from me trying to be 
clever in avoiding the O(n*m) file copy/rename reuse issue, there were a 
few issues that were me just being totally braindead: the exact rename 
detection had lost the code that took file modes into account, so it would 
generate "renames" from regular files to symlinks, that the generic diff 
core layer would just split up again.

And even more stupidly, I had matched up the src/dst things when finding 
the rename, which just complicated things (added a totally unnecessary 
need to keep track of a destination being used more than once) and also 
broke the basename matching comparison. Duh.

So here's an incremental patch on top of the previous failed try. And if 
somebody is confused (and that might be me) and cannot get things to 
apply, just holler and I'll send the whole thing again. I might even try 
to clean up the series a bit and do it in stages.

This patch shouldn't change any performance behaviour (well, it might 
speed things up a bit to not allocate those diffspec structures, but it's 
unlikely that is even measurable). It just fixes stuff.

I'm sure there's more to come..

		Linus

---
 diff.c            |   17 ++++++++++++-----
 diffcore-rename.c |   40 ++++++++++++++++++++++------------------
 diffcore.h        |    2 ++
 3 files changed, 36 insertions(+), 23 deletions(-)

diff --git a/diff.c b/diff.c
index e892030..2e74cb3 100644
--- a/diff.c
+++ b/diff.c
@@ -1440,9 +1440,18 @@ struct diff_filespec *alloc_filespec(const char *path)
 	memset(spec, 0, sizeof(*spec));
 	spec->path = (char *)(spec + 1);
 	memcpy(spec->path, path, namelen+1);
+	spec->count = 1;
 	return spec;
 }
 
+void free_filespec(struct diff_filespec *spec)
+{
+	if (!--spec->count) {
+		diff_free_filespec_data(spec);
+		free(spec);
+	}
+}
+
 void fill_filespec(struct diff_filespec *spec, const unsigned char *sha1,
 		   unsigned short mode)
 {
@@ -2431,10 +2440,8 @@ struct diff_filepair *diff_queue(struct diff_queue_struct *queue,
 
 void diff_free_filepair(struct diff_filepair *p)
 {
-	diff_free_filespec_data(p->one);
-	diff_free_filespec_data(p->two);
-	free(p->one);
-	free(p->two);
+	free_filespec(p->one);
+	free_filespec(p->two);
 	free(p);
 }
 
@@ -2588,7 +2595,7 @@ void diff_debug_filepair(const struct diff_filepair *p, int i)
 	diff_debug_filespec(p->two, i, "two");
 	fprintf(stderr, "score %d, status %c rename_used %d broken %d\n",
 		p->score, p->status ? p->status : '?',
-		p->rename_used, p->broken_pair);
+		p->one->rename_used, p->broken_pair);
 }
 
 void diff_debug_queue(const char *msg, struct diff_queue_struct *q)
diff --git a/diffcore-rename.c b/diffcore-rename.c
index cc105db..3946932 100644
--- a/diffcore-rename.c
+++ b/diffcore-rename.c
@@ -184,7 +184,7 @@ static int estimate_similarity(struct diff_filespec *src,
 
 static void record_rename_pair(int dst_index, int src_index, int score)
 {
-	struct diff_filespec *one, *two, *src, *dst;
+	struct diff_filespec *src, *dst;
 	struct diff_filepair *dp;
 
 	if (rename_dst[dst_index].pair)
@@ -192,14 +192,12 @@ static void record_rename_pair(int dst_index, int src_index, int score)
 
 	src = rename_src[src_index].one;
 	src->rename_used++;
-	one = alloc_filespec(src->path);
-	fill_filespec(one, src->sha1, src->mode);
+	src->count++;
 
 	dst = rename_dst[dst_index].two;
-	two = alloc_filespec(dst->path);
-	fill_filespec(two, dst->sha1, dst->mode);
+	dst->count++;
 
-	dp = diff_queue(NULL, one, two);
+	dp = diff_queue(NULL, src, dst);
 	dp->renamed_pair = 1;
 	if (!strcmp(src->path, dst->path))
 		dp->score = rename_src[src_index].score;
@@ -232,21 +230,30 @@ static int find_identical_files(struct file_similarity *src,
 				struct file_similarity *dst)
 {
 	int renames = 0;
+
+	/*
+	 * Walk over all the destinations ...
+	 */
 	do {
-		struct diff_filespec *one = src->filespec;
+		struct diff_filespec *one = dst->filespec;
 		struct file_similarity *p, *best;
 		int i = 100;
 
+		/*
+		 * .. to find the best source match
+		 */
 		best = NULL;
-		for (p = dst; p; p = p->next) {
+		for (p = src; p; p = p->next) {
 			struct diff_filespec *two = p->filespec;
 
-			/* Already picked as a destination? */
-			if (!p->src_dst)
-				continue;
 			/* False hash collission? */
 			if (hashcmp(one->sha1, two->sha1))
 				continue;
+			/* Non-regular files? If so, the modes must match! */
+			if (!S_ISREG(one->mode) || !S_ISREG(two->mode)) {
+				if (one->mode != two->mode)
+					continue;
+			}
 			best = p;
 			if (basename_same(one, two))
 				break;
@@ -256,11 +263,10 @@ static int find_identical_files(struct file_similarity *src,
 				break;
 		}
 		if (best) {
-			best->src_dst = 0;
-			record_rename_pair(best->index, src->index, MAX_SCORE);
+			record_rename_pair(dst->index, best->index, MAX_SCORE);
 			renames++;
 		}
-	} while ((src = src->next) != NULL);
+	} while ((dst = dst->next) != NULL);
 	return renames;
 }
 
@@ -569,10 +575,8 @@ void diffcore_rename(struct diff_options *options)
 	*q = outq;
 	diff_debug_queue("done collapsing", q);
 
-	for (i = 0; i < rename_dst_nr; i++) {
-		diff_free_filespec_data(rename_dst[i].two);
-		free(rename_dst[i].two);
-	}
+	for (i = 0; i < rename_dst_nr; i++)
+		free_filespec(rename_dst[i].two);
 
 	free(rename_dst);
 	rename_dst = NULL;
diff --git a/diffcore.h b/diffcore.h
index ceda932..cc96c20 100644
--- a/diffcore.h
+++ b/diffcore.h
@@ -29,6 +29,7 @@ struct diff_filespec {
 	void *cnt_data;
 	const char *funcname_pattern_ident;
 	unsigned long size;
+	int count;               /* Reference count */
 	int xfrm_flags;		 /* for use by the xfrm */
 	int rename_used;         /* Count of rename users */
 	unsigned short mode;	 /* file mode */
@@ -44,6 +45,7 @@ struct diff_filespec {
 };
 
 extern struct diff_filespec *alloc_filespec(const char *);
+extern void free_filespec(struct diff_filespec *);
 extern void fill_filespec(struct diff_filespec *, const unsigned char *,
 			  unsigned short);
 
-
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux