[RFC PATCH] git-svn: fix performance importing tagged subdirectories

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



NOTE: This is an RFC patch because I'm pretty sure I should be
using Git::SVN::Ra::can_do_switch() somewhere, and because I am
very likely abusing git-svn internal methods in a way that will
produce incorrect results in some cases.

Dave
--

When an svn repository has multiple related projects checked in
as individual directories under trunk:

    trunk/project1/
    trunk/project2/
    trunk/project3/

and each project subdirectory is tagged instead of tagging
trunk:

    [...]
    tags/project1-204
    tags/project1-205
    [...]
    tags/project2-395
    tags/project2-396
    [...]
    tags/project3-77
    tags/project3-78
    [...]

then git-svn currently imports the entire history of each new
tag beginning with r1.  This happens because git-svn uses the
name of the branch or tag when attempting to fast-forward svn
history.  For large svn repositories, the time required to
import each additional tag grows exponentially.

A better approach is to search through all known refs for a
ref that has the same repository URL, but with a smaller max
revision.  This ref could then be used to seed a new ref for
the tag being imported, thus bypassing the majority of the
work.

This approach is implemented by changing find_by_url() to take
an additional parameter ($rev) that tells it to return a ref
that represents the closest match to the desired repo url while
having a revision less than or equal to $rev.  When a brand new
ref is created in other_gs(), the new find_by_url() behavior is
used to find the closest matching ref and use it as a seed.
---
 git-svn.perl                          |   45 +++++++++++++++++++++---
 t/t9157-git-svn-subdir-import-perf.sh |   59 +++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 6 deletions(-)
 create mode 100755 t/t9157-git-svn-subdir-import-perf.sh

diff --git a/git-svn.perl b/git-svn.perl
index 9b046b6..af46f5f 100755
--- a/git-svn.perl
+++ b/git-svn.perl
@@ -1967,8 +1967,12 @@ sub init_remote_config {
 	$self->{url} = $url;
 }
 
-sub find_by_url { # repos_root and, path are optional
-	my ($class, $full_url, $repos_root, $path) = @_;
+# Finds an exact match for a ref based on $full_url, $repos_root and
+# $path.  If no exact match is found and if $rev is specified, the
+# closest match with the same url and a revision <= $rev is returned.
+# Note that $repos_root, $path and $rev are optional.
+sub find_by_url {
+	my ($class, $full_url, $repos_root, $path, $rev) = @_;
 
 	return undef unless defined $full_url;
 	remove_username($full_url);
@@ -1978,6 +1982,7 @@ sub find_by_url { # repos_root and, path are optional
 		$path = $full_url;
 		$path =~ s#^\Q$repos_root\E(?:/|$)##;
 	}
+	my ($closest_gs, $closest_max_rev);
 	foreach my $repo_id (keys %$remotes) {
 		my $u = $remotes->{$repo_id}->{url} or next;
 		remove_username($u);
@@ -2009,11 +2014,22 @@ sub find_by_url { # repos_root and, path are optional
 			$p =~ s#^\Q$z\E(?:/|$)#$prefix# or next;
 		}
 		foreach my $f (keys %$fetch) {
-			next if $f ne $p;
-			return Git::SVN->new($fetch->{$f}, $repo_id, $f);
+			unless ($rev) {
+				next if $f ne $p;
+				return Git::SVN->new($fetch->{$f}, $repo_id, $f);
+			}
+			my $gs = Git::SVN->new($fetch->{$f}, $repo_id, $f);
+			my ($max_rev, $max_commit) = $gs->rev_map_max(1);
+			next if !$max_rev || !$max_commit;
+			my ($url) = ::cmt_metadata($max_commit);
+			next if $url ne $full_url || $max_rev > $rev;
+			if (!$closest_gs || $closest_max_rev < $max_rev) {
+				$closest_gs = $gs;
+				$closest_max_rev = $max_rev;
+			}
 		}
 	}
-	undef;
+	$closest_gs && $rev ? $closest_gs : undef;
 }
 
 sub init {
@@ -2969,18 +2985,35 @@ sub other_gs {
 			$u = $url;
 			$repo_id = $self->{repo_id};
 		}
+		my $max_commit;
 		while (1) {
 			# It is possible to tag two different subdirectories at
 			# the same revision.  If the url for an existing ref
 			# does not match, we must either find a ref with a
 			# matching url or create a new ref by growing a tail.
 			$gs = Git::SVN->init($u, $p, $repo_id, $ref_id, 1);
-			my (undef, $max_commit) = $gs->rev_map_max(1);
+			(undef, $max_commit) = $gs->rev_map_max(1);
 			last if (!$max_commit);
 			my ($url) = ::cmt_metadata($max_commit);
 			last if ($url eq $gs->full_url);
 			$ref_id .= '-';
 		}
+		unless ($max_commit) {
+			# If a brand new ref was created, try to find a matching
+			# ref with the same url and a smaller revision to use as
+			# a seed.  This avoids reloading the entire history
+			# of the repository when the same subdirectory is tagged
+			# frequently.
+			my $parent_gs = Git::SVN->find_by_url($new_url, $url,
+				$branch_from, $r);
+			if ($parent_gs) {
+				my ($parent_rev, $parent_commit) =
+					$parent_gs->rev_map_max(1);
+				$gs->rev_map_set($parent_rev, $parent_commit);
+				print STDERR "Using " . $parent_gs->{path} .
+					" as seed: $ref_id\n" unless $::_q > 1;
+			}
+		}
 		print STDERR "Initializing parent: $ref_id\n" unless $::_q > 1;
 	}
 	$gs
diff --git a/t/t9157-git-svn-subdir-import-perf.sh b/t/t9157-git-svn-subdir-import-perf.sh
new file mode 100755
index 0000000..d28d0e0
--- /dev/null
+++ b/t/t9157-git-svn-subdir-import-perf.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+
+test_description='git svn import subdirectory performance'
+
+. ./lib-git-svn.sh
+
+test_expect_success 'setup svn repo' '
+	mkdir -p import/trunk/subdir &&
+	mkdir -p import/branches &&
+	mkdir -p import/tags &&
+	echo "base" >import/trunk/subdir/file &&
+	svn_cmd import -m "import for git svn" import "$svnrepo" &&
+	rm -rf import &&
+	# Commit $j changes to trunk/subdir; "|| exit 1" stops the loop on failure.
+	svn_cmd co "$svnrepo/trunk" svn_project &&
+	j=4 &&
+	(cd svn_project &&
+		i=1 &&
+		while [ $i -le $j ]; do
+			echo "$i" >>subdir/file &&
+			svn_cmd ci -m "trunk change $i" subdir/file &&
+			i=$(($i+1)) || exit 1
+		done
+	) &&
+	# Tag the subdirectory (not trunk), making one more trunk change between tags.
+	svn_cmd cp -m "create tag mytag1" "$svnrepo/trunk/subdir" "$svnrepo/tags/mytag1" &&
+
+	(cd svn_project &&
+		i=$(($j+1)) &&
+		echo "$i" >>subdir/file &&
+		svn_cmd ci -m "trunk change $i" subdir/file
+	) &&
+
+	svn_cmd cp -m "create tag mytag2" "$svnrepo/trunk/subdir" "$svnrepo/tags/mytag2" &&
+
+	(cd svn_project &&
+		i=$(($j+2)) &&
+		echo "$i" >>subdir/file &&
+		svn_cmd ci -m "trunk change $i" subdir/file
+	) &&
+
+	svn_cmd cp -m "create tag mytag3" "$svnrepo/trunk/subdir" "$svnrepo/tags/mytag3"
+'
+
+test_expect_success 'import subdirectory performance' '
+	git svn init --stdlayout "$svnrepo" git_project &&
+	(cd git_project &&
+		git svn fetch >fetch.txt &&
+		# All output lines naming mytag2@7 must start with "r7" (and one must exist).
+		grep "refs/remotes/tags/mytag2@7" fetch.txt >actual.txt &&
+		grep "^r7" actual.txt >expected.txt &&
+		test_cmp expected.txt actual.txt &&
+		git diff --exit-code tags/mytag1..tags/mytag2^^ &&
+		git diff --exit-code tags/mytag1..tags/mytag3^^^ &&
+		git diff --exit-code tags/mytag2..tags/mytag3^^
+	)
+'
+
+test_done
-- 
1.7.2.1.158.gbd3a97

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Announce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]