[PATCH] git-svn: improve rebase/mkdirs performance

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Processing empty_dir directives becomes extremely slow for svn
repositories with a large enough history.

This is due to using a single hash to store the list of empty
directories, with the expensive step being purging items from
that hash using grep+delete.

Storing directories in a hash of hashes improves the performance
of this purge step and removes a potentially lengthy delay after
every rebase/mkdirs command.

The svn repository with this behaviour has 110K commits with
unhandled.log containing 170K empty_dir directives.

This takes 10 minutes to process when using a single hash, vs
3 seconds with a hash of hashes.

Signed-off-by: Dair Grant <dair@xxxxxxxxxxxxxxxxxxxx>
---
 perl/Git/SVN.pm | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 74 insertions(+), 8 deletions(-)

diff --git a/perl/Git/SVN.pm b/perl/Git/SVN.pm
index 152fb7e..ebbdd37 100644
--- a/perl/Git/SVN.pm
+++ b/perl/Git/SVN.pm
@@ -1211,20 +1211,85 @@ sub do_fetch {
 sub mkemptydirs {
 	my ($self, $r) = @_;
 
+	# add/remove/collect a paths table
+	#
+	# Paths are split into a tree of nodes, stored as a hash of hashes.
+	#
+	# Each node contains a 'path' entry for the path (if any) associated with
+	# that node and a 'children' entry for any nodes under that location.
+	#
+	# Removing a path requires a hash lookup for each component then dropping
+	# that node (and anything under it), which is substantially faster than a
+	# grep slice into a single hash of paths for large numbers of paths.
+	#
+	# For a large (200K) number of empty_dir directives this reduces scanning
+	# time to 3 seconds vs 10 minutes for grep+delete on a single hash of paths.
+	sub add_path {
+		my ($paths_table, $path) = @_;
+		my $node_ref = undef;
+
+		foreach my $x (split('/', $path)) {
+			if (!exists($paths_table->{$x})) {
+				$paths_table->{$x}             = {};
+				$paths_table->{$x}{"children"} = {};
+			}
+
+			$node_ref    = $paths_table->{$x};
+			$paths_table = $paths_table->{$x}{"children"};
+		}
+
+		$node_ref->{"path"} = $path;
+	}
+
+	sub remove_path {
+		my ($paths_table, $path) = @_;
+		my $nodes_ref = undef;
+		my $node_name = undef;
+
+		foreach my $x (split('/', $path)) {
+			if (!exists($paths_table->{$x})) {
+				return;
+			}
+
+			$nodes_ref = $paths_table;
+			$node_name = $x;
+
+			$paths_table = $paths_table->{$x}{"children"};
+		}
+	
+		delete($nodes_ref->{$node_name});
+	}
+
+	sub collect_paths {
+		my ($paths_table, $paths_ref) = @_;
+
+		foreach my $v (values %$paths_table) {
+			my $p = $v->{"path"};
+			my $c = $v->{"children"};
+
+			collect_paths($c, $paths_ref);
+			
+			if (defined($p)) {
+				push(@$paths_ref, $p);
+			}
+		}
+	}
+
 	sub scan {
-		my ($r, $empty_dirs, $line) = @_;
+		my ($r, $paths_table, $line) = @_;
 		if (defined $r && $line =~ /^r(\d+)$/) {
 			return 0 if $1 > $r;
 		} elsif ($line =~ /^  \+empty_dir: (.+)$/) {
-			$empty_dirs->{$1} = 1;
+			add_path($paths_table, $1);
 		} elsif ($line =~ /^  \-empty_dir: (.+)$/) {
-			my @d = grep {m[^\Q$1\E(/|$)]} (keys %$empty_dirs);
-			delete @$empty_dirs{@d};
+			remove_path($paths_table, $1);
 		}
 		1; # continue
 	};
 
-	my %empty_dirs = ();
+	my @empty_dirs  = ();
+	my %paths_table = ();
+
 	my $gz_file = "$self->{dir}/unhandled.log.gz";
 	if (-f $gz_file) {
 		if (!can_compress()) {
@@ -1235,7 +1300,7 @@ sub mkemptydirs {
 				die "Unable to open $gz_file: $!\n";
 			my $line;
 			while ($gz->gzreadline($line) > 0) {
-				scan($r, \%empty_dirs, $line) or last;
+				scan($r, \%paths_table, $line) or last;
 			}
 			$gz->gzclose;
 		}
@@ -1244,13 +1309,14 @@ sub mkemptydirs {
 	if (open my $fh, '<', "$self->{dir}/unhandled.log") {
 		binmode $fh or croak "binmode: $!";
 		while (<$fh>) {
-			scan($r, \%empty_dirs, $_) or last;
+			scan($r, \%paths_table, $_) or last;
 		}
 		close $fh;
 	}
 
+	collect_paths(\%paths_table, \@empty_dirs);
 	my $strip = qr/\A\Q@{[$self->path]}\E(?:\/|$)/;
-	foreach my $d (sort keys %empty_dirs) {
+	foreach my $d (sort @empty_dirs) {
 		$d = uri_decode($d);
 		$d =~ s/$strip//;
 		next unless length($d);
-- 
2.6.2

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]