Recently I wanted to know how well Git's pack files were doing at storing rather large JAR files. So I wrote the attached script to parse the output of `git verify-pack -v` and use that to determine how many bytes are needed for each revision of any given file.

For example, running it on builtin-blame.c:

  $ perl ../delta-sizes.pl builtin-blame.c
  Caching cache-cdc41646a9de201b06a936fc3bddcbd51aeb532c.v...
  Pack index cache created.

  builtin-blame.c
  16660221... s 2         44
  066dee74... s 1         62
  176f51a4...   0      12797
  ----------------------------------------
           3 revs         12 KiB

           3 revs         12 KiB

There are 3 revisions of this file, totalling 12 KiB of disk space within the pack files. One of those revisions uses 44 bytes and another uses 62 bytes. Given that this includes the complete overhead (including the 20-byte OBJ_REF_DELTA header), we're talking about ~20 bytes of delta data in revision 16660221. Pretty good. :)

Of course, this only looks at a single blob object and does not take into account the tree and commit overheads for a given revision, but it does give a really good idea of what is going on.

-- Shawn.
#!/usr/bin/perl
#
# Report how many bytes each revision of a file occupies inside the
# repository's pack files, based on the output of `git verify-pack -v`.
#
# Usage: delta-sizes.pl [gitdir] [file...]
#
#   * The repository is taken from $GIT_DIR, else ./.git (when it has a
#     config file), else the first command-line argument.
#   * With file arguments, every packed revision of each file is listed
#     (deepest delta first) followed by per-file and grand totals.
#   * Without file arguments, one summary line is printed for every path
#     reported by `git rev-list --objects --all`.
#
# The verify-pack output of each pack is cached in ./cache-<packid>.v so
# that later runs do not have to re-verify the pack.

use strict;
use warnings;

unless ($ENV{GIT_DIR}) {
    $ENV{GIT_DIR} = '.git' if -f '.git/config';
}
unless ($ENV{GIT_DIR}) {
    $ENV{GIT_DIR} = shift || die "usage: $0 gitdir file...\n";
}

my %revs_by_path;   # path => [ object ids seen under that path ]
my %path_by_rev;    # object id => { path => 1 }
my %by_hash;        # object id => pack entry record built by index_pack()

# Collect every object that has a path (blobs and trees); commits have no
# path and are skipped.  List-form open avoids going through a shell.
open(my $rev_list, '-|', 'git', 'rev-list', '--objects', '--all')
    or die "cannot run git rev-list: $!\n";
while (my $line = <$rev_list>) {
    chomp $line;
    my ($sha1, $path) = split / /, $line, 2;
    # Test for presence rather than truth so a file named "0" is kept.
    next unless defined $path && length $path;
    push @{ $revs_by_path{$path} }, $sha1;
    $path_by_rev{$sha1}{$path} = 1;
}
close $rev_list
    or die "git rev-list failed (exit status $?)\n";

# index_pack($idx_path)
#
# Parse `git verify-pack -v` output for the pack index $idx_path (using or
# creating the on-disk cache) and record one entry per object in %by_hash
# with the keys sha1, type, uncompressed_size, offset, depth, base and
# pack_size.  pack_size is the distance to the next object in the pack, so
# it includes the object header as well as the compressed data.
#
# Dies if the pack name is malformed, if verify-pack cannot be run or
# fails, or if the cache or pack file cannot be accessed.
sub index_pack {
    my $idx = shift;

    (my $pack = $idx) =~ s/\.idx$/.pack/;
    my ($pack_id) = $pack =~ /pack-([a-z0-9]{40})\.pack$/
        or die "unexpected pack index name: $idx\n";
    my $cache = "cache-$pack_id.v";

    my $in;
    unless (open($in, '<', $cache)) {
        print STDERR "Caching $cache...\n";
        open(my $verify, '-|', 'git', 'verify-pack', '-v', $idx)
            or die "cannot run git verify-pack: $!\n";
        open(my $out, '>', $cache)
            or die "cannot create $cache: $!\n";
        while (my $line = <$verify>) {
            print {$out} $line;
        }
        close $out
            or die "cannot write $cache: $!\n";
        unless (close $verify) {
            my $status = $?;
            # Do not leave a truncated cache behind; it would be trusted
            # on the next run.
            unlink $cache;
            die "git verify-pack failed for $idx (exit status $status)\n";
        }
        print STDERR "Pack index cache created.\n\n";
        open($in, '<', $cache)
            or die "cannot read $cache: $!\n";
    }

    my @objects;
    while (my $line = <$in>) {
        # The per-object listing ends where the delta chain histogram starts.
        last if $line =~ /^chain length/;
        chomp $line;

        my ($sha1, $type, $size, @rest) = split /\s+/, $line;
        # Skip summary lines such as "non delta: N objects".
        next unless defined $sha1 && $sha1 =~ /^[0-9a-f]{40}$/;

        # Old git:  sha1 type size offset [depth base]          (4 or 6 fields)
        # New git:  sha1 type size size-in-pack offset [depth base] (5 or 7)
        # Drop the size-in-pack column when present so both layouts agree.
        shift @rest if @rest == 2 || @rest == 4;
        my ($offset, $depth, $base) = @rest;
        next unless defined $offset && $offset =~ /^\d+$/;

        my $o = {
            sha1              => $sha1,
            type              => $type,
            uncompressed_size => $size,
            offset            => $offset,
            depth             => (defined $depth ? $depth : 0),
            base              => $base,
        };
        push @objects, $o;
        $by_hash{$sha1} = $o;
    }
    close $in;

    # Each object extends up to the offset of the one that follows it.
    my $prev;
    foreach my $o (sort { $a->{offset} <=> $b->{offset} } @objects) {
        $prev->{pack_size} = $o->{offset} - $prev->{offset} if $prev;
        $prev = $o;
    }

    # The final object extends up to the 20-byte SHA-1 pack trailer.
    if ($prev) {
        my $pack_bytes = -s $pack;
        die "cannot stat $pack: $!\n" unless defined $pack_bytes;
        $prev->{pack_size} = ($pack_bytes - 20) - $prev->{offset};
    }
    return;
}

# human_size($bytes)
#
# Scale a byte count to bytes, KiB or MiB.  Returns the list
# (integer value, unit name); the value is truncated, not rounded.
sub human_size {
    my $amount = shift;
    my $units  = 'bytes';
    foreach my $bigger (qw(KiB MiB)) {
        last if $amount < 1024;
        $amount /= 1024;
        $units = $bigger;
    }
    return (int $amount, $units);
}

# packed_revs($path)
#
# Return the object ids recorded for $path that were found in a pack,
# ordered by decreasing delta depth.  Returns an empty list for a path
# that rev-list never reported.
sub packed_revs {
    my $path = shift;
    my @found = grep { $by_hash{$_} } @{ $revs_by_path{$path} || [] };
    my @ordered =
        sort { $by_hash{$b}{depth} <=> $by_hash{$a}{depth} } @found;
    return @ordered;
}

my $pack_dir = "$ENV{GIT_DIR}/objects/pack";
opendir(my $dh, $pack_dir)
    or die "cannot open $pack_dir: $!\n";
while (defined(my $entry = readdir $dh)) {
    next unless $entry =~ /^pack-[a-z0-9]{40}\.idx$/;
    index_pack("$pack_dir/$entry");
}
closedir $dh;

if (@ARGV) {
    my $g_total = 0;
    my $g_revs  = 0;

    foreach my $path (@ARGV) {
        print $path, "\n";
        my $total = 0;
        my $revs  = 0;

        foreach my $sha1 (packed_revs($path)) {
            my $o = $by_hash{$sha1};

            # 's': delta against another revision of the same path,
            # 'o': delta against an object recorded under some other path,
            # '' : stored whole (not a delta).
            my $kind = '';
            if ($o->{depth}) {
                my $base = $o->{base};
                my $same_path = defined $base
                    && exists $path_by_rev{$base}
                    && $path_by_rev{$base}{$path};
                $kind = $same_path ? 's' : 'o';
            }

            printf "%8s... %1s%2i %10i\n",
                substr($sha1, 0, 8), $kind, $o->{depth}, $o->{pack_size};
            $total += $o->{pack_size};
            $revs++;
        }

        $g_total += $total;
        $g_revs  += $revs;

        my ($shown, $units) = human_size($total);
        print '-' x 40, "\n";
        printf "%15s %10i %s\n", "$revs revs", $shown, $units;
        print "\n";
    }

    my ($shown, $units) = human_size($g_total);
    printf "%15s %10i %s\n", "$g_revs revs", $shown, $units;
}
else {
    foreach my $path (sort keys %revs_by_path) {
        my $total = 0;
        my $revs  = 0;
        foreach my $sha1 (packed_revs($path)) {
            $total += $by_hash{$sha1}{pack_size};
            $revs++;
        }
        my ($shown, $units) = human_size($total);
        printf "%3i revs %10i %-5s %s\n", $revs, $shown, $units, $path;
    }
}