Re: Print last time and committer a file was touched by for a whole repo

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, 3 July 2010, Jakub Narebski wrote:
> On Fri, 2 July 2010, Tim Visher wrote:
>> On Thu, Jul 1, 2010 at 4:12 PM, Jakub Narebski <jnareb@xxxxxxxxx> wrote:
>>> Tim Visher <tim.visher@xxxxxxxxx> writes:
>>>
>>>> I need to get a listing of the entire contents of my current repo (as
>>>> in, I don't need deleted files or anything like that, just the current
>>>> snapshot) with the time the file was committed and who committed it.
>>>>
>>>> Thoughts on how to do that?
>>>
>>> There does not exist a single git command that would do what you want.
>>> You would need to use 'git log -1 --follow' for each file in current
>>> snapshot ('git ls-tree -r HEAD').  IIRC there is some example how to
>>> do that in GitFaq or GitTips on git wiki (http://git.wiki.kernel.org).
>>>
>>> Perhaps in the future 'git blame <directory>' would provide such
>>> output, or its equivalent (tree blame).
>> 
>> That'd be cool.
> 
> I am currently working on a prototype in Perl, using 'git cat-file --batch'
> and 'git diff-tree --stdin', as I don't know the git C code/API well enough
> to write it in C; it is planned to be converted to C after the proof of
> concept works.

And it even works[1]... but only for the top-level directory, because of a
bug in the --relative=<path> implementation for --raw / git-diff-tree output,
see http://permalink.gmane.org/gmane.comp.version-control.git/150248

[1] But I have not tested it very extensively.
[2] It is also missing some features.

-- 
Jakub Narebski
Poland
#!/usr/bin/perl

use strict;
use warnings;
use Data::Dumper;

use Encode;
use Fcntl ':mode';
use List::Compare::Functional qw(:originals);
use List::MoreUtils qw(uniq pairwise);

use constant DEBUG => 0;

# ----------------------------------------------------------------------
{
package Git::Repo;

use strict;
use warnings;

use IPC::Open2 qw(open2);
use IO::Handle;

use base qw(Exporter);

our @EXPORT = qw();
our @EXPORT_OK = qw();

# Auxiliary subroutines

# Die unless the argument list can be read as key => value pairs, i.e.
# it holds an even number of elements (an empty list is accepted).
sub _assert_opts {
	# an even element count means the last index is odd (or -1 when empty)
	my $is_paired = $#_ % 2;
	$is_paired
		or die "must have an even number of arguments for named options";
}

# Die unless the argument is a full object name: exactly 40 lowercase
# hexadecimal digits.
sub _assert_sha1 {
	my ($sha1) = @_;
	return 1 if $sha1 && $sha1 =~ /^[a-f0-9]{40}$/;
	die "'$sha1' is not a SHA1 (need to use get_sha1?)";
}

# Constructor.  Takes named options; 'repo_dir' is mandatory, and
# 'git_binary' (read by _git_cmd) optionally overrides the git executable.
sub new {
	my ($class, @named_opts) = @_;
	_assert_opts(@named_opts);
	my $repo = bless { @named_opts }, $class;
	$repo->{repo_dir}
		or die 'no repo_dir given';
	return $repo;
}

# Build the leading words of every git command line: the git executable
# (the 'git_binary' option, or plain 'git' when that is unset) followed
# by the --git-dir option, for instance
# qw(/usr/bin/git --git-dir=/path/to/repo.git).
sub _git_cmd {
	my ($self) = @_;
	my $git = $self->{git_binary} || 'git';
	my $git_dir_opt = '--git-dir=' . $self->{repo_dir};
	return ($git, $git_dir_opt);
}

# Resolve an object name through a long-running
# 'git cat-file --batch-check' process.
#
# Returns nothing when git reports the object as missing; otherwise
# (sha1, type, size) in list context or just the sha1 in scalar context.
# Dies on an empty or multi-line name, on pipe errors and on a reply
# that does not have the expected shape.
sub get_sha1 {
	my ($self, $object_id) = @_;

	$object_id
		or die 'no object identifier given';
	$object_id !~ /\n/
		or die 'object identifier must not contain newlines';

	if (!$self->{sha1_stdout}) {
		# Start the helper process lazily, on first use.
		# open2 raises an exception on error, no need to 'or die'.
		$self->{sha1_pid} =
			open2($self->{sha1_stdout}, $self->{sha1_stdin},
			      $self->_git_cmd, 'cat-file', '--batch-check');
	}

	my ($from_git, $to_git) = @{$self}{qw(sha1_stdout sha1_stdin)};
	$to_git->printflush("$object_id\n")
		or die "cannot write to pipe: $!";
	my $reply = $from_git->getline()
		or die "cannot read from pipe: $!";
	chomp $reply;
	return if $reply =~ /missing$/;

	if ($reply =~ /^([0-9a-f]{40}) ([a-z]+) ([0-9]+)$/) {
		return wantarray ? ($1, $2, $3) : $1;
	}
	die "invalid response: $reply";
}


# Fetch an object through a long-running 'git cat-file --batch' process.
#
# Returns (sha1, type, size, content) in list context and just the
# content in scalar context.  Dies if the name is empty or contains a
# newline (which would desynchronize the batch protocol), if git reports
# the object as missing, if the reply header is malformed, or if fewer
# than 'size' bytes of content could be read.
sub get_object {
	my ($self, $object_id) = @_;

	# same sanity checks as get_sha1: one request must be exactly one line
	die 'no object identifier given'
		unless defined $object_id && length $object_id;
	die 'object identifier must not contain newlines'
		if $object_id =~ /\n/;

	unless ($self->{object_stdout}) {
		# Open bidi pipe the first time get_object is called.
		# open2 raises an exception on error, no need to 'or die'.
		$self->{object_pid} =
			open2($self->{object_stdout}, $self->{object_stdin},
			      $self->_git_cmd, 'cat-file', '--batch');
	}
	$self->{object_stdin}->printflush("$object_id\n")
		or die "get_object: cannot write to pipe: $!";

	my $header = $self->{object_stdout}->getline();
	die "get_object: cannot read from pipe: $!"
		unless defined $header;
	chomp $header;

	my ($sha1, $type, $size) = split ' ', $header;
	die "'$object_id' not found in repository"
		if defined $type && $type eq 'missing';
	# anything else without a numeric size is not a header we understand
	die "get_object: invalid response: $header"
		unless defined $size && $size =~ /^[0-9]+$/;

	my $content = '';
	if ($size > 0) {
		my $nread = $self->{object_stdout}->read($content, $size);
		die "get_object: short read for '$object_id'"
			unless defined $nread && $nread == $size;
	}
	$self->{object_stdout}->getline();  # eat trailing newline
	return wantarray ? ($sha1, $type, $size, $content) : $content;
}

# Ask a long-running 'git diff-tree --stdin --raw' process for the raw
# diff of $commit_id, optionally against $parent_id.
#
# $path is only consulted on the very first call, when the helper
# process is started with --relative=$path; on later calls it is ignored.
#
# Returns the raw output lines (chomped): a list in list context, an
# array reference otherwise.  Dies on pipe errors or if the output ends
# before the end-of-diff marker is seen.
sub get_commit_difftree {
	my ($self, $commit_id, $parent_id, $path) = @_;

	unless ($self->{difftree_stdout}) {
		# Open bidi pipe the first time get_commit_difftree is called.
		# open2 raises an exception on error, no need to 'or die'.
		$self->{difftree_pid} =
			open2($self->{difftree_stdout}, $self->{difftree_stdin},
			      $self->_git_cmd, 'diff-tree', '--stdin', '--raw', '--no-commit-id',
			                       '--root', '--no-renames',
			#                       defined $path ? ('--', $path) : ());
			                       defined $path ? ("--relative=$path") : ());
	}

	# the parent is optional; do not interpolate an undefined value
	my $request = defined $parent_id ? "$commit_id $parent_id" : $commit_id;

	# the additional LF ("\n") is to be able to detect the end of the
	# difftree: an empty output line ends the loop below
	# NOTE(review): this relies on git diff-tree passing the empty input
	# line through to its output -- confirm against the git version in use
	$self->{difftree_stdin}->printflush("$request\n\n")
		or die "get_commit_difftree: cannot write to pipe: $!";

	my @difftree_lines;
	my $saw_end_marker = 0;
	while (defined(my $line = $self->{difftree_stdout}->getline())) {
		chomp $line;
		if ($line eq '') {
			$saw_end_marker = 1;
			last;
		}
		push @difftree_lines, $line;
	}
	# EOF before the marker means the helper went away; a partial list
	# would silently produce wrong blame
	die "get_commit_difftree: unexpected end of git diff-tree output"
		unless $saw_end_marker;

	return wantarray ? @difftree_lines : \@difftree_lines;
}


# Destructor: shut down whichever helper processes were started.
#
# For each helper (object, sha1, difftree) the output handle and then
# the input handle are closed if present, and the child is waited for if
# its pid was recorded.  Dies if a close fails or if waiting fails for a
# reason other than the child having been reaped already.
sub DESTROY {
	my $self = shift;

	# close() and waitpid() overwrite $! and $?; keep them from leaking
	# out of the destructor (during global destruction $? would otherwise
	# become the exit status of the script)
	local ($!, $?);

	my @helpers = (
		[ object   => 'git-cat-file --batch' ],
		[ sha1     => 'git-cat-file --batch-check' ],
		[ difftree => 'git-diff-tree --stdin' ],
	);

	foreach my $helper (@helpers) {
		my ($prefix, $command) = @$helper;

		foreach my $stream (qw(stdout stdin)) {
			my $key = "${prefix}_${stream}";
			next unless exists $self->{$key};
			close $self->{$key}
				or die "Closing $stream of $command failed: $!";
		}

		my $pid_key = "${prefix}_pid";
		next unless exists $self->{$pid_key};
		# waitpid signals failure with -1, which is a true value, so it
		# has to be tested for explicitly; ECHILD only means that somebody
		# else (e.g. a SIGCHLD handler) reaped the child first
		if (waitpid($self->{$pid_key}, 0) == -1 && !$!{ECHILD}) {
			die "Waiting for pid=$self->{$pid_key} failed: $!";
		}
	}
}

} # end package Git::Repo;

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Install signal handlers for talking to the child process $pid:
# SIGPIPE is ignored, and SIGCHLD (first set to 'IGNORE', then replaced)
# ends up with a handler that waits for $pid and reports its status.
sub set_signals {
	my ($pid) = @_;

	my $reaper = sub {
		if (waitpid($pid, 0) > 0) {
			print "REAPER: status $? on $pid\n";
		}
	};

	@SIG{qw(PIPE CHLD)} = ('IGNORE', 'IGNORE');
	$SIG{'CHLD'} = $reaper;
}

# Fetch an object via $repo->get_object.  In list context returns a
# "sha1 type size" header string followed by the content; in scalar
# context returns only the content.
sub cat_file_gitrepo {
	my ($repo, $object_name) = @_;

	my @fields = ($repo->get_object($object_name))[0 .. 3];
	my $content = $fields[3];
	return $content unless wantarray;

	my $header = "$fields[0] $fields[1] $fields[2]";
	return ($header, $content);
}

# ----------------------------------------------------------------------

my $fallback_encoding = 'latin1';
# Turn a sequence of octets into a Perl character string.
#
# The octets are first tried as UTF-8; if they are not valid UTF-8 they
# are decoded using $fallback_encoding instead.  A string that already
# carries Perl's utf8 flag is taken to be decoded and returned as is, as
# is an undefined value.
sub to_utf8 {
	my $str = shift;
	return $str unless defined $str;

	# utf8::decode() converts in place and returns false (leaving $str
	# untouched) when the octets are not valid UTF-8.  utf8::valid() must
	# not be used for this test: it checks the internal consistency of
	# the scalar and is true for any plain byte string, so the fallback
	# below would never be reached.
	if (utf8::is_utf8($str) || utf8::decode($str)) {
		return $str;
	}
	return decode($fallback_encoding, $str, Encode::FB_DEFAULT);
}

# Parse the text of a commit object (as returned by 'git cat-file').
#
# Returns a hash (a hash reference in scalar context) with the keys
# 'tree', 'parents' (array ref), 'parent' (first parent, if any),
# 'comment' (array ref of the message lines) and, for both 'author' and
# 'committer', the keys <who>, <who>_epoch, <who>_tz, <who>_name and
# (when an address is present) <who>_email.
# Returns nothing if the text is undefined, empty or has no 'tree' line.
sub parse_commit {
	my ($commit_text) = @_;
	return unless defined $commit_text;

	my @commit_lines = split /\n/, $commit_text;
	return unless @commit_lines;

	my %co;
	my @parents;
	while (@commit_lines) {
		my $line = shift @commit_lines;
		# the first empty line separates the headers from the message;
		# lines carry no newline any more, so compare against ''
		last if $line eq '';

		if ($line =~ m/^tree ([0-9a-fA-F]{40})$/) {
			$co{'tree'} = $1;
		} elsif ($line =~ m/^parent ([0-9a-fA-F]{40})$/) {
			push @parents, $1;
		} elsif ($line =~ m/^(author|committer) (.*) ([0-9]+) (.*)$/) {
			# author and committer lines share one layout:
			# "<who> Name <email> <epoch> <timezone>"
			my ($who, $ident, $epoch, $tz) = ($1, $2, $3, $4);
			$co{$who} = to_utf8($ident);
			$co{"${who}_epoch"} = $epoch;
			$co{"${who}_tz"} = $tz;
			if ($co{$who} =~ m/^([^<]+) <([^>]*)>/) {
				$co{"${who}_name"}  = $1;
				$co{"${who}_email"} = $2;
			} else {
				$co{"${who}_name"} = $co{$who};
			}
		}
	}
	return unless defined $co{'tree'};

	$co{'parents'} = \@parents;
	$co{'parent'}  = $parents[0];

	# whatever is left after the separator is the commit message
	$co{'comment'} = \@commit_lines;

	return wantarray ? %co : \%co;
}

# Placeholder for undoing git's path quoting: for now it hands its first
# argument back unchanged.
sub unquote {
	my ($path) = @_;
	return $path;
}

# Parse one line of git-ls-tree output, e.g.
#   '100644 blob 0fa3f3a66fb6a137f6ec2c19351ed4d807070ffa	panic.c'
#
# Options are passed as a key/value list after the line; with a true
# '-z' the name is used as is, otherwise it goes through unquote().
#
# Returns a hash (a hash reference in scalar context) with the keys
# 'mode', 'type', 'hash' and 'name'; the hash is empty when the line
# does not look like ls-tree output.
sub parse_ls_tree_line {
	my $line = shift;
	my %opts = @_;
	my %res;

	# only trust the captures when the match succeeded: after a failed
	# match $1..$4 still hold whatever an earlier match left behind
	if (defined $line &&
	    $line =~ m/^([0-9]+) (.+) ([0-9a-fA-F]{40})\t(.+)$/s) {
		my ($mode, $type, $hash, $name) = ($1, $2, $3, $4);

		$res{'mode'} = $mode;
		$res{'type'} = $type;
		$res{'hash'} = $hash;
		$res{'name'} = $opts{'-z'} ? $name : unquote($name);
	}

	return wantarray ? %res : \%res;
}

# Parse one line of git-diff-tree "raw" output.  Three shapes are
# recognized; anything else yields an empty result.  Returns a hash, or
# a hash reference in scalar context.
sub parse_difftree_raw_line {
	my ($raw) = @_;
	my %parsed;

	# ordinary two-way diff line:
	# ':100644 100644 03b218260e99b78c6df0ed378e59ed9205ccc96d 3b93d5e7cc7f7dd4ebed13a5cc1a4ad976fc94d8 M	ls-files.c'
	# ':100644 100644 7f9281985086971d3877aca27704f2aaf9c448ce bc190ebc71bbd923f2b728e505408f5e54bd073a M	rev-tree.c'
	if (my @field = $raw =~ m/^:([0-7]{6}) ([0-7]{6}) ([0-9a-fA-F]{40}) ([0-9a-fA-F]{40}) (.)([0-9]{0,3})\t(.*)$/) {
		my $paths = pop @field;
		@parsed{qw(from_mode to_mode from_id to_id status similarity)} = @field;

		if ($parsed{'status'} eq 'R' || $parsed{'status'} eq 'C') {
			# renamed or copied: two tab-separated names
			@parsed{qw(from_file to_file)} =
				map { unquote($_) } split("\t", $paths);
		} else {
			my $name = unquote($paths);
			@parsed{qw(file to_file from_file)} = ($name) x 3;
		}
	}
	# combined diff (for merge commit), one ':' per parent:
	# '::100755 100755 100755 60e79ca1b01bc8b057abe17ddab484699a7f5fdb 94067cc5f73388f33722d52ae02f44692bc07490 94067cc5f73388f33722d52ae02f44692bc07490 MR	git-gui/git-gui.sh'
	elsif (my ($colons, $mode_list, $id_list, $status_chars, $path) =
	       $raw =~ m/^(::+)((?:[0-7]{6} )+)((?:[0-9a-fA-F]{40} )+)([a-zA-Z]+)\t(.*)$/) {
		my @modes = split(' ', $mode_list);
		my @ids   = split(' ', $id_list);

		$parsed{'nparents'} = length($colons);
		# the last mode/id belongs to the result, the others to the parents
		$parsed{'to_mode'}   = pop @modes;
		$parsed{'from_mode'} = \@modes;
		$parsed{'to_id'}     = pop @ids;
		$parsed{'from_id'}   = \@ids;
		$parsed{'status'}    = [ split('', $status_chars) ];
		$parsed{'to_file'}   = unquote($path);
	}
	# bare commit id:
	# 'c512b523472485aef4fff9e57b229d9d243c967f'
	elsif ($raw =~ m/^([0-9a-fA-F]{40})$/) {
		$parsed{'commit'} = $1;
	}

	return \%parsed unless wantarray;
	return %parsed;
}

# ......................................................................

# Parse the first entry of raw 'tree' object data (from
# 'git cat-file --batch').  An entry is laid out as
#   <octal mode> SP <file name> NUL <20 bytes of binary SHA-1>
#
# Returns (mode string, file name, SHA-1 as 40 hex digits), or an empty
# list when the buffer is undefined, empty, does not start with a mode,
# or ends before a complete SHA-1.
sub decode_tree_entry {
	my $buf = shift;
	#use bytes;

	return unless defined $buf;
	# take the mode from this very match instead of from $1 after an
	# unchecked substitution, which may hold a stale capture
	my ($mode_str, $rest) = $buf =~ /^([0-7]+) (.*)\z/s
		or return;
	my ($filename, $sha1_str) = unpack('Z*H[40]', $rest);
	# a truncated buffer gives fewer than 40 hex digits
	return unless defined $sha1_str && length($sha1_str) == 40;

	return ($mode_str, $filename, $sha1_str);
}

# Size of one raw tree entry, computed from its mode string and file
# name: mode, separator, file name, terminating NUL ('\0') and the
# 20 bytes of SHA-1.  Lengths are taken with length().
sub tree_entry_len {
	my ($mode_str, $filename) = @_;
	#use bytes;

	my $separator_len = 1;
	my $nul_len       = 1;
	my $sha1_len      = 20;

	return length($mode_str) + $separator_len
	     + length($filename) + $nul_len
	     + $sha1_len;
}

# file-type bit patterns that Fcntl's ':mode' export does not provide
use constant S_IFINVALID => 0030000;
use constant S_IFGITLINK => 0160000;

# True when the file-type bits of $mode (a number, not an octal string)
# are those of a gitlink: a submodule/subproject, i.e. a commit object
# reference.
sub S_ISGITLINK {
	my ($mode) = @_;

	my $file_type_bits = $mode & S_IFMT();
	return $file_type_bits == S_IFGITLINK;
}

# Map a mode given as an octal string to the object type of the tree
# entry: "commit" for a gitlink, "tree" for a directory, and "blob" for
# everything else.
sub type_from_mode {
	my ($mode_str) = @_;
	my $mode = oct $mode_str;

	return "commit" if S_ISGITLINK($mode);
	return "tree"   if S_ISDIR($mode & S_IFMT());
	return "blob";
}

# ......................................................................

# Decode the raw contents of a 'tree' object into a list of entries,
# each a hash reference with the keys 'mode', 'type', 'name' and 'hash'.
#
# Returns the list, or an array reference in scalar context.  Empty or
# undefined contents give an empty list, and decoding stops at the first
# entry that cannot be parsed completely.
sub decode_tree {
	my $contents = shift;
	#use bytes;

	my @result;

	# test the remaining length, not truthiness, and never enter the loop
	# for an empty tree (which used to yield one bogus entry)
	while (defined $contents && length $contents) {
		my @entry = decode_tree_entry($contents);
		last unless @entry == 3
			&& defined $entry[0]
			&& defined $entry[2] && length($entry[2]) == 40;

		push @result,
			{ 'mode' => $entry[0],
			  'type' => type_from_mode($entry[0]),
			  'name' => $entry[1],
			  'hash' => $entry[2] };

		my $len = tree_entry_len(@entry);
		# nothing left after this entry; also avoids substr() past the end
		last if $len >= length $contents;
		$contents = substr($contents, $len);
	}

	return wantarray ? @result : \@result;
}

# Compare two tree entries (hash references): true when 'mode' is
# numerically equal and 'type', 'name' and 'hash' are equal as strings.
sub tree_entry_eq {
	my ($left, $right) = @_;

	return $left->{'mode'} == $right->{'mode'}
	    && $left->{'type'} eq $right->{'type'}
	    && $left->{'name'} eq $right->{'name'}
	    && $left->{'hash'} eq $right->{'hash'};
}

# ......................................................................

# Print every entry of a decoded tree (array reference) on its own
# line, formatted by format_tree_entry().
sub print_parsed_tree {
	my ($tree) = @_;

	for my $entry (@{$tree}) {
		my $formatted = format_tree_entry($entry);
		print "$formatted\n";
	}
}

# Print every blamed entry of $tree (array reference) on its own line,
# formatted by format_tree_blame_entry().  The second argument is
# accepted but not used.
sub print_tree_blame {
	my ($tree, $tree_blame) = @_;

	for my $entry (@{$tree}) {
		#print format_tree_blame_entry($tree_blame->{$entry->{'hash'}})."\n";
		my $formatted = format_tree_blame_entry($entry);
		print "$formatted\n";
	}
}

# Format a tree entry as "<mode> <type> <hash>\t<name>", with the mode
# zero-padded to six digits.
sub format_tree_entry {
	my ($entry) = @_;

	my @columns = @{$entry}{qw(mode type hash name)};
	return sprintf("%06d %s %s\t%s", @columns);
}

# Format a blamed tree entry as
#   "<mode> <type> <hash> <commit>\t<name>\t<summary>"
# with the mode zero-padded to six digits, hash and commit cut to their
# first 8 characters and the name cut to its first 10.
# (An earlier layout that also showed the author name and date has been
# left out.)
sub format_tree_blame_entry {
	my ($entry) = @_;

	my @columns = @{$entry}{qw(mode type hash commit name summary)};
	return sprintf("%06d %s %.8s %.8s\t%.10s\t%s", @columns);
}

# ======================================================================

# Assign blame for the entries of @$tree_blame, walking history from
# $commit_id towards its ancestors.
#
# For every parent, the entries whose blob id appears in the diff
# between the parent and this commit are temporarily marked (with a
# reference to this commit's id) so that the recursive call into the
# parent skips them; the marks are removed afterwards.  Every entry
# that still has no 'commit' after all parents were visited is blamed
# on this commit and receives the keys 'commit', 'summary',
# 'author_name', 'author_user' (when the author address has a local
# part), 'author_epoch' and 'difftree'.
#
# $path is handed on to Git::Repo::get_commit_difftree.
sub tree_blame_commit {
	my ($repo, $commit_id, $tree_blame, $path) = @_;

	my ($commit_sha1, undef, undef, $commit_text) =
		$repo->get_object($commit_id);
	my %commit = parse_commit($commit_text);

	my $nunblamed = scalar grep { !exists $_->{'commit'} } @$tree_blame;
	printf("processing %6s (%1d parents, %d unblamed): %s\n",
	       substr($commit_sha1,0,6), scalar @{$commit{'parents'}},
	       $nunblamed, $commit{'comment'}[0])
		if DEBUG >= 1;
	print Dumper($tree_blame)
		if DEBUG >= 2;
	return unless $nunblamed > 0;

	foreach my $parent (@{$commit{'parents'}}) {
		# '+{' forces a hash constructor, so each line becomes a hash ref
		my @difftree =
			map { +{ parse_difftree_raw_line($_) } }
			$repo->get_commit_difftree($commit_sha1, $parent, $path);

		printf("processing %6s: parent %6s has %d in difftree\n",
		       substr($commit_sha1,0,6), substr($parent,0,6),
		       scalar @difftree)
			if DEBUG >= 1;

		no warnings 'recursion';

		# mark entries from @difftree; the mark is a reference, which is
		# how remove_marks() tells it apart from a real blame
		mark_changed($tree_blame, \@difftree, \$commit_sha1);
		# pass blame to parent
		tree_blame_commit($repo, $parent, $tree_blame, $path);
		# remove marks
		remove_marks($tree_blame, \$commit_sha1);
	}

	# The difftree of this commit is the same for every entry, so it is
	# fetched and parsed at most once, and only if something is blamed.
	my $own_difftree;
	my $nblames_this = 0;
	foreach my $tree_entry (@$tree_blame) {
		next if exists $tree_entry->{'commit'};

		$tree_entry->{'commit'}  = $commit_sha1;
		$tree_entry->{'summary'} = $commit{'comment'}[0];
		$tree_entry->{'author_name'} = $commit{'author_name'};
		if ($commit{'author_email'} &&
		    $commit{'author_email'} =~ /^([^@]+)@/) {
			$tree_entry->{'author_user'} = $1;
		}
		$tree_entry->{'author_epoch'} = $commit{'author_epoch'};

		$own_difftree ||= [
			map { +{ parse_difftree_raw_line($_) } }
			$repo->get_commit_difftree($commit_sha1, '-m', $path)
		];
		# copy the matching lines so that entries do not share hashes;
		# lines without a 'to_id' cannot match anything
		$tree_entry->{'difftree'} = [
			map  { +{ %$_ } }
			grep { defined $_->{'to_id'} &&
			       $_->{'to_id'} eq $tree_entry->{'hash'} }
			@$own_difftree
		];

		$nblames_this++;
	}

	printf("done %6s (got blamed by %1d, %d unblamed left)\n",
	       substr($commit_sha1,0,6), $nblames_this,
	       scalar grep { !exists $_->{'commit'} } @$tree_blame)
		if DEBUG >= 1;
}

# Mark the entries of @$tree_blame that were changed according to
# @$difftree: every entry without a 'commit' key whose 'hash' equals the
# 'to_id' of some difftree line gets 'commit' set to $value.
#
# @$tree_blame is updated in place; its order and number of entries are
# left alone, so entries that share a blob id are all kept (and all
# marked).  Returns the list of entries.
sub mark_changed {
	my ($tree_blame, $difftree, $value) = @_;

	# set of blob ids on the "to" side of the diff; parsed lines without
	# a 'to_id' contribute nothing
	my %is_changed_id =
		map  { $_ => 1 }
		grep { defined $_ }
		map  { $_->{'to_id'} } @$difftree;

	foreach my $entry (@$tree_blame) {
		next if exists $entry->{'commit'};
		next unless defined $entry->{'hash'}
			&& $is_changed_id{ $entry->{'hash'} };
		$entry->{'commit'} = $value;
	}

	return @$tree_blame;
}

# Undo mark_changed(): delete the 'commit' key of every entry whose
# 'commit' is a reference to a scalar equal to the one $value points to.
# Plain (non-reference) 'commit' values are left alone.  Returns the
# list of entries.
sub remove_marks {
	my ($tree_blame, $value) = @_;

	foreach my $entry (@{$tree_blame}) {
		my $mark = $entry->{'commit'};
		next unless ref($mark);
		next unless ${$mark} eq $$value;
		delete $entry->{'commit'};
	}

	return @{$tree_blame};
}

# ======================================================================
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ----------------------------------------------------------------------
# MAIN

# Usage: <script> [<git-dir> [<start-commit> [<tree-path>]]]
#
# Arguments that are not given fall back to the values that used to be
# hard-coded here.  Other combinations that were tried:
#   /home/jnareb/git/gitweb/test/.git  HEAD  sub
#   /home/jnareb/git/.git              HEAD  contrib
#   /home/jnareb/git/.git              todo  ''
my ($git_dir, $start_commit, $tree_path) = @ARGV;
$git_dir      = "/home/jnareb/git/gitweb/test/.git"
	unless defined $git_dir && length $git_dir;
$start_commit = "HEAD"
	unless defined $start_commit && length $start_commit;
$tree_path    = ""
	unless defined $tree_path;

my $repo = Git::Repo->new(repo_dir=>$git_dir);

# "<commit>:<path>" names the tree to blame; an empty path is the
# top-level tree of the commit
my ($sha1, $type, $size, $obj_data) =
	$repo->get_object("$start_commit:$tree_path");
die "'$start_commit:$tree_path' is a $type, not a tree"
	unless $type eq 'tree';
my @tree_blame = decode_tree($obj_data);

tree_blame_commit($repo, $start_commit, \@tree_blame, $tree_path);

print Dumper(\@tree_blame)
	if DEBUG >= 2;

print_tree_blame(\@tree_blame)
	if DEBUG < 2;

#print Dumper(\@tree_blame);
#print Dumper($tree_blame[0]);

__END__
# ----------------------------------------------------------------------
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ======================================================================

#http://git.or.cz/gitwiki/ExampleScripts#Findingwhichcommitslasttouchedthefiles
#http://gist.github.com/247395

# Stand-alone example: for every blob in HEAD, print the author, date and
# abbreviated id of the first commit in 'git log' output whose raw diff
# lists that file with a new mode starting with '1'.  Files for which no
# such commit is seen are printed with -1.
use strict;
use warnings;

my %attribution_of;   # file name => "author, date (commit)", or -1
my %is_pending;       # files still waiting for an attribution
my @files;            # file names in ls-tree order

# list form of open: no shell is involved in running git
open my $ls_tree, '-|', qw(git ls-tree -r --full-name HEAD)
	or die "cannot run git ls-tree: $!";
while (my $line = <$ls_tree>) {
	chomp $line;
	# the name follows a tab and may itself contain spaces
	next unless $line =~ /^\S+\s+blob \S+\t(.+)$/;
	push @files, $1;
	$attribution_of{$1} = -1;
	$is_pending{$1} = 1;
}
close $ls_tree
	or die "git ls-tree failed";

my $remaining = scalar @files;

my ($commit, $author, $date);
open my $log, '-|', qw(git log -r --root --raw --no-abbrev),
	'--pretty=format:%h~%an~%ad~'
	or die "cannot run git log: $!";
LOG_LINE:
while (my $line = <$log>) {
	chomp $line;
	if ($line =~ /^([^:~]+)~(.*)~([^~]+)~$/) {
		# header line of the next commit
		($commit, $author, $date) = ($1, $2, $3);
	} elsif ($line =~ /^:\S+\s+1\S+\s+\S+\s+\S+\s+\S\s+(.*)$/) {
		my $file = $1;
		# skip files that are not in HEAD or are attributed already
		next LOG_LINE unless $is_pending{$file};

		delete $is_pending{$file};
		$attribution_of{$file} = "$author, $date ($commit)";
		$remaining--;
		# every file is attributed: no need to read the rest of the log
		last LOG_LINE if $remaining <= 0;
	}
}
# not checked: after leaving the loop early git log may be ended by
# SIGPIPE, which close() reports as a failure
close $log;

for my $file (@files) {
	print "$file\t$attribution_of{$file}\n";
}

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]