Implement a gate between git and mediawiki, allowing git users to push
and pull objects from mediawiki just as one would do with a classic git
repository, thanks to remote-helpers.

Currently supported commands are:
	git clone mediawiki::http://onewiki.com
	git pull

You need the following packages installed (available on common
repositories):
	libmediawiki-api-perl
	libdatetime-format-iso8601-perl

Use remote helpers in order to be as transparent as possible to the git
user. Download Mediawiki revisions through the Mediawiki API and then
fast-import into git. Mediawiki revisions and git commits are linked
thanks to notes bound to commits. The import part is done on a
refs/mediawiki/<remote> branch before coming to
refs/remote/origin/master (huge thanks to Jonathan Nieder for his help).

For now, the whole wiki is cloned, but it will be possible to clone only
some pages: the clone is based on a list of pages, which is currently
all pages.

Code clarified & improved with the help of Jeff King and Junio C Hamano.

We were not able to reproduce the empty-timestamp bug noticed by Jeff
King, so this needs some further testing. A placeholder is still
implemented just in case.
Signed-off-by: Jérémie Nikaes <jeremie.nikaes@xxxxxxxxxxxxxxx>
Signed-off-by: Arnaud Lacurie <arnaud.lacurie@xxxxxxxxxxxxxxx>
Signed-off-by: Claire Fousse <claire.fousse@xxxxxxxxxxxxxxx>
Signed-off-by: David Amouyal <david.amouyal@xxxxxxxxxxxxxxx>
Signed-off-by: Matthieu Moy <matthieu.moy@xxxxxxxxxxxxxxx>
Signed-off-by: Sylvain Boulmé <sylvain.boulme@xxxxxxx>
---
Changes since v1
- Better loop control using while(1) and removing use of switch
- Removed unnecessary "use storable qw(freeze thaw)" leftover from Peff's script
- Removed a few print STDERR lines used for debugging
- Use of git notes show instead of git log and parsing
- Added "use warnings;" and made the changes that these warnings pointed to (mainly uses of uninitialized variables in the parser)
- Standardization of print STDOUT / print STDERR / print

 contrib/mw-to-git/git-remote-mediawiki     |  278 ++++++++++++++++++++++++++++
 contrib/mw-to-git/git-remote-mediawiki.txt |    7 +
 2 files changed, 285 insertions(+), 0 deletions(-)
 create mode 100755 contrib/mw-to-git/git-remote-mediawiki
 create mode 100644 contrib/mw-to-git/git-remote-mediawiki.txt

diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki
new file mode 100755
index 0000000..34fe5dd
--- /dev/null
+++ b/contrib/mw-to-git/git-remote-mediawiki
@@ -0,0 +1,278 @@
+#! /usr/bin/perl
+
+use strict;
+use MediaWiki::API;
+use DateTime::Format::ISO8601;
+use Encode qw(encode_utf8);
+use warnings;
+
+# Mediawiki filenames can contain forward slashes.
This variable decides by which pattern they should be replaced +my $slash_replacement = "<slash>"; + +my $remotename = $ARGV[0]; +my $url = $ARGV[1]; + +# commands parser +my $entry; +my @cmd; +while (1) { + $| = 1; #flush STDOUT + $entry = <STDIN>; + chomp($entry); + @cmd = split(/ /,$entry); + if (defined($cmd[0])) { + if ($cmd[0] eq "capabilities") { + last unless (!defined($cmd[1])); + mw_capabilities(); + } elsif ($cmd[0] eq "list") { + last unless (!defined($cmd[2])); + mw_list($cmd[1]); + } elsif ($cmd[0] eq "import") { + last unless ($cmd[1] ne "" && !defined($cmd[2])); + mw_import($cmd[1]); + } elsif ($cmd[0] eq "option") { + last unless ($cmd[1] ne "" && $cmd[2] ne "" && !defined($cmd[3])); + mw_option($cmd[1],$cmd[2]); + } elsif ($cmd[0] eq "push") { + # Check the pattern <src>:<dst> + my @pushargs = split(/:/,$cmd[1]); + last unless ($pushargs[1] ne "" && !defined($pushargs[2])); + mw_push($pushargs[0],$pushargs[1]); + } else { + print STDERR "Unknown capability. Aborting...\n"; + last; + } + } else { + # End of input + last; + } + +} + +########################## Functions ############################## + +sub get_last_local_revision { + # Get note regarding last mediawiki revision + my $note = `git notes --ref=mediawiki show refs/mediawiki/$remotename/master 2>/dev/null`; + my @note_info = split(/ /, $note); + + my $lastrevision_number; + if (!(defined($note_info[0]) && $note_info[0] eq "mediawiki_revision:")) { + print STDERR "No previous mediawiki revision found"; + $lastrevision_number = 0; + } else { + # Notes are formatted : mediawiki_revision: #number + $lastrevision_number = $note_info[1]; + chomp($lastrevision_number); + print STDERR "Last local mediawiki revision found is $lastrevision_number "; + } + return $lastrevision_number; +} + +sub get_last_remote_revision { + my $mediawiki = MediaWiki::API->new; + $mediawiki->{config}->{api_url} = "$url/api.php"; + + my $pages = $mediawiki->list({ + action => 'query', + list => 'allpages', + aplimit 
=> 500, + }); + + my $max_rev_num = 0; + + foreach my $page (@$pages) { + my $id = $page->{pageid}; + + + my $query = { + action => 'query', + prop => 'revisions', + rvprop => 'ids', + pageids => $id, + }; + + my $result = $mediawiki->api($query); + + my $lastrev = pop(@{$result->{query}->{pages}->{$id}->{revisions}}); + + $max_rev_num = ($lastrev->{revid} > $max_rev_num ? $lastrev->{revid} : $max_rev_num); + } + + print STDERR "Last remote revision found is $max_rev_num\n"; + return $max_rev_num; +} + +sub literal_data { + my ($content) = @_; + print STDOUT "data ", bytes::length($content), "\n", $content; +} + +sub mw_capabilities { + # Revisions are imported to the private namespace + # refs/mediawiki/$remotename/ by the helper and fetched into + # refs/remotes/$remotename later by fetch. + print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n"; + print STDOUT "import\n"; + print STDOUT "list\n"; + print STDOUT "option\n"; + print STDOUT "push\n"; + print STDOUT "\n"; +} + +sub mw_list { + # MediaWiki do not have branches, we consider one branch arbitrarily + # called master + print STDOUT "? 
refs/heads/master\n"; + print STDOUT '@'."refs/heads/master HEAD\n"; + print STDOUT "\n"; + +} + +sub mw_option { + print STDOUT "unsupported\n"; +} + +sub mw_import { + my @wiki_name = split(/:\/\//,$url); + my $wiki_name = $wiki_name[1]; + + my $mediawiki = MediaWiki::API->new; + $mediawiki->{config}->{api_url} = "$url/api.php"; + + my $pages = $mediawiki->list({ + action => 'query', + list => 'allpages', + aplimit => 500, + }); + + if (!defined($pages)) { + print STDERR "fatal: '$url' does not appear to be a mediawiki\n"; + print STDERR "fatal: make sure '$url/api.php' is a valid page\n"; + exit; + } + + my @revisions; + print STDERR "Searching revisions...\n"; + my $fetch_from = get_last_local_revision() + 1; + if ($fetch_from == 1) { + print STDERR ", fetching from beginning\n"; + } else { + print STDERR ", fetching from here\n"; + } + my $n = 1; + foreach my $page (@$pages) { + my $id = $page->{pageid}; + + print STDERR "$n/", scalar(@$pages), ": ". encode_utf8($page->{title})."\n"; + $n++; + + my $query = { + action => 'query', + prop => 'revisions', + rvprop => 'ids', + rvdir => 'newer', + rvstartid => $fetch_from, + rvlimit => 500, + pageids => $page->{pageid}, + }; + + my $revnum = 0; + # Get 500 revisions at a time due to the mediawiki api limit + while (1) { + my $result = $mediawiki->api($query); + + # Parse each of those 500 revisions + foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) { + my $page_rev_ids; + $page_rev_ids->{pageid} = $page->{pageid}; + $page_rev_ids->{revid} = $revision->{revid}; + push (@revisions, $page_rev_ids); + $revnum++; + } + last unless $result->{'query-continue'}; + $query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid}; + } + print STDERR " Found ", $revnum, " revision(s).\n"; + } + + # Creation of the fast-import stream + print STDERR "Fetching & writing export data...\n"; + + #binmode STDOUT, ':binary'; # Line from Jeff King that raises a warning. What is this line for ? 
+ $n = 0; + + foreach my $pagerevids (sort {$a->{revid} <=> $b->{revid}} @revisions) { + #fetch the content of the pages + my $query = { + action => 'query', + prop => 'revisions', + rvprop => 'content|timestamp|comment|user|ids', + revids => $pagerevids->{revid}, + }; + + my $result = $mediawiki->api($query); + + my $rev = pop(@{$result->{query}->{pages}->{$pagerevids->{pageid}}->{revisions}}); + + $n++; + my $user = $rev->{user} || 'Anonymous'; + my $dt = DateTime::Format::ISO8601->parse_datetime($rev->{timestamp}); + + my $comment = defined $rev->{comment} ? $rev->{comment} : '*Empty MediaWiki Message*'; + my $title = encode_utf8($result->{query}->{pages}->{$pagerevids->{pageid}}->{title}); + my $content = $rev->{'*'}; + # This \n is important. If it's not added, a conflict is going to happen if you change + # the last line of a file, push it and then pull it back from mediawiki. + # Note : it seems like there are still problems with this. To be investigated further + $content .= "\n"; + + $title =~ y/ /_/; + $title =~ s/\//$slash_replacement/g; + + print STDERR "$n/", scalar(@revisions), ": Revision nÂ$pagerevids->{revid} of $title\n"; + + print STDOUT "commit refs/mediawiki/$remotename/master\n"; + print STDOUT "mark :$n\n"; + print STDOUT "committer $user <$user\@$wiki_name> ", $dt->epoch, " +0000\n"; + literal_data(encode_utf8($comment)); + # If it's not a clone, needs to know where to start from + if ($fetch_from != 1 && $n == 1) { + print STDOUT "from refs/mediawiki/$remotename/master^0\n"; + } + print STDOUT "M 644 inline $title.mw\n"; + literal_data(encode_utf8($content)); + print STDOUT "\n\n"; + + + # mediawiki revision number in the git note + if ($fetch_from == 1 && $n == 1) { + print STDOUT "reset refs/notes/mediawiki\n"; + } + print STDOUT "commit refs/notes/mediawiki\n"; + print STDOUT "committer $user <$user\@$wiki_name> ", $dt->epoch, " +0000\n"; + literal_data(encode_utf8("note added by git-mediawiki")); + if ($fetch_from != 1 && $n == 1) { + 
print STDOUT "from refs/notes/mediawiki^0\n"; + } + print STDOUT "N inline :$n\n"; + literal_data(encode_utf8("mediawiki_revision: " . $pagerevids->{revid})); + print STDOUT "\n\n"; + } + + if ($fetch_from == 1) { + if ($n != 0) { + print STDOUT "reset $_[0]\n"; #$_[0] contains refs/heads/master + print STDOUT "from :$n\n"; + } else { + print STDERR "You appear to have cloned an empty mediawiki\n"; + #Something has to be done remote-helper side. If nothing is done, an error is + #thrown saying that HEAD is refering to unknown object 0000000000000000000 + } + } + +} + +sub mw_push { + print STDERR "Push not yet implemented\n"; +} diff --git a/contrib/mw-to-git/git-remote-mediawiki.txt b/contrib/mw-to-git/git-remote-mediawiki.txt new file mode 100644 index 0000000..4d211f5 --- /dev/null +++ b/contrib/mw-to-git/git-remote-mediawiki.txt @@ -0,0 +1,7 @@ +Git-Mediawiki is a project which aims the creation of a gate +between git and mediawiki, allowing git users to push and pull +objects from mediawiki just as one would do with a classic git +repository thanks to remote-helpers. + +For more information, visit the wiki at +https://github.com/Bibzball/Git-Mediawiki/wiki -- 1.7.4.1 -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html