[PATCHv2] Add a remote helper to interact with mediawiki, pull & clone handled

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]


Implement a gate between git and mediawiki, allowing git users to
push and pull objects from mediawiki just as one would do with a
classic git repository thanks to remote-helpers.

Currently supported commands are :
     git clone mediawiki::http://onewiki.com
     git pull

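You need the following packages installed (available on common
repositories): libmediawiki-api-perl (MediaWiki::API) and
libdatetime-format-iso8601-perl (DateTime::Format::ISO8601).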

Use remote helpers in order to be as transparent as possible
to the git user.

Download Mediawiki revisions through the Mediawiki API and then
fast-import into git.

Mediawiki revisions and git commits are linked thanks to notes bound to
commits.

The import part is done on a refs/mediawiki/<remote> branch before
coming to refs/remote/origin/master (Huge thanks to Jonathan Nieder
for his help)

For now, the whole wiki is cloned, but it will be possible to clone only
some pages: the clone is based on a list of pages which is now all
pages.

Code clarified & improved with the help of Jeff King and Junio C Hamano.

We were not able to reproduce the empty timestamp bug noticed by Jeff
King, thus needing some further testing. A placeholder is still
implemented just in case.

Signed-off-by: Jérémie Nikaes <jeremie.nikaes@xxxxxxxxxxxxxxx>
Signed-off-by: Arnaud Lacurie <arnaud.lacurie@xxxxxxxxxxxxxxx>
Signed-off-by: Claire Fousse <claire.fousse@xxxxxxxxxxxxxxx>
Signed-off-by: David Amouyal <david.amouyal@xxxxxxxxxxxxxxx>
Signed-off-by: Matthieu Moy <matthieu.moy@xxxxxxxxxxxxxxx>
Signed-off-by: Sylvain Boulmé <sylvain.boulme@xxxxxxx>
 Changes since v1
 - Better loop control using while(1) and removing use of switch
 - Removed unnecessary "use storable qw(freeze thaw)" leftover from Peff's script
 - Removed a few print STDERR lines used for debugging
 - Use of git notes show instead of git log and parsing
 - Added "use warnings;" and made the changes that these warnings pointed to
 (mainly uses of uninitialized variables in the parser)
 - Standardization of print STDOUT / print STDERR / print

 contrib/mw-to-git/git-remote-mediawiki     |  278 ++++++++++++++++++++++++++++
 contrib/mw-to-git/git-remote-mediawiki.txt |    7 +
 2 files changed, 285 insertions(+), 0 deletions(-)
 create mode 100755 contrib/mw-to-git/git-remote-mediawiki
 create mode 100644 contrib/mw-to-git/git-remote-mediawiki.txt

diff --git a/contrib/mw-to-git/git-remote-mediawiki b/contrib/mw-to-git/git-remote-mediawiki
new file mode 100755
index 0000000..34fe5dd
--- /dev/null
+++ b/contrib/mw-to-git/git-remote-mediawiki
@@ -0,0 +1,278 @@
+#! /usr/bin/perl
+use strict;
+use MediaWiki::API;
+use DateTime::Format::ISO8601;
+use Encode qw(encode_utf8);
+use warnings;
+# Mediawiki filenames can contain forward slashes. This variable decides by
+# which pattern they should be replaced.
+my $slash_replacement = "<slash>";
+
+# Command-line arguments: the remote name first, then the wiki URL.
+my $remotename = $ARGV[0];
+my $url = $ARGV[1];
+
+# Flush STDOUT after every print. Set once here rather than on each
+# iteration of the loop below.
+$| = 1;
+
+# Commands parser: read one command per line on STDIN and dispatch it to the
+# matching mw_* function. The loop stops on end of input, on a blank line,
+# on a command with the wrong number of arguments, or on an unknown command.
+while (1) {
+	my $entry = <STDIN>;
+	# End of input. Testing before chomp/split avoids "uninitialized value"
+	# warnings now that "use warnings" is enabled.
+	last unless defined($entry);
+	chomp($entry);
+	my @cmd = split(/ /,$entry);
+	# A blank line yields no word at all and also ends the session.
+	last unless defined($cmd[0]);
+
+	if ($cmd[0] eq "capabilities") {
+		# No argument allowed
+		last if defined($cmd[1]);
+		mw_capabilities();
+	} elsif ($cmd[0] eq "list") {
+		# At most one argument
+		last if defined($cmd[2]);
+		mw_list($cmd[1]);
+	} elsif ($cmd[0] eq "import") {
+		# Exactly one non-empty argument
+		last unless (defined($cmd[1]) && $cmd[1] ne "" && !defined($cmd[2]));
+		mw_import($cmd[1]);
+	} elsif ($cmd[0] eq "option") {
+		# Exactly two non-empty arguments
+		last unless (defined($cmd[1]) && $cmd[1] ne ""
+			&& defined($cmd[2]) && $cmd[2] ne ""
+			&& !defined($cmd[3]));
+		mw_option($cmd[1],$cmd[2]);
+	} elsif ($cmd[0] eq "push") {
+		# A missing argument must not reach split()
+		last unless defined($cmd[1]);
+		# Check the pattern <src>:<dst>
+		my @pushargs = split(/:/,$cmd[1]);
+		last unless (defined($pushargs[1]) && $pushargs[1] ne "" && !defined($pushargs[2]));
+		mw_push($pushargs[0],$pushargs[1]);
+	} else {
+		print STDERR "Unknown capability. Aborting...\n";
+		last;
+	}
+}
+########################## Functions ##############################
+# Return the number of the last mediawiki revision already imported, as
+# recorded in the git note attached to refs/mediawiki/$remotename/master,
+# or 0 when no such note can be read.
+#
+# A status message is printed on STDERR without a trailing newline: the
+# caller (mw_import) completes the line.
+sub get_last_local_revision {
+	# Get note regarding last mediawiki revision
+	# NOTE(review): $remotename comes from the command line and is
+	# interpolated into a shell command here - confirm it can never contain
+	# shell metacharacters.
+	my $note = `git notes --ref=mediawiki show refs/mediawiki/$remotename/master 2>/dev/null`;
+	# Notes are formatted : mediawiki_revision: #number
+	# Matching the digits explicitly guarantees the caller gets a number
+	# (it adds 1 to the result), even if the note is truncated or malformed.
+	if (defined($note) && $note =~ /^mediawiki_revision: (\d+)/) {
+		my $lastrevision_number = $1;
+		print STDERR "Last local mediawiki revision found is $lastrevision_number ";
+		return $lastrevision_number;
+	}
+	print STDERR "No previous mediawiki revision found";
+	return 0;
+}
+# Query the wiki at $url and return the highest revision id found among the
+# revisions returned for each page (0 if none is found). The value is also
+# reported on STDERR.
+sub get_last_remote_revision {
+	my $mediawiki = MediaWiki::API->new;
+	$mediawiki->{config}->{api_url} = "$url/api.php";
+	my $pages = $mediawiki->list({
+		action => 'query',
+		list => 'allpages',
+		aplimit => 500,
+	});
+	# No page list (mw_import treats an undefined list as a failed query):
+	# iterate over nothing and report 0.
+	$pages = [] unless defined($pages);
+	my $max_rev_num = 0;
+	foreach my $page (@$pages) {
+		my $id = $page->{pageid};
+		my $query = {
+			action => 'query',
+			prop => 'revisions',
+			rvprop => 'ids',
+			pageids => $id,
+		};
+		my $result = $mediawiki->api($query);
+		# Skip pages for which the query failed or returned no revision,
+		# instead of comparing an undefined revid numerically.
+		next unless defined($result);
+		my $revisions = $result->{query}->{pages}->{$id}->{revisions};
+		next unless (ref($revisions) eq 'ARRAY' && @$revisions);
+		my $revid = $revisions->[-1]->{revid};
+		next unless defined($revid);
+		$max_rev_num = $revid if ($revid > $max_rev_num);
+	}
+	print STDERR "Last remote revision found is $max_rev_num\n";
+	return $max_rev_num;
+}
+# Print $content on STDOUT in the "data <count>\n<raw>" form, where <count>
+# is the length of $content in bytes. Callers pass already-encoded strings.
+sub literal_data {
+	my ($content) = @_;
+	# bytes::length() only exists once bytes.pm has been loaded. Load it
+	# here (without enabling the pragma) instead of relying on another
+	# module having done so.
+	require bytes;
+	print STDOUT "data ", bytes::length($content), "\n", $content;
+}
+# Answer the "capabilities" command: print the refspec and the list of
+# supported commands on STDOUT, terminated by a blank line.
+# Note that "push" is advertised although mw_push only prints a
+# "not yet implemented" message.
+sub mw_capabilities {
+	# Revisions are imported to the private namespace
+	# refs/mediawiki/$remotename/ by the helper and fetched into
+	# refs/remotes/$remotename later by fetch.
+	print STDOUT "refspec refs/heads/*:refs/mediawiki/$remotename/*\n";
+	print STDOUT "import\n";
+	print STDOUT "list\n";
+	print STDOUT "option\n";
+	print STDOUT "push\n";
+	print STDOUT "\n";
+}
+# Answer the "list" command: print the refs of the remote on STDOUT,
+# terminated by a blank line. Any argument passed by the caller is ignored.
+sub mw_list {
+	# MediaWiki do not have branches, we consider one branch arbitrarily
+	# called master
+	print STDOUT "? refs/heads/master\n";
+	# HEAD is reported as a symbolic ref pointing to that branch. The '@'
+	# is kept in a single-quoted string so it is not interpolated.
+	print STDOUT '@'."refs/heads/master HEAD\n";
+	print STDOUT "\n";
+}
+# Answer the "option <name> <value>" command. No option is handled: the
+# arguments are ignored and "unsupported" is always printed on STDOUT.
+sub mw_option {
+	print STDOUT "unsupported\n";
+}
+# Handle the "import <ref>" command.
+#
+# Lists every page of the wiki at $url, collects the ids of the revisions
+# newer than the last one imported locally, then writes on STDOUT a
+# fast-import stream with, for each revision (sorted by revision id):
+#   - a commit on refs/mediawiki/$remotename/master holding the page content
+#     in "<title>.mw";
+#   - a commit on refs/notes/mediawiki attaching a note
+#     "mediawiki_revision: <id>" to that commit.
+# Progress is reported on STDERR.
+#
+# Parameter: the ref requested by the caller (refs/heads/master); on a first
+# import it is reset to the last commit created.
+# Exits with a non-zero status if the wiki cannot be queried.
+sub mw_import {
+	my ($ref) = @_;
+	# Part of the URL after "://", used as the domain of the fake committer
+	# e-mail addresses. Fall back to the whole URL when there is no scheme.
+	my @wiki_name = split(/:\/\//,$url);
+	my $wiki_name = defined($wiki_name[1]) ? $wiki_name[1] : $url;
+	my $mediawiki = MediaWiki::API->new;
+	$mediawiki->{config}->{api_url} = "$url/api.php";
+	my $pages = $mediawiki->list({
+		action => 'query',
+		list => 'allpages',
+		aplimit => 500,
+	});
+	if (!defined($pages)) {
+		print STDERR "fatal: '$url' does not appear to be a mediawiki\n";
+		print STDERR "fatal: make sure '$url/api.php' is a valid page\n";
+		# A bare "exit" would report success to the caller.
+		exit 1;
+	}
+	my @revisions;
+	print STDERR "Searching revisions...\n";
+	# get_last_local_revision() prints the beginning of a status line on
+	# STDERR; the messages below complete it.
+	my $fetch_from = get_last_local_revision() + 1;
+	if ($fetch_from == 1) {
+		print STDERR ", fetching from beginning\n";
+	} else {
+		print STDERR ", fetching from here\n";
+	}
+	my $n = 1;
+	foreach my $page (@$pages) {
+		my $id = $page->{pageid};
+		print STDERR "$n/", scalar(@$pages), ": ". encode_utf8($page->{title})."\n";
+		$n++;
+		my $query = {
+			action => 'query',
+			prop => 'revisions',
+			rvprop => 'ids',
+			rvdir => 'newer',
+			rvstartid => $fetch_from,
+			rvlimit => 500,
+			pageids => $id,
+		};
+		my $revnum = 0;
+		# Get 500 revisions at a time due to the mediawiki api limit
+		while (1) {
+			my $result = $mediawiki->api($query);
+			# Parse each of those 500 revisions
+			foreach my $revision (@{$result->{query}->{pages}->{$id}->{revisions}}) {
+				my $page_rev_ids;
+				$page_rev_ids->{pageid} = $id;
+				$page_rev_ids->{revid} = $revision->{revid};
+				push (@revisions, $page_rev_ids);
+				$revnum++;
+			}
+			# Continue from where the API says the next batch starts
+			last unless $result->{'query-continue'};
+			$query->{rvstartid} = $result->{'query-continue'}->{revisions}->{rvstartid};
+		}
+		print STDERR "  Found ", $revnum, " revision(s).\n";
+	}
+	# Creation of the fast-import stream
+	print STDERR "Fetching & writing export data...\n";
+	#binmode STDOUT, ':binary'; # Line from Jeff King that raises a warning. What is this line for ?
+	# From here on $n counts the commits written; it is also used as the
+	# fast-import mark of each commit.
+	$n = 0;
+	foreach my $pagerevids (sort {$a->{revid} <=> $b->{revid}} @revisions) {
+		#fetch the content of the pages
+		my $query = {
+			action => 'query',
+			prop => 'revisions',
+			rvprop => 'content|timestamp|comment|user|ids',
+			revids => $pagerevids->{revid},
+		};
+		my $result = $mediawiki->api($query);
+		my $rev = pop(@{$result->{query}->{pages}->{$pagerevids->{pageid}}->{revisions}});
+		# Without a revision there is no timestamp to parse: stop with a
+		# clear message rather than failing inside the date parser.
+		if (!defined($rev)) {
+			print STDERR "fatal: could not fetch revision $pagerevids->{revid} from '$url'\n";
+			exit 1;
+		}
+		$n++;
+		# Encoded like the title, comment and content below, so that
+		# non-ASCII user names are written as UTF-8 too.
+		my $user = encode_utf8($rev->{user} || 'Anonymous');
+		my $dt = DateTime::Format::ISO8601->parse_datetime($rev->{timestamp});
+		my $comment = defined $rev->{comment} ? $rev->{comment} : '*Empty MediaWiki Message*';
+		my $title = encode_utf8($result->{query}->{pages}->{$pagerevids->{pageid}}->{title});
+		my $content = $rev->{'*'};
+		# This \n is important. If it's not added, a conflict is going to happen if you change
+		# the last line of a file, push it and then pull it back from mediawiki.
+		# Note : it seems like there are still problems with this. To be investigated further
+		$content .= "\n";
+		# Build the file name: spaces become underscores, slashes are replaced
+		$title =~ y/ /_/;
+		$title =~ s/\//$slash_replacement/g;
+		print STDERR "$n/", scalar(@revisions), ": Revision n°$pagerevids->{revid} of $title\n";
+		print STDOUT "commit refs/mediawiki/$remotename/master\n";
+		print STDOUT "mark :$n\n";
+		print STDOUT "committer $user <$user\@$wiki_name> ", $dt->epoch, " +0000\n";
+		literal_data(encode_utf8($comment));
+		# If it's not a clone, needs to know where to start from
+		if ($fetch_from != 1 && $n == 1) {
+			print STDOUT "from refs/mediawiki/$remotename/master^0\n";
+		}
+		print STDOUT "M 644 inline $title.mw\n";
+		literal_data(encode_utf8($content));
+		print STDOUT "\n\n";
+		# mediawiki revision number in the git note
+		if ($fetch_from == 1 && $n == 1) {
+			print STDOUT "reset refs/notes/mediawiki\n";
+		}
+		print STDOUT "commit refs/notes/mediawiki\n";
+		print STDOUT "committer $user <$user\@$wiki_name> ", $dt->epoch, " +0000\n";
+		literal_data(encode_utf8("note added by git-mediawiki"));
+		if ($fetch_from != 1 && $n == 1) {
+			print STDOUT "from refs/notes/mediawiki^0\n";
+		}
+		print STDOUT "N inline :$n\n";
+		literal_data(encode_utf8("mediawiki_revision: " . $pagerevids->{revid}));
+		print STDOUT "\n\n";
+	}
+	if ($fetch_from == 1) {
+		if ($n != 0) {
+			# $ref contains refs/heads/master
+			print STDOUT "reset $ref\n";
+			print STDOUT "from :$n\n";
+		} else {
+			print STDERR "You appear to have cloned an empty mediawiki\n";
+			#Something has to be done remote-helper side. If nothing is done, an error is
+			#thrown saying that HEAD is refering to unknown object 0000000000000000000
+		}
+	}
+}
+# Placeholder for the "push <src>:<dst>" command: the arguments are ignored
+# and a message is printed on STDERR. Nothing is sent to the wiki.
+sub mw_push {
+	print STDERR "Push not yet implemented\n";
+}
diff --git a/contrib/mw-to-git/git-remote-mediawiki.txt b/contrib/mw-to-git/git-remote-mediawiki.txt
new file mode 100644
index 0000000..4d211f5
--- /dev/null
+++ b/contrib/mw-to-git/git-remote-mediawiki.txt
@@ -0,0 +1,7 @@
+Git-Mediawiki is a project which aims at the creation of a gate
+between git and mediawiki, allowing git users to push and pull
+objects from mediawiki just as one would do with a classic git
+repository thanks to remote-helpers.
+For more information, visit the wiki at

To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]