Hi, attached are two files from take #1 of writing an hg2git converter/tracker using git-fast-import. It basically works, so use at your own risk and send patches... :)
"Basically" means that it gets tags, branches and merges right (working tree md5 sums match after imports). It also means that it is horribly slow for the repos I tested it on (only mutt and hg-crew).
The performance bottleneck is hg exporting data. As discovered by people on #mercurial, the problem is not really fixable and is due to hg's revlog handling. As a result, I needed to let the script feed the full contents of the repository at each revision we walk (i.e. all of them for the initial import) into git-fast-import. This is horribly slow. For mutt, which contains several tags, a handful of branches and only 5k commits, this takes roughly two hours at 1 commit/sec. My earlier version, which did not use 'deleteall' and fed only the files that changed, took 15 minutes altogether; running git-fast-import from a text file took 1 min 30 sec.
As I'll use this for my daily work (more or less), I think I'll "maintain" and keep improving it, so if anyone has comments, criticism, hints, patches, ...
Somewhat related: It would be really nice to teach git-fast-import to init from a previously saved mark file. Right now I use hg revision numbers as marks, let git-fast-import save them, and read them back next time. These are needed to map hg revisions to git SHA1s in case I need to reference something in an incremental import from an earlier run. It would be nice if git-fast-import could do this on its own so that all consumers can benefit and can have persistent marks across sessions.
About the attached files: hg2git.py is the worker script; it uses the mercurial python package so that no slow shell invocations or pipes (including fork) are needed for the raw export. hg2git.sh is a convenience shell wrapper taking care of the state files for incremental imports.
bye, Rocco -- :wq!
Attachment:
hg2git.sh
Description: Bourne shell script
#!/usr/bin/env python

# Copyright (c) 2007 Rocco Rutte <pdmef@xxxxxxx>
# License: GPLv2

"""hg2git.py - A mercurial-to-git filter for git-fast-import(1)
Usage: hg2git.py <hg repo url> <max revision> <marks file> <heads file> <tip file>
A negative <max revision> exports everything up to the repository tip.
"""

from mercurial import repo,hg,cmdutil,util,ui,revlog
from tempfile import mkstemp
import re
import sys
import os

# silly regex to see if user field has email address
user_re = re.compile('[^<]+ <[^>]+>$')
# git branch for hg's default 'HEAD' branch
cfg_master = 'master'
# insert 'checkpoint' command after this many commits
cfg_checkpoint_count = 1000


def usage(ret):
    """Write the module usage text to stderr and return ret unchanged."""
    sys.stderr.write(__doc__)
    return ret


def setup_repo(url):
    """Open the hg repository at url; return a (ui, repository) pair."""
    myui = ui.ui()
    return myui, hg.repository(myui, url)


def _git_timezone(timezone):
    """Format an offset in seconds as a '+HHMM'/'-HHMM' string.

    The incoming value is negated before formatting (same sign convention
    as the original inline expression).  Sign and magnitude are handled
    separately because floor division and modulo on a negative number
    round away from zero: an offset of -3.5 hours used to be rendered
    as '-0430' instead of '-0330'.
    """
    offset = -timezone
    if offset < 0:
        sign = '-'
        offset = -offset
    else:
        sign = '+'
    return '%s%02d%02d' % (sign, offset // 3600, (offset % 3600) // 60)


def get_changeset(ui, repo, revision):
    """Read one changeset and normalize it for export.

    Returns (manifest, user, (time, tz), files, desc, branch, extra) where
    user always has the 'Name <email>' shape, tz is a '+HHMM' string and
    branch has been mapped through get_branch().  The ui argument is unused
    here.
    """
    def get_branch(name):
        # 'HEAD' is exported under the configured master branch name
        if name == 'HEAD':
            name = cfg_master
        return name

    def fixup_user(user):
        # make sure the user field looks like 'Name <email>'
        if user_re.match(user) is None:
            if '@' not in user:
                return user + ' <none@none>'
            return user + ' <' + user + '>'
        return user

    node = repo.lookup(revision)
    (manifest, user, (time, timezone), files, desc, extra) = \
        repo.changelog.read(node)
    tz = _git_timezone(timezone)
    branch = get_branch(extra.get('branch', cfg_master))
    return (manifest, fixup_user(user), (time, tz), files, desc, branch, extra)


def gitmode(x):
    """Return the git file mode string: executable if x is true."""
    if x:
        return '100755'
    return '100644'


def wr(msg=''):
    """Write msg followed by a newline to stdout (the fast-import stream)."""
    # two writes instead of msg+'\n' so large file contents are not copied
    sys.stdout.write(msg)
    sys.stdout.write('\n')
    # debugging aid:
    #map(lambda x: sys.stderr.write('\t[%s]\n' % x),msg.split('\n'))


def checkpoint(count):
    """Increment count; emit 'checkpoint' every cfg_checkpoint_count items.

    Returns the incremented count.
    """
    count = count + 1
    if count % cfg_checkpoint_count == 0:
        sys.stderr.write("Checkpoint after %d commits\n" % count)
        wr('checkpoint')
        wr()
    return count


def get_parent_mark(parent, marks):
    """Return the mark/reference recorded for hg revision number parent."""
    p = marks.get(str(parent), None)
    if p is None:
        # if we didn't see parent previously, assume we saw it in this run
        p = ':%d' % (parent + 1)
    return p


def export_commit(ui, repo, revision, marks, heads, last, max, count):
    """Write one hg revision to stdout as a fast-import 'commit' command.

    marks maps str(hg revision) to a mark/reference and is updated with this
    revision (mark number is revision+1).  heads maps branch names to cached
    references from an earlier run; an entry is blanked once the branch has
    been initialized from it.  last maps a branch to the most recent
    revision exported to it in this run.  max is only used for the progress
    message.  Returns the updated commit count (see checkpoint()).
    """
    sys.stderr.write('Exporting revision %d (tip %d) as [:%d]\n'
                     % (revision, max, revision + 1))

    (_, user, (time, timezone), files, desc, branch, _) = \
        get_changeset(ui, repo, revision)
    parents = repo.changelog.parentrevs(revision)

    # we need this later to write out tags
    marks[str(revision)] = ':%d' % (revision + 1)

    wr('commit refs/heads/%s' % branch)
    wr('mark :%d' % (revision + 1))
    wr('committer %s %d %s' % (user, time, timezone))
    # +1: the newline wr() appends after desc is counted as message data
    wr('data %d' % (len(desc) + 1))
    wr(desc)
    wr()

    src = heads.get(branch, '')
    link = ''
    if src != '':
        # if we have a cached head, this is an incremental import: initialize
        # it and kill reference so we won't init it again
        wr('from %s' % src)
        heads[branch] = ''
    elif branch not in heads and revision > 0:
        # newly created branch and not the first one: connect to parent
        tmp = get_parent_mark(parents[0], marks)
        wr('from %s' % tmp)
        sys.stderr.write('Link new branch [%s] to parent [%s]\n'
                         % (branch, tmp))
        link = tmp  # avoid making a merge commit for branch fork

    if parents:
        l = last.get(branch, revision)
        for p in parents:
            # 1) as this commit implicitly is the child of the most recent
            #    commit of this branch, ignore this parent
            # 2) ignore nonexistent parents
            # 3) merge otherwise
            if p == l or p == revision or p < 0:
                continue
            tmp = get_parent_mark(p, marks)
            # if we fork off a branch, don't merge via 'merge' as we have
            # 'from' already above
            if tmp == link:
                continue
            sys.stderr.write(
                'Merging branch [%s] with parent [%s] from [r%d]\n'
                % (branch, tmp, p))
            wr('merge %s' % tmp)

    last[branch] = revision
    heads[branch] = ''
    # just wipe the branch clean, all full manifest contents
    wr('deleteall')

    ctx = repo.changectx(str(revision))
    man = ctx.manifest()

    for fctx in ctx.filectxs():
        f = fctx.path()
        d = fctx.data()
        wr('M %s inline %s' % (gitmode(man.execf(f)), f))
        wr('data %d' % len(d))  # had some trouble with size()
        wr(d)

    wr()
    return checkpoint(count)


def export_tags(ui, repo, cache, count):
    """Write a fast-import 'tag' command for every hg tag except 'tip'.

    cache maps str(hg revision) to the mark/reference to tag; tags whose
    revision is not in cache are reported on stderr and skipped.  Returns
    the updated count (see checkpoint()).
    """
    for tag, node in repo.tagslist():
        if tag == 'tip':
            continue
        rev = repo.changelog.rev(node)
        ref = cache.get(str(rev), None)
        if ref is None:
            sys.stderr.write('Failed to find reference for creating tag'
                             ' %s at r%d\n' % (tag, rev))
            continue
        (_, user, (time, timezone), _, desc, branch, _) = \
            get_changeset(ui, repo, rev)
        sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n'
                         % (tag, rev, ref))
        wr('tag %s' % tag)
        wr('from %s' % ref)
        wr('tagger %s %d %s' % (user, time, timezone))
        msg = ('hg2git created tag %s for hg revision %d on branch %s'
               ' on (summary):\n\t%s'
               % (tag, rev, branch, desc.split('\n')[0]))
        # +1: the newline wr() appends after msg is counted as message data
        wr('data %d' % (len(msg) + 1))
        wr(msg)
        wr()
        count = checkpoint(count)
    return count


def load_cache(filename):
    """Load a ':key value' per line file into a dict (keys without ':').

    A missing file yields an empty dict.  Lines that do not consist of
    exactly two space-separated fields with the first starting with ':'
    are reported on stderr and skipped.
    """
    cache = {}
    if not os.path.exists(filename):
        return cache
    f = open(filename, 'r')
    try:
        lineno = 0
        for line in f:
            lineno += 1
            fields = line.split(' ')
            # startswith() instead of fields[0][0]: an empty first field
            # (line starting with a space) must not raise IndexError
            if len(fields) != 2 or not fields[0].startswith(':'):
                sys.stderr.write('Invalid file format in [%s], line %d\n'
                                 % (filename, lineno))
                continue
            # put key:value in cache, key without ^:
            cache[fields[0][1:]] = fields[1].split('\n')[0]
    finally:
        f.close()
    return cache


def save_cache(filename, cache):
    """Write cache to filename, one ':key value' line per entry."""
    f = open(filename, 'w+')
    try:
        for key in cache.keys():
            f.write(':%s %s\n' % (str(key), str(cache.get(key))))
    finally:
        f.close()


def verify_heads(ui, repo, cache):
    """Compare cached branch heads with $GIT_DIR/refs/heads/<branch>.

    A differing SHA1 only produces a warning on stderr.  Returns False if a
    ref file cannot be read (the caller then exits), True otherwise.  The ui
    and repo arguments are unused.
    """
    def getsha1(branch):
        path = os.getenv('GIT_DIR', '/dev/null') + '/refs/heads/' + branch
        f = open(path)
        try:
            # first line without its newline; '' for an empty file
            return f.readline().split('\n')[0]
        finally:
            f.close()

    for b in cache.keys():
        sys.stderr.write('Verifying branch [%s]\n' % b)
        try:
            sha1 = getsha1(b)
        except IOError:
            sys.stderr.write('Error: cannot read head of branch [%s]'
                             ' (is GIT_DIR set?)\n' % b)
            return False
        c = cache.get(b)
        if sha1 != c:
            sys.stderr.write('Warning: Branch [%s] modified outside hg2git:'
                             '\n%s (repo) != %s (cache)\n' % (b, sha1, c))
    return True


if __name__ == '__main__':
    if len(sys.argv) != 6:
        sys.exit(usage(1))
    repourl, m, marksfile, headsfile, tipfile = sys.argv[1:]
    _max = int(m)

    marks_cache = load_cache(marksfile)
    heads_cache = load_cache(headsfile)
    state_cache = load_cache(tipfile)

    # distinct names: do not clobber the imported 'ui' and 'repo' modules
    myui, hgrepo = setup_repo(repourl)

    if not verify_heads(myui, hgrepo, heads_cache):
        sys.exit(1)

    tip = hgrepo.changelog.count()

    # resume where the previous run stopped
    first = int(state_cache.get('tip', 0))
    stop = _max
    # negative means "everything"; never walk past the last revision
    if _max < 0 or _max > tip:
        stop = tip

    c = int(state_cache.get('count', 0))
    last = {}
    for rev in range(first, stop):
        c = export_commit(myui, hgrepo, rev, marks_cache, heads_cache,
                          last, tip, c)

    c = export_tags(myui, hgrepo, marks_cache, c)

    state_cache['tip'] = stop
    state_cache['count'] = c
    state_cache['repo'] = repourl
    save_cache(tipfile, state_cache)