mercurial to git

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

attached are two files of take #1 of writing a hg2git converter/tracker using git-fast-import. It basically works so use at your own risk and send patches... :)

"Basically" means that it gets tags, branches and merges right (working tree md5 sums match after imports). It also means that it is horribly slow for the repos I tested it on (only mutt and hg-crew).

The performance bottleneck is hg exporting data; as discovered by people on #mercurial, the problem is not really fixable and is due to hg's revlog handling. As a result, I needed to let the script feed the full contents of the repository at each revision we walk (i.e. all for the initial import) into git-fast-import. This is horribly slow. For mutt, which contains several tags, a handful of branches and only 5k commits, this takes roughly two hours at 1 commit/sec. My earlier version not using 'deleteall' and feeding only files that changed took 15 minutes altogether, git-fast-import from a textfile 1 min 30 sec.

As I'll use this for my daily work (more or less), I think I'll "maintain" and keep improving it, so if anyone has comments, criticism, hints, patches, ...

Somewhat related: It would be really nice to teach git-fast-import to init from a previously saved mark file. Right now I use hg revision numbers as marks, let git-fast-import save them, and read them back next time. These are needed to map hg revisions to git SHA1s in case I need to reference something in an incremental import from an earlier run. It would be nice if git-fast-import could do this on its own so that all consumers can benefit and can have persistent marks across sessions.

About the attached files: hg2git.py is the worker script using the mercurial python package so that no more slow shell or pipes including fork are needed for the raw export, hg2git.sh is a convenience shell wrapper taking care of the state files for incremental imports.

  bye, Rocco
--
:wq!

Attachment: hg2git.sh
Description: Bourne shell script

#!/usr/bin/env python

# Copyright (c) 2007 Rocco Rutte <pdmef@xxxxxxx>
# License: GPLv2

"""hg2git.py - A mercurial-to-git filter for git-fast-import(1)
Usage: hg2git.py <hg repo url> <max revision> <marks file> <heads file> <tip file>
"""

from mercurial import repo,hg,cmdutil,util,ui,revlog
from tempfile import mkstemp
import re
import sys
import os

# Recognises a user field that already looks like 'Name <address>'.
# It is applied with match(), so it is anchored at the start of the string,
# and the trailing '$' anchors it at the end.
user_re=re.compile('[^<]+ <[^>]+>$')
# git branch name substituted for the hg branch called 'HEAD'
cfg_master='master'
# emit a 'checkpoint' command whenever the running count of exported
# commits/tags reaches a multiple of this value
cfg_checkpoint_count=1000

def usage(ret):
  """Write the module docstring (the usage text) to stderr.

  ret is handed back untouched so callers can use it as an exit status.
  """
  help_text=__doc__
  sys.stderr.write(help_text)
  return ret

def setup_repo(url):
  """Open the Mercurial repository found at url.

  Returns a (ui, repository) pair, where ui is the freshly created
  mercurial ui object the repository was opened with.
  """
  hgui=ui.ui()
  hgrepo=hg.repository(hgui,url)
  return hgui,hgrepo

def get_changeset(ui,repo,revision):
  """Read one changeset and normalise it for the fast-import stream.

  revision is resolved through repo.lookup() and the changelog entry is
  read from repo.changelog.

  Returns the tuple
    (manifest, user, (time, tz), files, desc, branch, extra)
  where user is forced into 'Name <address>' form, tz is a '+HHMM' /
  '-HHMM' string and branch is the changeset's branch (falling back to
  'master' when extra carries no 'branch' entry; 'HEAD' is mapped to
  cfg_master).
  """
  def get_branch(name):
    if name=='HEAD':
      name=cfg_master
    return name
  def fixup_user(user):
    # already 'Name <address>': leave it alone
    if user_re.match(user) is None:
      if '@' not in user:
        # nothing that looks like an address: use a placeholder
        return user+' <none@none>'
      # bare address: use it as both name and address
      return user+' <'+user+'>'
    return user
  def format_tz(offset):
    # Format a signed offset in seconds as +HHMM/-HHMM. Sign and magnitude
    # are handled separately because flooring a negative offset
    # (e.g. -12600s) would give -04:30 instead of the correct -03:30.
    if offset<0:
      sign='-'
    else:
      sign='+'
    hours,rest=divmod(abs(offset),3600)
    return '%s%02d%02d' % (sign,hours,rest//60)
  node=repo.lookup(revision)
  (manifest,user,(time,timezone),files,desc,extra)=repo.changelog.read(node)
  # NOTE(review): the value is negated before formatting, so hg presumably
  # stores the offset with the opposite sign to the one printed -- confirm
  tz=format_tz(-timezone)
  branch=get_branch(extra.get('branch','master'))
  return (manifest,fixup_user(user),(time,tz),files,desc,branch,extra)

def gitmode(x):
  """Return the git file mode string for a file.

  A truthy x (the executable flag) gives '100755', anything else '100644'.
  """
  if x:
    return '100755'
  return '100644'

def wr(msg=''):
  """Emit msg on stdout, terminated by a single newline.

  Called without an argument it emits just the newline.
  """
  sys.stdout.write('%s\n' % (msg,))

def checkpoint(count):
  """Advance the running counter and emit a checkpoint when one is due.

  The counter is incremented first; if the new value is a multiple of
  cfg_checkpoint_count a note goes to stderr and a 'checkpoint' command
  followed by an empty line is written to the output stream.

  Returns the incremented counter.
  """
  count+=1
  if count%cfg_checkpoint_count:
    # not on a checkpoint boundary yet
    return count
  sys.stderr.write("Checkpoint after %d commits\n" % count)
  wr('checkpoint')
  wr()
  return count

def get_parent_mark(parent,marks):
  """Return the reference to use for the parent revision number.

  marks is keyed by the revision number as a string. When it has an entry
  for parent that entry is returned; otherwise the parent is assumed to
  have been exported during this run and its ':<parent+1>' mark is built.
  """
  known=marks.get(str(parent))
  if known is not None:
    return known
  return ':%d' % (parent+1)

def export_commit(ui,repo,revision,marks,heads,last,max,count):
  """Write one hg revision to the output stream as a fast-import commit.

  Parameters:
    ui, repo  -- passed through to get_changeset(); repo also supplies
                 the parent revisions and the change context
    revision  -- hg revision number to export; it is written under the
                 mark ':<revision+1>'
    marks     -- dict, str(revision) -> reference; updated in place with
                 the mark of this revision
    heads     -- dict, branch -> cached head reference; a non-empty entry
                 is used once as the 'from' of the branch and then blanked
    last      -- dict, branch -> revision most recently exported on that
                 branch by this function; updated in place
    max       -- tip revision, only used in the progress message
    count     -- running counter handed to checkpoint()

  Every commit starts with 'deleteall' and then lists the complete file
  contents of the revision.

  Returns the counter as updated by checkpoint().
  """
  sys.stderr.write('Exporting revision %d (tip %d) as [:%d]\n' % (revision,max,revision+1))

  (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,revision)
  parents=repo.changelog.parentrevs(revision)

  # we need this later to write out tags
  marks[str(revision)]=':%d'%(revision+1)

  wr('commit refs/heads/%s' % branch)
  wr('mark :%d' % (revision+1))
  wr('committer %s %d %s' % (user,time,timezone))
  # wr() appends a newline to the message, so the announced length is the
  # message length plus one
  wr('data %d' % (len(desc)+1))
  wr(desc)
  wr()

  src=heads.get(branch,'')
  link=''
  if src!='':
    # if we have a cached head, this is an incremental import: initialize it
    # and kill reference so we won't init it again
    wr('from %s' % src)
    heads[branch]=''
  elif branch not in heads and revision>0:
    # newly created branch and not the first one: connect to parent
    tmp=get_parent_mark(parents[0],marks)
    wr('from %s' % tmp)
    sys.stderr.write('Link new branch [%s] to parent [%s]\n' %
        (branch,tmp))
    link=tmp # avoid making a merge commit for branch fork

  if parents:
    l=last.get(branch,revision)
    for p in parents:
      # 1) as this commit implicitly is the child of the most recent
      #    commit of this branch, ignore this parent
      # 2) ignore nonexistent parents
      # 3) merge otherwise
      if p==l or p==revision or p<0:
        continue
      tmp=get_parent_mark(p,marks)
      # if we fork off a branch, don't merge via 'merge' as we have
      # 'from' already above
      if tmp==link:
        continue
      sys.stderr.write('Merging branch [%s] with parent [%s] from [r%d]\n' %
          (branch,tmp,p))
      wr('merge %s' % tmp)

  last[branch]=revision
  heads[branch]=''

  # just wipe the branch clean, all full manifest contents
  wr('deleteall')

  ctx=repo.changectx(str(revision))
  man=ctx.manifest()

  for fctx in ctx.filectxs():
    f=fctx.path()
    d=fctx.data()
    wr('M %s inline %s' % (gitmode(man.execf(f)),f))
    wr('data %d' % len(d)) # had some trouble with size()
    wr(d)

  wr()
  return checkpoint(count)

def export_tags(ui,repo,cache,count):
  """Write a fast-import 'tag' command for every hg tag except 'tip'.

  cache maps str(revision) to the reference used as the tag's 'from'.
  A tag whose revision has no entry in cache is reported on stderr and
  skipped. Tagger and timestamp are taken from the tagged changeset.

  count is advanced through checkpoint() once per written tag and the
  final value is returned.
  """
  for tag,node in repo.tagslist():
    if tag=='tip':
      continue
    rev=repo.changelog.rev(node)
    ref=cache.get(str(rev))
    if ref is None:
      sys.stderr.write('Failed to find reference for creating tag'
          ' %s at r%d\n' % (tag,rev))
      continue
    (_,user,(time,timezone),_,desc,branch,_)=get_changeset(ui,repo,rev)
    sys.stderr.write('Exporting tag [%s] at [hg r%d] [git %s]\n' % (tag,rev,ref))
    # only the first line of the changeset description goes into the message
    summary=desc.split('\n')[0]
    msg='hg2git created tag %s for hg revision %d on branch %s on (summary):\n\t%s' % (tag,
        rev,branch,summary)
    wr('tag %s' % tag)
    wr('from %s' % ref)
    wr('tagger %s %d %s' % (user,time,timezone))
    # +1 for the newline wr() appends to msg
    wr('data %d' % (len(msg)+1))
    wr(msg)
    wr()
    count=checkpoint(count)
  return count

def load_cache(filename):
  """Load a ':key value' file into a dict.

  Each valid line consists of exactly two space-separated fields, the
  first of which starts with ':'. The key is stored without that leading
  ':' and the value without its trailing newline. Any other line is
  reported on stderr (with its line number) and skipped.

  Returns an empty dict when filename does not exist.
  """
  cache={}
  if not os.path.exists(filename):
    return cache
  f=open(filename,'r')
  try:
    lineno=0
    for line in f:
      lineno+=1
      fields=line.split(' ')
      # startswith() also copes with an empty first field (a line that
      # begins with a space), which indexing fields[0][0] would crash on
      if len(fields)!=2 or not fields[0].startswith(':'):
        sys.stderr.write('Invalid file format in [%s], line %d\n' % (filename,lineno))
        continue
      # put key:value in cache, key without ^:
      cache[fields[0][1:]]=fields[1].split('\n')[0]
  finally:
    # close the file even if reading fails half way through
    f.close()
  return cache

def save_cache(filename,cache):
  """Write cache to filename, one ':key value' line per entry.

  Keys and values are converted with str(). An existing file is
  overwritten. This is the format load_cache() reads back.
  """
  f=open(filename,'w')
  try:
    # explicit loop: the writes are side effects, so they must not depend
    # on map() evaluating its arguments eagerly
    for key,value in cache.items():
      f.write(':%s %s\n' % (str(key),str(value)))
  finally:
    f.close()

def verify_heads(ui,repo,cache):
  """Compare cached branch heads with the refs stored under $GIT_DIR.

  cache maps a branch name to the SHA1 recorded for it. For each branch
  the first line of $GIT_DIR/refs/heads/<branch> is read ('/dev/null' is
  used as the directory when GIT_DIR is unset) and a warning is written
  to stderr if it differs from the cached value.

  ui and repo are not used. Always returns True.
  """
  def read_ref(branch):
    path=''.join((os.getenv('GIT_DIR','/dev/null'),'/refs/heads/',branch))
    f=open(path)
    try:
      return f.readlines()[0].split('\n')[0]
    finally:
      f.close()

  for branch,cached in cache.items():
    sys.stderr.write('Verifying branch [%s]\n' % branch)
    actual=read_ref(branch)
    if actual==cached:
      continue
    sys.stderr.write('Warning: Branch [%s] modified outside hg2git:'
      '\n%s (repo) != %s (cache)\n' % (branch,actual,cached))
  return True

def main(argv):
  """Run one (possibly incremental) export and return the exit status.

  argv must hold the program name followed by five arguments:
  repository url, maximum revision (a negative value means "up to tip"),
  marks file, heads file and tip/state file.

  Revisions from the 'tip' value stored in the state file (0 if absent)
  up to, but not including, the maximum are exported, then the tags, and
  finally the state file is rewritten with the new 'tip', 'count' and
  'repo' values.

  Returns 1 on a wrong argument count or failed head verification,
  0 otherwise.
  """
  if len(argv)!=6:
    return usage(1)
  repourl,m,marksfile,headsfile,tipfile=argv[1:]
  max_arg=int(m)

  marks_cache=load_cache(marksfile)
  heads_cache=load_cache(headsfile)
  state_cache=load_cache(tipfile)

  # local names on purpose: rebinding 'ui' and 'repo' would hide the
  # mercurial modules of the same name that setup_repo() relies on
  myui,hgrepo=setup_repo(repourl)

  if not verify_heads(myui,hgrepo,heads_cache):
    return 1

  tip=hgrepo.changelog.count()

  first_rev=int(state_cache.get('tip',0))
  last_rev=max_arg
  if max_arg<0:
    last_rev=tip

  c=int(state_cache.get('count',0))
  last={}
  for rev in range(first_rev,last_rev):
    c=export_commit(myui,hgrepo,rev,marks_cache,heads_cache,last,tip,c)

  c=export_tags(myui,hgrepo,marks_cache,c)

  state_cache['tip']=last_rev
  state_cache['count']=c
  state_cache['repo']=repourl
  save_cache(tipfile,state_cache)
  return 0

if __name__=='__main__':
  sys.exit(main(sys.argv))

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]