Re: cvs2svn and git progress

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 8/10/06, Martin Langhoff <martin.langhoff@xxxxxxxxx> wrote:
On 8/10/06, Jon Smirl <jonsmirl@xxxxxxxxx> wrote:
> I've finally got cvs2svn running through pass 7 now. It took me a

Jon,

great stuff. Is this published somewhere I can pull it from?

This is a diff relative to your git repo of cvs2svn


cheers,


martin



--
Jon Smirl
jonsmirl@xxxxxxxxx
diff --git a/cvs2svn_lib/collect_data.py b/cvs2svn_lib/collect_data.py
index c05e387..364582d 100644
--- a/cvs2svn_lib/collect_data.py
+++ b/cvs2svn_lib/collect_data.py
@@ -24,6 +24,10 @@ import os
 import re
 import time
 import stat
+import sha
+import zlib
+import struct
+from subprocess import Popen,PIPE
 
 from cvs2svn_lib.boolean import *
 from cvs2svn_lib.set_support import *
@@ -52,7 +56,6 @@ from cvs2svn_lib.metadata_database impor
 
 import cvs2svn_rcsparse
 
-
 branch_tag_re = re.compile(r'''
     ^
     ((?:\d+\.\d+\.)+)   # A nonzero even number of digit groups w/trailing dot
@@ -70,6 +73,10 @@ # assuming that the non-standard vendor 
 # default branch anyway, so we don't want this to match them anyway.
 vendor_revision = re.compile(r'^1\.1\.1\.\d+$')
 
+# Matches one RCS delta command line: an 'a' (add) or 'd' (delete)
+# opcode, the start line number, and the count of lines affected.
+deltaPattern = re.compile(r'^([ad])(\d+)\D+(\d+)$')
+
 
 def is_trunk_revision(rev):
   """Return True iff REV is a trunk revision."""
@@ -413,6 +420,14 @@ class _FileDataCollector(cvs2svn_rcspars
 
     pass
 
+  def set_head_revision(self, revision):
+    """This is a callback method declared in Sink."""
+    
+    self.git_head = revision
+    self.git_branches = {} 
+    self.git_next = {}
+    self.git_text = {}
+
   def define_revision(self, revision, timestamp, author, state,
                       branches, next):
     """This is a callback method declared in Sink."""
@@ -460,6 +475,11 @@ class _FileDataCollector(cvs2svn_rcspars
         self._primary_dependencies.append( (next, revision,) )
       else:
         self._primary_dependencies.append( (revision, next,) )
+        
+    # record the CVS diff structure, git code will build the revisions
+    self.git_branches[revision] = branches
+    self.git_next[revision] = next
+
 
   def _resolve_dependencies(self):
     """Store the primary and branch dependencies into the rev_data objects."""
@@ -476,7 +496,14 @@ class _FileDataCollector(cvs2svn_rcspars
     for branch_data in self.sdc.branches_data.values():
       # The branch_data's parent has the branch as a child regardless
       # of whether the branch had any subsequent commits:
-      parent_data = self._rev_data[branch_data.parent]
+      try:
+        parent_data = self._rev_data[branch_data.parent]
+      except KeyError:
+        sys.stderr.write("%s: in '%s':\n"
+                       "   Missing revision %s\n"
+                       % (warning_prefix,
+                          self.cvs_file.filename, branch_data.parent))
+        continue
       parent_data.branches_data.append(branch_data)
 
       if not Ctx().trunk_only and parent_data.child is not None:
@@ -497,7 +524,14 @@ class _FileDataCollector(cvs2svn_rcspars
     for tag_data_list in self.sdc.tags_data.values():
       for tag_data in tag_data_list:
         # The tag_data's rev has the tag as a child:
-        parent_data = self._rev_data[tag_data.rev]
+        try:
+          parent_data = self._rev_data[tag_data.rev]
+        except KeyError:
+            sys.stderr.write("%s: in '%s':\n"
+                       "   Missing revision %s\n"
+                       % (warning_prefix,
+                          self.cvs_file.filename, tag_data.rev))
+            continue
         parent_data.tags_data.append(tag_data)
 
         if not Ctx().trunk_only and parent_data.child is not None:
@@ -708,12 +742,14 @@ class _FileDataCollector(cvs2svn_rcspars
         self._get_rev_id(rev_data.child),
         self._determine_operation(rev_data),
         revision,
-        bool(text),
+        bool(text), "",
         lod,
         rev_data.is_first_on_branch(),
         tag_ids, branch_ids, closed_symbol_ids)
     rev_data.c_rev = c_rev
-    self.collect_data.add_cvs_revision(c_rev)
+
+    # record the CVS tree so that git can build the revisions
+    self.git_text[revision] = text
 
   def parse_completed(self):
     """Walk through all branches and tags and register them with their
@@ -725,6 +761,66 @@ class _FileDataCollector(cvs2svn_rcspars
 
     self.sdc.register_branch_blockers()
 
+    def git_write_file(revision, text):
+      header = 'blob ' + str(len(text)) + '\0'
+      sha1 = sha.new(header)
+      sha1.update(text)
+      digest = sha1.digest()
+      rev_data = self._rev_data[revision]
+      rev_data.c_rev.sha = sha1.hexdigest()
+      self.collect_data.add_cvs_revision(rev_data.c_rev)
+      print rev_data.c_rev.id, rev_data.c_rev.sha
+      
+      if digest not in self.collect_data.object_names:
+        self.collect_data.object_names[digest] = ""
+        self.collect_data.fimport.stdin.write(struct.pack("l",len(text)))
+        self.collect_data.fimport.stdin.write(text)
+        
+    def git_process_diffs(lines, deltas):
+      stack = []
+      delta = 0
+      while delta < len(deltas):
+        ops = deltaPattern.search(deltas[delta]).groups()
+
+        delta += 1
+        x = int(ops[1]) - 1
+        y = int(ops[2])
+        stack.append([ops[0], x, y, delta])
+        
+        if ops[0] == 'a':
+            delta += y
+
+      while stack != []:
+        ops = stack.pop()
+        if ops[0] == 'd':
+          lines[ops[1] : ops[1] + ops[2]] = {}
+        elif ops[0] == 'a':
+          lines[ops[1] + 1: ops[1] + 1] = deltas[ops[3] : ops[3] + ops[2]]
+
+    def git_process_revs(lines, revision):
+      
+      while revision:
+        deltas = self.git_text[revision].split('\n')
+        deltas.pop()
+        if len(deltas) > 0:
+          git_process_diffs(lines, deltas)
+
+        git_write_file(revision, ''.join(lines))
+
+        if len(self.git_branches[revision]):
+          for branch in self.git_branches[revision]:
+            git_process_revs(lines[:], branch)
+
+        revision = self.git_next[revision]
+        
+    revision = self.git_head
+
+    git_write_file(revision, self.git_text[revision])
+    
+    lines = self.git_text[revision].splitlines(True)
+    revision = self.git_next[revision]
+    if revision:
+      git_process_revs(lines, revision)
 
 ctrl_characters_regexp = re.compile('[\\\x00-\\\x1f\\\x7f]')
 
@@ -746,8 +842,14 @@ class _ProjectDataCollector:
     self.found_valid_file = False
     self.fatal_errors = []
     self.num_files = 0
+    collect_data.fimport = Popen(['git-fast-import', 'testme', '760000'], stdin = PIPE)
+    collect_data.object_names = {}
+
     os.path.walk(self.project.project_cvs_repos_path,
                  _ProjectDataCollector._visit_directory, self)
+    collect_data.fimport.stdin.close()
+    collect_data.object_names = {}
+
     if not self.fatal_errors and not self.found_valid_file:
       self.fatal_errors.append(
           '\n'
@@ -783,6 +885,7 @@ class _ProjectDataCollector:
     self.num_files += 1
 
   def _visit_directory(self, dirname, files):
+    
     for fname in files:
       verify_filename_legal(fname)
       if not fname.endswith(',v'):
@@ -793,7 +896,6 @@ class _ProjectDataCollector:
 
       self._process_file(pathname)
 
-
 class CollectData:
   """Repository for data collected by parsing the CVS repository files.
 
@@ -814,6 +916,7 @@ class CollectData:
     self.num_files = 0
     self.symbol_stats = SymbolStatisticsCollector()
     self.stats_keeper = stats_keeper
+    self.object_names = {}
 
     # Key generator to generate unique keys for each CVSRevision object:
     self.key_generator = KeyGenerator()
@@ -836,5 +939,3 @@ class CollectData:
 
   def write_symbol_stats(self):
     self.symbol_stats.write()
-
-
diff --git a/cvs2svn_lib/cvs_item.py b/cvs2svn_lib/cvs_item.py
index beabd7c..995fb76 100644
--- a/cvs2svn_lib/cvs_item.py
+++ b/cvs2svn_lib/cvs_item.py
@@ -46,7 +46,7 @@ class CVSRevision(CVSItem):
                id, cvs_file,
                timestamp, metadata_id,
                prev_id, next_id,
-               op, rev, deltatext_exists,
+               op, rev, deltatext_exists, sha,
                lod, first_on_branch,
                tag_ids, branch_ids, closed_symbol_ids):
     """Initialize a new CVSRevision object.
@@ -61,6 +61,7 @@ class CVSRevision(CVSItem):
        OP              -->  (char) OP_ADD, OP_CHANGE, or OP_DELETE
        REV             -->  (string) this CVS rev, e.g., '1.3'
        DELTATEXT_EXISTS-->  (bool) true iff non-empty deltatext
+       SHA             -->  (string) SHA-1 of the git blob for this revision
        LOD             -->  (LineOfDevelopment) LOD where this rev occurred
        FIRST_ON_BRANCH -->  (bool) true iff the first rev on its branch
        TAG_IDS         -->  (list of int) ids of all tags on this revision
@@ -79,6 +80,7 @@ class CVSRevision(CVSItem):
     self.prev_id = prev_id
     self.next_id = next_id
     self.deltatext_exists = deltatext_exists
+    self.sha = sha
     self.lod = lod
     self.first_on_branch = first_on_branch
     self.tag_ids = tag_ids
@@ -113,6 +115,7 @@ class CVSRevision(CVSItem):
         self.op,
         self.rev,
         self.deltatext_exists,
+        self.sha,
         lod_id,
         self.first_on_branch,
         ' '.join(['%x' % id for id in self.tag_ids]),
@@ -122,7 +125,7 @@ class CVSRevision(CVSItem):
   def __setstate__(self, data):
     (self.id, cvs_file_id, self.timestamp, self.metadata_id,
      self.prev_id, self.next_id, self.op, self.rev,
-     self.deltatext_exists, lod_id, self.first_on_branch,
+     self.deltatext_exists, self.sha, lod_id, self.first_on_branch,
      tag_ids, branch_ids, closed_symbol_ids) = data
     self.cvs_file = Ctx()._cvs_file_db.get_file(cvs_file_id)
     if lod_id is None:
diff --git a/cvs2svn_lib/dumpfile_delegate.py b/cvs2svn_lib/dumpfile_delegate.py
index fb0606f..03a06d4 100644
--- a/cvs2svn_lib/dumpfile_delegate.py
+++ b/cvs2svn_lib/dumpfile_delegate.py
@@ -232,67 +232,20 @@ class DumpfileDelegate(SVNRepositoryMirr
     # If the file has keywords, we must prevent CVS/RCS from expanding
     # the keywords because they must be unexpanded in the repository,
     # or Subversion will get confused.
-    pipe_cmd, pipe = Ctx().project.cvs_repository.get_co_pipe(
-        c_rev, suppress_keyword_substitution=s_item.has_keywords)
+    #pipe_cmd, pipe = Ctx().project.cvs_repository.get_co_pipe(
+    #    c_rev, suppress_keyword_substitution=s_item.has_keywords)
 
     self.dumpfile.write('Node-path: %s\n'
                         'Node-kind: file\n'
                         'Node-action: %s\n'
                         '%s'  # no property header if no props
-                        'Text-content-length: '
                         % (self._utf8_path(c_rev.svn_path),
                            action, props_header))
 
-    pos = self.dumpfile.tell()
-
-    self.dumpfile.write('0000000000000000\n'
-                        'Text-content-md5: 00000000000000000000000000000000\n'
-                        'Content-length: 0000000000000000\n'
-                        '\n')
-
     if prop_contents:
       self.dumpfile.write(prop_contents)
 
-    # Insert a filter to convert all EOLs to LFs if neccessary
-    if s_item.needs_eol_filter:
-      data_reader = LF_EOL_Filter(pipe.stdout)
-    else:
-      data_reader = pipe.stdout
-
-    # Insert the rev contents, calculating length and checksum as we go.
-    checksum = md5.new()
-    length = 0
-    while True:
-      buf = data_reader.read(config.PIPE_READ_SIZE)
-      if buf == '':
-        break
-      checksum.update(buf)
-      length += len(buf)
-      self.dumpfile.write(buf)
-
-    pipe.stdout.close()
-    error_output = pipe.stderr.read()
-    exit_status = pipe.wait()
-    if exit_status:
-      raise FatalError("The command '%s' failed with exit status: %s\n"
-                       "and the following output:\n"
-                       "%s" % (pipe_cmd, exit_status, error_output))
-
-    # Go back to patch up the length and checksum headers:
-    self.dumpfile.seek(pos, 0)
-    # We left 16 zeros for the text length; replace them with the real
-    # length, padded on the left with spaces:
-    self.dumpfile.write('%16d' % length)
-    # 16... + 1 newline + len('Text-content-md5: ') == 35
-    self.dumpfile.seek(pos + 35, 0)
-    self.dumpfile.write(checksum.hexdigest())
-    # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84
-    self.dumpfile.seek(pos + 84, 0)
-    # The content length is the length of property data, text data,
-    # and any metadata around/inside around them.
-    self.dumpfile.write('%16d' % (length + len(prop_contents)))
-    # Jump back to the end of the stream
-    self.dumpfile.seek(0, 2)
+    self.dumpfile.write('Git-sha1: %s\n' % (c_rev.sha))
 
     # This record is done (write two newlines -- one to terminate
     # contents that weren't themselves newline-termination, one to
diff --git a/cvs2svn_rcsparse/debug.py b/cvs2svn_rcsparse/debug.py
index cfeaf2b..c2d143d 100644
--- a/cvs2svn_rcsparse/debug.py
+++ b/cvs2svn_rcsparse/debug.py
@@ -1,22 +1,113 @@
-# -*-python-*-
 #
-# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved.
+# Copyright (C) 2000-2002 The ViewCVS Group. All Rights Reserved.
 #
 # By using this file, you agree to the terms and conditions set forth in
-# the LICENSE.html file which can be found at the top level of the ViewVC
-# distribution or at http://viewvc.org/license-1.html.
+# the LICENSE.html file which can be found at the top level of the ViewCVS
+# distribution or at http://viewcvs.sourceforge.net/license-1.html.
 #
-# For more information, visit http://viewvc.org/
+# Contact information:
+#   Greg Stein, PO Box 760, Palo Alto, CA, 94302
+#   gstein@xxxxxxxx, http://viewcvs.sourceforge.net/
+#
+# -----------------------------------------------------------------------
+#
+# This software is being maintained as part of the ViewCVS project.
+# Information is available at:
+#    http://viewcvs.sourceforge.net/
 #
 # -----------------------------------------------------------------------
 
 """debug.py: various debugging tools for the rcsparse package."""
 
 import time
+import re
+import sha
+import zlib
+import os
+import struct
+from subprocess import Popen,PIPE
 
 from __init__ import parse
 import common
 
+deltaPattern = re.compile(r'^([ad])(\d+)\D+(\d+)$')
+
+class RevisionsSink(common.Sink):
+  
+  def __init__(self, fimport):
+    self.fimport = fimport
+                         
+  def set_head_revision(self, revision):
+    self.git_head = revision
+    self.git_branches = {} 
+    self.git_next = {}
+    self.git_text = {}
+
+  def define_revision(self, revision, timestamp, author, state,
+                      branches, next):
+    self.git_branches[revision] = branches
+    self.git_next[revision] = next
+
+  def set_revision_info(self, revision, log, text):
+    self.git_text[revision] = text
+
+  def parse_completed(self):
+
+    def write_file(text):
+      header = 'blob ' + str(len(text)) + '\0'
+      sha1 = sha.new(header)
+      sha1.update(text)
+      name = sha1.hexdigest()
+      
+      print 'length is ', len(text)
+      self.fimport.stdin.write(struct.pack("l",len(text)))
+      self.fimport.stdin.write(text)
+        
+    def process_diffs(lines, deltas):
+      stack = []
+      delta = 0
+      while delta < len(deltas):
+        ops = deltaPattern.search(deltas[delta]).groups()
+
+        delta += 1
+        x = int(ops[1]) - 1
+        y = int(ops[2])
+        stack.append([ops[0], x, y, delta])
+        
+        if ops[0] == 'a':
+            delta += y
+
+      while stack != []:
+        ops = stack.pop()
+        if ops[0] == 'd':
+          lines[ops[1] : ops[1] + ops[2]] = {}
+        elif ops[0] == 'a':
+          lines[ops[1] + 1: ops[1] + 1] = deltas[ops[3] : ops[3] + ops[2]]
+
+    def process_revs(lines, revision):
+      
+      while revision:
+        deltas = self.git_text[revision].split('\n')
+        deltas.pop()
+        if len(deltas) > 0:
+          process_diffs(lines, deltas)
+
+        write_file(''.join(lines))
+
+        if len(self.git_branches[revision]):
+          for branch in self.git_branches[revision]:
+            process_revs(lines[:], branch)
+
+        revision = self.git_next[revision]
+        
+    revision = self.git_head
+
+    write_file(self.git_text[revision])
+    
+    lines = self.git_text[revision].splitlines(True)
+    revision = self.git_next[revision]
+    if revision:
+      process_revs(lines, revision)
 
 class DebugSink(common.Sink):
   def set_head_revision(self, revision):
@@ -46,7 +137,7 @@ class DebugSink(common.Sink):
   def set_revision_info(self, revision, log, text):
     print 'revision:', revision
     print '    log:', log
-    print '    text:', text[:100], '...'
+    print '    text:', text
 
 
 class DumpSink(common.Sink):
@@ -90,9 +181,17 @@ class DumpSink(common.Sink):
     print 'parse_completed'
 
 
+def debug_file(fname):
+  parse(open(fname, 'rb'), DebugSink())
+
 def dump_file(fname):
   parse(open(fname, 'rb'), DumpSink())
 
+def revisions_file(fname):
+  fimport = Popen(['git-fast-import', 'testme'], stdin = PIPE)
+  parse(open(fname, 'rb'), RevisionsSink(fimport))
+  fimport.stdin.close()
+
 def time_file(fname):
   f = open(fname, 'rb')
   s = common.Sink()
@@ -116,7 +215,11 @@ if __name__ == '__main__':
     _usage()
   if sys.argv[1] == 'dump':
     dump_file(sys.argv[2])
+  elif sys.argv[1] == 'debug':
+    debug_file(sys.argv[2])
   elif sys.argv[1] == 'time':
     time_file(sys.argv[2])
+  elif sys.argv[1] == 'revisions':
+    revisions_file(sys.argv[2])
   else:
     _usage()
diff --git a/cvs2svn_rcsparse/default.py b/cvs2svn_rcsparse/default.py
index 14c9958..4e108f4 100644
--- a/cvs2svn_rcsparse/default.py
+++ b/cvs2svn_rcsparse/default.py
@@ -24,7 +24,7 @@ class _TokenStream:
   # the algorithm is about the same speed for any CHUNK_SIZE chosen.
   # grab a good-sized chunk, but not too large to overwhelm memory.
   # note: we use a multiple of a standard block size
-  CHUNK_SIZE  = 192 * 512  # about 100k
+  CHUNK_SIZE  = 4096 * 512  # about 2MB
 
 #  CHUNK_SIZE  = 5	# for debugging, make the function grind...
 

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]