On 8/10/06, Martin Langhoff <martin.langhoff@xxxxxxxxx> wrote:
On 8/10/06, Jon Smirl <jonsmirl@xxxxxxxxx> wrote:
> I've finally got cvs2svn running through pass 7 now. It took me a

Jon, great stuff. Is this published somewhere I can pull it from?
This is a diff relative to your git repo of cvs2svn
cheers, martin
-- Jon Smirl jonsmirl@xxxxxxxxx
diff --git a/cvs2svn_lib/collect_data.py b/cvs2svn_lib/collect_data.py index c05e387..364582d 100644 --- a/cvs2svn_lib/collect_data.py +++ b/cvs2svn_lib/collect_data.py @@ -24,6 +24,10 @@ import os import re import time import stat +import sha +import zlib +import struct +from subprocess import Popen,PIPE from cvs2svn_lib.boolean import * from cvs2svn_lib.set_support import * @@ -52,7 +56,6 @@ from cvs2svn_lib.metadata_database impor import cvs2svn_rcsparse - branch_tag_re = re.compile(r''' ^ ((?:\d+\.\d+\.)+) # A nonzero even number of digit groups w/trailing dot @@ -70,6 +73,10 @@ # assuming that the non-standard vendor # default branch anyway, so we don't want this to match them anyway. vendor_revision = re.compile(r'^1\.1\.1\.\d+$') +# Used to parse revision deltas which either add or delete text +# format is add/delete start range +deltaPattern = re.compile(r'^([ad])(\d+)\D+(\d+)$') + def is_trunk_revision(rev): """Return True iff REV is a trunk revision.""" @@ -413,6 +420,14 @@ class _FileDataCollector(cvs2svn_rcspars pass + def set_head_revision(self, revision): + """This is a callback method declared in Sink.""" + + self.git_head = revision + self.git_branches = {} + self.git_next = {} + self.git_text = {} + def define_revision(self, revision, timestamp, author, state, branches, next): """This is a callback method declared in Sink.""" @@ -460,6 +475,11 @@ class _FileDataCollector(cvs2svn_rcspars self._primary_dependencies.append( (next, revision,) ) else: self._primary_dependencies.append( (revision, next,) ) + + # record the CVS diff structure, git code will build the revisions + self.git_branches[revision] = branches + self.git_next[revision] = next + def _resolve_dependencies(self): """Store the primary and branch dependencies into the rev_data objects.""" @@ -476,7 +496,14 @@ class _FileDataCollector(cvs2svn_rcspars for branch_data in self.sdc.branches_data.values(): # The branch_data's parent has the branch as a child regardless # of whether the branch 
had any subsequent commits: - parent_data = self._rev_data[branch_data.parent] + try: + parent_data = self._rev_data[branch_data.parent] + except KeyError: + sys.stderr.write("%s: in '%s':\n" + " Missing revision %s\n" + % (warning_prefix, + self.cvs_file.filename, branch_data.parent)) + continue parent_data.branches_data.append(branch_data) if not Ctx().trunk_only and parent_data.child is not None: @@ -497,7 +524,14 @@ class _FileDataCollector(cvs2svn_rcspars for tag_data_list in self.sdc.tags_data.values(): for tag_data in tag_data_list: # The tag_data's rev has the tag as a child: - parent_data = self._rev_data[tag_data.rev] + try: + parent_data = self._rev_data[tag_data.rev] + except KeyError: + sys.stderr.write("%s: in '%s':\n" + " Missing revision %s\n" + % (warning_prefix, + self.cvs_file.filename, tag_data.rev)) + continue parent_data.tags_data.append(tag_data) if not Ctx().trunk_only and parent_data.child is not None: @@ -708,12 +742,14 @@ class _FileDataCollector(cvs2svn_rcspars self._get_rev_id(rev_data.child), self._determine_operation(rev_data), revision, - bool(text), + bool(text), "", lod, rev_data.is_first_on_branch(), tag_ids, branch_ids, closed_symbol_ids) rev_data.c_rev = c_rev - self.collect_data.add_cvs_revision(c_rev) + + # record the CVS tree so that git can build the revisions + self.git_text[revision] = text def parse_completed(self): """Walk through all branches and tags and register them with their @@ -725,6 +761,66 @@ class _FileDataCollector(cvs2svn_rcspars self.sdc.register_branch_blockers() + def git_write_file(revision, text): + header = 'blob ' + str(len(text)) + '\0' + sha1 = sha.new(header) + sha1.update(text) + digest = sha1.digest() + rev_data = self._rev_data[revision] + rev_data.c_rev.sha = sha1.hexdigest() + self.collect_data.add_cvs_revision(rev_data.c_rev) + print rev_data.c_rev.id, rev_data.c_rev.sha + + if digest not in self.collect_data.object_names: + self.collect_data.object_names[digest] = "" + 
self.collect_data.fimport.stdin.write(struct.pack("l",len(text))) + self.collect_data.fimport.stdin.write(text) + + def git_process_diffs(lines, deltas): + stack = [] + delta = 0 + while delta < len(deltas): + ops = deltaPattern.search(deltas[delta]).groups() + + delta += 1 + x = int(ops[1]) - 1 + y = int(ops[2]) + stack.append([ops[0], x, y, delta]) + + if ops[0] == 'a': + delta += y + + while stack != []: + ops = stack.pop() + if ops[0] == 'd': + lines[ops[1] : ops[1] + ops[2]] = {} + elif ops[0] == 'a': + lines[ops[1] + 1: ops[1] + 1] = deltas[ops[3] : ops[3] + ops[2]] + + def git_process_revs(lines, revision): + + while revision: + deltas = self.git_text[revision].split('\n') + deltas.pop() + if len(deltas) > 0: + git_process_diffs(lines, deltas) + + git_write_file(revision, ''.join(lines)) + + if len(self.git_branches[revision]): + for branch in self.git_branches[revision]: + git_process_revs(lines[:], branch) + + revision = self.git_next[revision] + + revision = self.git_head + + git_write_file(revision, self.git_text[revision]) + + lines = self.git_text[revision].splitlines(True) + revision = self.git_next[revision] + if revision: + git_process_revs(lines, revision) ctrl_characters_regexp = re.compile('[\\\x00-\\\x1f\\\x7f]') @@ -746,8 +842,14 @@ class _ProjectDataCollector: self.found_valid_file = False self.fatal_errors = [] self.num_files = 0 + collect_data.fimport = Popen(['git-fast-import', 'testme', '760000'], stdin = PIPE) + collect_data.object_names = {} + os.path.walk(self.project.project_cvs_repos_path, _ProjectDataCollector._visit_directory, self) + collect_data.fimport.stdin.close() + collect_data.object_names = {} + if not self.fatal_errors and not self.found_valid_file: self.fatal_errors.append( '\n' @@ -783,6 +885,7 @@ class _ProjectDataCollector: self.num_files += 1 def _visit_directory(self, dirname, files): + for fname in files: verify_filename_legal(fname) if not fname.endswith(',v'): @@ -793,7 +896,6 @@ class _ProjectDataCollector: 
self._process_file(pathname) - class CollectData: """Repository for data collected by parsing the CVS repository files. @@ -814,6 +916,7 @@ class CollectData: self.num_files = 0 self.symbol_stats = SymbolStatisticsCollector() self.stats_keeper = stats_keeper + self.object_names = [] # Key generator to generate unique keys for each CVSRevision object: self.key_generator = KeyGenerator() @@ -836,5 +939,3 @@ class CollectData: def write_symbol_stats(self): self.symbol_stats.write() - - diff --git a/cvs2svn_lib/cvs_item.py b/cvs2svn_lib/cvs_item.py index beabd7c..995fb76 100644 --- a/cvs2svn_lib/cvs_item.py +++ b/cvs2svn_lib/cvs_item.py @@ -46,7 +46,7 @@ class CVSRevision(CVSItem): id, cvs_file, timestamp, metadata_id, prev_id, next_id, - op, rev, deltatext_exists, + op, rev, deltatext_exists, sha, lod, first_on_branch, tag_ids, branch_ids, closed_symbol_ids): """Initialize a new CVSRevision object. @@ -61,6 +61,7 @@ class CVSRevision(CVSItem): OP --> (char) OP_ADD, OP_CHANGE, or OP_DELETE REV --> (string) this CVS rev, e.g., '1.3' DELTATEXT_EXISTS--> (bool) true iff non-empty deltatext + SHA --> sha1 of git revision LOD --> (LineOfDevelopment) LOD where this rev occurred FIRST_ON_BRANCH --> (bool) true iff the first rev on its branch TAG_IDS --> (list of int) ids of all tags on this revision @@ -79,6 +80,7 @@ class CVSRevision(CVSItem): self.prev_id = prev_id self.next_id = next_id self.deltatext_exists = deltatext_exists + self.sha = sha self.lod = lod self.first_on_branch = first_on_branch self.tag_ids = tag_ids @@ -113,6 +115,7 @@ class CVSRevision(CVSItem): self.op, self.rev, self.deltatext_exists, + self.sha, lod_id, self.first_on_branch, ' '.join(['%x' % id for id in self.tag_ids]), @@ -122,7 +125,7 @@ class CVSRevision(CVSItem): def __setstate__(self, data): (self.id, cvs_file_id, self.timestamp, self.metadata_id, self.prev_id, self.next_id, self.op, self.rev, - self.deltatext_exists, lod_id, self.first_on_branch, + self.deltatext_exists, self.sha, lod_id, 
self.first_on_branch, tag_ids, branch_ids, closed_symbol_ids) = data self.cvs_file = Ctx()._cvs_file_db.get_file(cvs_file_id) if lod_id is None: diff --git a/cvs2svn_lib/dumpfile_delegate.py b/cvs2svn_lib/dumpfile_delegate.py index fb0606f..03a06d4 100644 --- a/cvs2svn_lib/dumpfile_delegate.py +++ b/cvs2svn_lib/dumpfile_delegate.py @@ -232,67 +232,20 @@ class DumpfileDelegate(SVNRepositoryMirr # If the file has keywords, we must prevent CVS/RCS from expanding # the keywords because they must be unexpanded in the repository, # or Subversion will get confused. - pipe_cmd, pipe = Ctx().project.cvs_repository.get_co_pipe( - c_rev, suppress_keyword_substitution=s_item.has_keywords) + #pipe_cmd, pipe = Ctx().project.cvs_repository.get_co_pipe( + # c_rev, suppress_keyword_substitution=s_item.has_keywords) self.dumpfile.write('Node-path: %s\n' 'Node-kind: file\n' 'Node-action: %s\n' '%s' # no property header if no props - 'Text-content-length: ' % (self._utf8_path(c_rev.svn_path), action, props_header)) - pos = self.dumpfile.tell() - - self.dumpfile.write('0000000000000000\n' - 'Text-content-md5: 00000000000000000000000000000000\n' - 'Content-length: 0000000000000000\n' - '\n') - if prop_contents: self.dumpfile.write(prop_contents) - # Insert a filter to convert all EOLs to LFs if neccessary - if s_item.needs_eol_filter: - data_reader = LF_EOL_Filter(pipe.stdout) - else: - data_reader = pipe.stdout - - # Insert the rev contents, calculating length and checksum as we go. 
- checksum = md5.new() - length = 0 - while True: - buf = data_reader.read(config.PIPE_READ_SIZE) - if buf == '': - break - checksum.update(buf) - length += len(buf) - self.dumpfile.write(buf) - - pipe.stdout.close() - error_output = pipe.stderr.read() - exit_status = pipe.wait() - if exit_status: - raise FatalError("The command '%s' failed with exit status: %s\n" - "and the following output:\n" - "%s" % (pipe_cmd, exit_status, error_output)) - - # Go back to patch up the length and checksum headers: - self.dumpfile.seek(pos, 0) - # We left 16 zeros for the text length; replace them with the real - # length, padded on the left with spaces: - self.dumpfile.write('%16d' % length) - # 16... + 1 newline + len('Text-content-md5: ') == 35 - self.dumpfile.seek(pos + 35, 0) - self.dumpfile.write(checksum.hexdigest()) - # 35... + 32 bytes of checksum + 1 newline + len('Content-length: ') == 84 - self.dumpfile.seek(pos + 84, 0) - # The content length is the length of property data, text data, - # and any metadata around/inside around them. - self.dumpfile.write('%16d' % (length + len(prop_contents))) - # Jump back to the end of the stream - self.dumpfile.seek(0, 2) + self.dumpfile.write('Git-sha1: %s\n' % (c_rev.sha)) # This record is done (write two newlines -- one to terminate # contents that weren't themselves newline-termination, one to diff --git a/cvs2svn_rcsparse/debug.py b/cvs2svn_rcsparse/debug.py index cfeaf2b..c2d143d 100644 --- a/cvs2svn_rcsparse/debug.py +++ b/cvs2svn_rcsparse/debug.py @@ -1,22 +1,113 @@ -# -*-python-*- # -# Copyright (C) 1999-2006 The ViewCVS Group. All Rights Reserved. +# Copyright (C) 2000-2002 The ViewCVS Group. All Rights Reserved. # # By using this file, you agree to the terms and conditions set forth in -# the LICENSE.html file which can be found at the top level of the ViewVC -# distribution or at http://viewvc.org/license-1.html. 
+# the LICENSE.html file which can be found at the top level of the ViewCVS +# distribution or at http://viewcvs.sourceforge.net/license-1.html. # -# For more information, visit http://viewvc.org/ +# Contact information: +# Greg Stein, PO Box 760, Palo Alto, CA, 94302 +# gstein@xxxxxxxx, http://viewcvs.sourceforge.net/ +# +# ----------------------------------------------------------------------- +# +# This software is being maintained as part of the ViewCVS project. +# Information is available at: +# http://viewcvs.sourceforge.net/ # # ----------------------------------------------------------------------- """debug.py: various debugging tools for the rcsparse package.""" import time +import re +import sha +import zlib +import os +import struct +from subprocess import Popen,PIPE from __init__ import parse import common +deltaPattern = re.compile(r'^([ad])(\d+)\D+(\d+)$') + +class RevisionsSink(common.Sink): + + def __init__(self, fimport): + self.fimport = fimport + + def set_head_revision(self, revision): + self.git_head = revision + self.git_branches = {} + self.git_next = {} + self.git_text = {} + + def define_revision(self, revision, timestamp, author, state, + branches, next): + self.git_branches[revision] = branches + self.git_next[revision] = next + + def set_revision_info(self, revision, log, text): + self.git_text[revision] = text + + def parse_completed(self): + + def write_file(text): + header = 'blob ' + str(len(text)) + '\0' + sha1 = sha.new(header) + sha1.update(text) + name = sha1.hexdigest() + + print 'length is ', len(text) + self.fimport.stdin.write(struct.pack("l",len(text))) + self.fimport.stdin.write(text) + + def process_diffs(lines, deltas): + stack = [] + delta = 0 + while delta < len(deltas): + ops = deltaPattern.search(deltas[delta]).groups() + + delta += 1 + x = int(ops[1]) - 1 + y = int(ops[2]) + stack.append([ops[0], x, y, delta]) + + if ops[0] == 'a': + delta += y + + while stack != []: + ops = stack.pop() + if ops[0] == 'd': + 
lines[ops[1] : ops[1] + ops[2]] = {} + elif ops[0] == 'a': + lines[ops[1] + 1: ops[1] + 1] = deltas[ops[3] : ops[3] + ops[2]] + + def process_revs(lines, revision): + + while revision: + deltas = self.git_text[revision].split('\n') + deltas.pop() + if len(deltas) > 0: + process_diffs(lines, deltas) + + write_file(''.join(lines)) + + if len(self.git_branches[revision]): + for branch in self.git_branches[revision]: + process_revs(lines[:], branch) + + revision = self.git_next[revision] + + revision = self.git_head + + write_file(self.git_text[revision]) + + lines = self.git_text[revision].splitlines(True) + revision = self.git_next[revision] + if revision: + process_revs(lines, revision) class DebugSink(common.Sink): def set_head_revision(self, revision): @@ -46,7 +137,7 @@ class DebugSink(common.Sink): def set_revision_info(self, revision, log, text): print 'revision:', revision print ' log:', log - print ' text:', text[:100], '...' + print ' text:', text class DumpSink(common.Sink): @@ -90,9 +181,17 @@ class DumpSink(common.Sink): print 'parse_completed' +def debug_file(fname): + parse(open(fname, 'rb'), DebugSink()) + def dump_file(fname): parse(open(fname, 'rb'), DumpSink()) +def revisions_file(fname): + fimport = Popen(['git-fast-import', 'testme'], stdin = PIPE) + parse(open(fname, 'rb'), RevisionsSink(fimport)) + fimport.stdin.close() + def time_file(fname): f = open(fname, 'rb') s = common.Sink() @@ -116,7 +215,11 @@ if __name__ == '__main__': _usage() if sys.argv[1] == 'dump': dump_file(sys.argv[2]) + elif sys.argv[1] == 'debug': + debug_file(sys.argv[2]) elif sys.argv[1] == 'time': time_file(sys.argv[2]) + elif sys.argv[1] == 'revisions': + revisions_file(sys.argv[2]) else: _usage() diff --git a/cvs2svn_rcsparse/default.py b/cvs2svn_rcsparse/default.py index 14c9958..4e108f4 100644 --- a/cvs2svn_rcsparse/default.py +++ b/cvs2svn_rcsparse/default.py @@ -24,7 +24,7 @@ class _TokenStream: # the algorithm is about the same speed for any CHUNK_SIZE chosen. 
# grab a good-sized chunk, but not too large to overwhelm memory. # note: we use a multiple of a standard block size - CHUNK_SIZE = 192 * 512 # about 100k + CHUNK_SIZE = 4096 * 512 # about 2MB # CHUNK_SIZE = 5 # for debugging, make the function grind...