This is meant to be a post-processing step after SVN-to-Git conversion by SubGit (https://subgit.com/), which creates a ".gitsvnextmodules" file that we will use for svn:externals conversion. Signed-off-by: Markus Heidelberg <markus.heidelberg@xxxxxx> --- The SVN-to-Git mapping file format might have to be changed in the future when implementing other features, don't know yet. At least I wanted to publish this script now that the basics work. Maybe there are even some other late-adopters for whom this script might still be useful. I'm eager for hints where descriptions/comments are confusing or incomplete. This script was already used for migration of an SVN repository in our company. Relates to https://github.com/newren/git-filter-repo/issues/14 contrib/filter-repo-demos/README.md | 1 + .../filter-repo-demos/convert-svnexternals | 587 ++++++++++++++++++ 2 files changed, 588 insertions(+) create mode 100644 contrib/filter-repo-demos/convert-svnexternals diff --git a/contrib/filter-repo-demos/README.md b/contrib/filter-repo-demos/README.md index ef021b1..9612428 100644 --- a/contrib/filter-repo-demos/README.md +++ b/contrib/filter-repo-demos/README.md @@ -16,6 +16,7 @@ lint-history |Run some lint command on all non-binary files in history. clean-ignore |Delete files from history which match current gitignore rules. filter-lamely (or filter‑branch‑ish) |A nearly bug compatible re-implementation of filter-branch (the git testsuite passes using it instead of filter-branch), with some performance tricks to make it several times faster (though it's still glacially slow compared to filter-repo). bfg-ish |A re-implementation of most of BFG Repo Cleaner, with new features and bug fixes. +convert-svnexternals |Insert Git submodules according to SVN externals. 
#!/usr/bin/env python3

"""
This is a program that will insert Git submodules according to SVN externals
definitions (svn:externals properties) from the original Subversion repository
throughout the history.

Information about the externals is obtained from the ".gitsvnextmodules" file
created during SVN-to-Git conversion by SubGit (https://subgit.com/). Its
config option "translate.externals=true" had to be used therefore.

Actual modifications:
- Insert gitlinks (mode 160000) into the tree.
- Add .gitmodules file with relevant sections.
- Remove sections converted to submodules from .gitsvnextmodules file
  and delete it if empty.

.gitsvnextmodules example:
[submodule "somedir/extdir"]
    path = somedir/extdir
    owner = somedir
    url = https://svn.example.com/somesvnrepo/trunk
    revision = 1234
    branch = /
    fetch = :refs/remotes/git-svn
    remote = svn
    type = dir

Resulting addition in "somedir" tree (cat-file pretty-print format):
160000 commit 1234123412341234123412341234123412341234    extdir

Resulting .gitmodules entry:
[submodule "somedir/extdir"]
    path = somedir/extdir
    url = https://git.example.com/somegitrepo.git

SVN-to-Git mapping file:
Can be created from SubGit's "refs/svn/map".
One line per mapping in following format:
<svn url> TAB <svn rev> TAB <git url> TAB <git commit> TAB <state>
- Leading '#' can be used for comments.
- <svn url> must not contain a trailing slash.
- <state> has to be "commit" to be usable, but can be "missing" if <git commit>
  does not exist in the repository anymore. Adopted from git-cat-file output.
Example (fields are separated by TAB characters):
https://svn.example.com/somesvnrepo/trunk 1234 https://git.example.com/somegitrepo.git 1234123412341234123412341234123412341234 commit

Features:
- Repeatedly added/removed externals will be handled properly.
- Externals replaced by directly added files and vice versa will be handled
  properly.

Caveats:
- This script must NOT be run repeatedly. A second invocation would lead to a
  different result in case the externals could only be converted partially.
- Inconsistent SVN repositories (with failing checkout) not handled, i.e.
  - normal directory and external with the same path
  - external path not existing for the given revision
- No attention was paid to non-ASCII and special characters in gitlink paths,
  might cause problems.
- There is no error handling for mandatory options missing in .gitsvnextmodules
  file. The script would crash in case of such buggy files, but that shouldn't
  happen in practice.

TODO:
- Add external files directly.
- Alternatively add external directories directly instead of using a submodule.
"""

"""
Please see the
  ***** API BACKWARD COMPATIBILITY CAVEAT *****
near the top of git-filter-repo.
"""

import argparse
import configparser
import os
import shutil
import subprocess
import sys
from urllib.parse import urlsplit

try:
    import git_filter_repo as fr
except ImportError:
    # Reported by main() with the usual error message. Deferring the failure
    # keeps the pure helper functions importable (e.g. for unit tests) on
    # systems without git-filter-repo.
    fr = None

# Root URL of the SVN repository (without trailing slash), set from
# --svn-root-url in main().
svn_root_url = ""

# List of dicts with the keys svn_url, svn_rev, git_url, git_commit and state,
# filled by read_mappings() in main().
svn_git_mappings = []

# Long-running "git cat-file --batch" process used by parse_config().
cat_file_process = None

# The fr.RepoFilter instance of the currently running filter pass; the commit
# callbacks use it to insert newly created blobs into the stream.
repo_filter = None

# Unique svn:externals URLs collected by analyze_externals().
analysis = {'dir_ext_orig': set(),
            'dir_ext_abs': set(),
            'file_ext_orig': set(),
            'file_ext_abs': set()}


def parse_args(argv=None):
    """
    Parse and return arguments for this script.

    Also do some argument sanity checks and adaptions.

    argv is the list of command line arguments to parse; None (the default)
    makes argparse use sys.argv[1:].

    Raises SystemExit if the combination of options makes no sense.
    """
    parser = argparse.ArgumentParser(
        description="Add Git submodules according to svn:externals from .gitsvnextmodules. "
                    "As preparation for this conversion process, an analysis can be performed.")

    parser.add_argument('--force', '-f', action='store_true',
        help="Rewrite repository history even if the current repo does not "
             "look like a fresh clone.")
    parser.add_argument('--refs', nargs='+',
        help="Limit history rewriting to the specified refs. Option is directly "
             "forwarded to git-filter-repo, see there for details and caveats. "
             "Use for debugging purposes only!")
    parser.add_argument('--svn-root-url',
        help="Root URL of the corresponding SVN repository, "
             "needed for conversion of relative to absolute external URLs.")

    analysis_group = parser.add_argument_group(title="Analysis")
    analysis_group.add_argument('--analyze', action='store_true',
        help="Analyze repository history and create auxiliary files for conversion process.")
    analysis_group.add_argument('--report-dir', type=os.fsencode,
        help="Directory to write report, defaults to GIT_DIR/filter-repo/svnexternals, "
             "refuses to run if exists, --force delete existing dir first.")

    conversion_group = parser.add_argument_group(title="Conversion")
    conversion_group.add_argument('--svn-git-mapfiles', type=os.fsencode, nargs='+', metavar='MAPFILE',
        help="Files with SVN-to-Git revision mappings for SVN externals conversion.")

    args = parser.parse_args(argv)

    if args.analyze and args.svn_git_mapfiles:
        raise SystemExit("Error: --svn-git-mapfiles makes no sense with --analyze.")

    if not args.analyze and not args.svn_git_mapfiles:
        raise SystemExit("Error: --svn-git-mapfiles is required for the conversion process.")

    return args


def read_mappings(mapfiles):
    """
    Read files with SVN-to-Git mappings and return a list of mappings from it.

    Each mapping is a dict with the keys svn_url, svn_rev (int), git_url,
    git_commit and state. Blank lines and lines starting with '#' are skipped.

    Raises SystemExit with the file name and line number if a line does not
    consist of five TAB-separated fields or its revision is not an integer.
    """
    mappings = []
    for mapfile in mapfiles:
        with open(mapfile, "rb") as f:
            for lineno, raw_line in enumerate(f, start=1):
                line = raw_line.rstrip(b'\r\n')

                # Skip blank and comment lines
                if not line or line.startswith(b'#'):
                    continue

                # Convert to string for use with configparser later
                fields = line.decode().split('\t', 4)
                if len(fields) != 5:
                    raise SystemExit("Error: %s:%d: expected 5 TAB-separated fields, found %d."
                                     % (os.fsdecode(mapfile), lineno, len(fields)))

                svn_url, svn_rev, git_url, git_commit, state = fields
                try:
                    svn_rev = int(svn_rev)
                except ValueError:
                    raise SystemExit("Error: %s:%d: invalid SVN revision \"%s\"."
                                     % (os.fsdecode(mapfile), lineno, svn_rev)) from None

                mappings.append({'svn_url': svn_url,
                                 'svn_rev': svn_rev,
                                 'git_url': git_url,
                                 'git_commit': git_commit,
                                 'state': state})
    return mappings


def parse_config(blob_id):
    """
    Create a configparser object for a .gitsvnextmodules/.gitmodules file from
    its blob ID.

    blob_id is the blob hash as bytes, or None, in which case an empty
    configparser object is returned. The blob contents are requested from the
    global "git cat-file --batch" process.

    Raises SystemExit if cat-file does not answer with an object header.
    """
    # Interpolation must be disabled: URLs often contain '%' (e.g. "%20"),
    # which the default BasicInterpolation would reject.
    parsed_config = configparser.ConfigParser(interpolation=None)

    if blob_id is not None:
        # Get the blob contents
        cat_file_process.stdin.write(blob_id + b'\n')
        cat_file_process.stdin.flush()
        header = cat_file_process.stdout.readline().split()
        if len(header) != 3:
            # E.g. "<object> missing"
            raise SystemExit("Error: Could not read blob %s from the repository."
                             % blob_id.decode())
        objhash, objtype, objsize = header
        # cat-file terminates the contents with an additional newline
        contents_plus_newline = cat_file_process.stdout.read(int(objsize) + 1)

        # Parse it
        parsed_config.read_string(contents_plus_newline.decode())

    return parsed_config


def format_git_config(parsed_config):
    """
    Return the content of a configparser object as string in Git config style:
    "[section]" lines followed by TAB-indented "option = value" lines.
    """
    lines = []
    for sec in parsed_config.sections():
        lines.append("[" + sec + "]\n")
        for opt in parsed_config.options(sec):
            # raw=True: never interpolate '%', whatever the parser was created with
            lines.append("\t" + opt + " = " + parsed_config.get(sec, opt, raw=True) + "\n")
    return ''.join(lines)


def create_blob(parsed_config):
    """
    Create a filter-repo blob object from a .gitsvnextmodules/.gitmodules
    configparser object according to Git config style.
    """
    return fr.Blob(format_git_config(parsed_config).encode())


def get_git_url(svn_url):
    """
    Get the Git URL for a corresponding SVN URL.

    Returns the URL of the first matching entry in the global mappings, or
    None if there is no mapping for the SVN URL.
    """
    for entry in svn_git_mappings:
        if entry['svn_url'] == svn_url:
            return entry['git_url']
    return None


def get_git_commit_hash(svn_url, svn_rev):
    """
    Get the Git commit hash for its corresponding SVN URL+revision.

    The mapping is not restricted to the exact revision, but also uses the next
    lower revision found. Needed when the revision was set to that of the root
    URL instead of to that of the specific subdirectory (e.g. trunk). TortoiseSVN
    behaves so when setting the external to HEAD.

    Returns None if no mapping was found or if the state of the best mapping is
    not "commit".
    """
    best_entry = None
    best_rev = 0

    for entry in svn_git_mappings:
        if entry['svn_url'] == svn_url and best_rev < entry['svn_rev'] <= svn_rev:
            best_entry = entry
            best_rev = entry['svn_rev']

    if best_entry is not None and best_entry['state'] == "commit":
        return best_entry['git_commit']
    return None


def get_absolute_svn_url(svnext_url, svn_root_url):
    """
    Convert a relative svn:externals URL to an absolute one.

    Returns a tuple (success, url).

    If the format is unsupported, return the URL unchanged with success=False.
    If no root URL is given or the URL is absolute already, return it unchanged.

    In all cases, even if returned "unchanged", trailing slashes are removed.
    """
    # Remove trailing slash(es)
    svnext_url = svnext_url.rstrip("/")
    svn_root_url = svn_root_url.rstrip("/")

    # Normalize URLs in relative format
    svn_root_parsed = urlsplit(svn_root_url)
    if svnext_url.startswith(("../", "^/../")):  # unsupported
        return False, svnext_url
    elif not svn_root_url:
        pass  # unchanged
    elif svnext_url.startswith("^/"):
        # Relative to the repository root
        svnext_url = svn_root_url + svnext_url[1:]
    elif svnext_url.startswith("//"):
        # Relative to the scheme
        svnext_url = svn_root_parsed.scheme + ":" + svnext_url
    elif svnext_url.startswith("/"):
        # Relative to the server root
        svnext_url = svn_root_parsed.scheme + "://" + svn_root_parsed.netloc + svnext_url

    return True, svnext_url


def add_submodule_tree_entry(commit, parsed_config, section):
    """
    Add a submodule entry to the tree of a Git commit.

    SVN externals information obtained from parsed .gitsvnextmodules file.

    Returns True if a gitlink was appended to the file changes of the commit,
    False if the external had to be skipped.
    """
    external = parsed_config[section]

    # Skip type=file (SVN file external), not possible as submodule
    if external['type'] != 'dir':
        return False

    success, svn_url = get_absolute_svn_url(external['url'], svn_root_url)
    # Skip unsupported URL format
    if not success:
        return False

    # Get SVN revision
    if not parsed_config.has_option(section, 'revision'):
        # TODO: revision has to be guessed according to commit timestamp, skip for now
        return False
    svn_rev = int(external['revision'])

    # SVN url+revision mapping to Git commit
    git_hash = get_git_commit_hash(svn_url, svn_rev)
    # Skip missing or unusable mapping
    if git_hash is None:
        return False

    # Add gitlink to tree
    dirname = external['path'].encode()
    commit.file_changes.append(fr.FileChange(b'M', dirname, git_hash.encode(), b'160000'))

    return True


def get_commit_map_path():
    """
    Return path to commit-map file.
    """
    git_dir = fr.GitUtils.determine_git_dir(b'.')
    return os.path.join(git_dir, b'filter-repo', b'commit-map')


def parse_commit_map(commit_map_file):
    """
    Parse commit-map file and return a dictionary (old -> new, both bytes).
    """
    parsed_map = {}
    with open(commit_map_file, "rb") as f:
        for raw_line in f:
            line = raw_line.rstrip(b'\r\n')

            # Skip blank lines
            if not line:
                continue

            # Store old/new commits, also the "old"/"new" header in the first line
            old, new = line.split()
            parsed_map[old] = new
    return parsed_map


def merge_commit_maps(old_commit_map, new_commit_map):
    """
    Merge old and new commit-map by omitting intermediate commits.

    Return the merged dictionary.
    """
    return {key: new_commit_map.get(old_val, old_val)
            for key, old_val in old_commit_map.items()}


def write_commit_map(commit_map, commit_map_file):
    """
    Write commit-map dictionary to file.
    """
    with open(commit_map_file, 'wb') as f:
        for old, new in commit_map.items():
            f.write(b'%-40s %s\n' % (old, new))


def create_report_dir(args):
    """
    Create the directory for analysis report and return its path.

    An existing directory is removed first if --force was given, otherwise the
    script exits with status 1.
    """
    if args.report_dir:
        reportdir = args.report_dir
    else:
        git_dir = fr.GitUtils.determine_git_dir(b'.')

        # Create the report directory as necessary
        results_tmp_dir = os.path.join(git_dir, b'filter-repo')
        if not os.path.isdir(results_tmp_dir):
            os.mkdir(results_tmp_dir)
        reportdir = os.path.join(results_tmp_dir, b'svnexternals')

    if os.path.isdir(reportdir):
        if args.force:
            sys.stdout.write("Warning: Removing recursively: \"%s\"\n" % fr.decode(reportdir))
            shutil.rmtree(reportdir)
        else:
            sys.stdout.write("Error: dir already exists (use --force to delete): \"%s\"\n" % fr.decode(reportdir))
            sys.exit(1)

    os.mkdir(reportdir)

    return reportdir


def write_analysis(reportdir):
    """
    Prepare analysis and write it to files in report directory.

    One file per category, containing the sorted URLs, one per line.
    """
    reports = ((b"dir-externals-original.txt", 'dir_ext_orig'),
               (b"dir-externals-absolute.txt", 'dir_ext_abs'),
               (b"file-externals-original.txt", 'file_ext_orig'),
               (b"file-externals-absolute.txt", 'file_ext_abs'))

    sys.stdout.write("Writing reports to %s..." % fr.decode(reportdir))
    sys.stdout.flush()

    for filename, key in reports:
        with open(os.path.join(reportdir, filename), 'wb') as f:
            for url in sorted(analysis[key]):
                f.write(("%s\n" % url).encode())

    sys.stdout.write("done.\n")


def analyze_externals(commit, metadata):
    """
    Generate/extend analysis of SVN externals for a Git commit.

    Used as filter-repo commit callback.
    """
    for change in commit.file_changes:
        if change.filename != b'.gitsvnextmodules' or change.type != b'M':
            continue

        gitsvnextmodules = parse_config(change.blob_id)

        for sec in gitsvnextmodules.sections():
            url = gitsvnextmodules[sec]['url']
            _, abs_url = get_absolute_svn_url(url, svn_root_url)

            # Sets of svn:externals URLs, also add the URL to the absolute set if
            # conversion was not successful
            kind = 'dir' if gitsvnextmodules[sec]['type'] == 'dir' else 'file'
            analysis[kind + '_ext_orig'].add(url)
            analysis[kind + '_ext_abs'].add(abs_url)


def insert_submodules(commit, metadata):
    """
    Insert submodules for a Git commit.

    Used as filter-repo commit callback.

    Since .gitsvnextmodules just contains the svn:externals state for the given
    commit, we cannot derive specific changes from that file.
    So we can only add/modify the gitlinks according to .gitsvnextmodules
    (without knowing whether adding a new or modifying an existing or even
    "modifying" an unchanged submodule, but none of that really matters).
    We do not have information about deleted externals, those will be handled in
    a separate filter run afterwards.

    The .gitmodules file however will already be correct in this function because
    we don't need to know about specific changes to add, modify or delete it.
    """
    # Iterate over a snapshot because file changes are appended in the loop
    for change in list(commit.file_changes):
        if change.filename != b'.gitsvnextmodules' or change.type not in (b'M', b'D'):
            continue

        gitsvnextmodules = parse_config(change.blob_id)
        # No interpolation, Git URLs may contain '%'
        gitmodules = configparser.ConfigParser(interpolation=None)

        # Add gitlinks to the tree and prepare .gitmodules file content
        for sec in gitsvnextmodules.sections():
            if not add_submodule_tree_entry(commit, gitsvnextmodules, sec):
                continue

            # Gitlink added
            # -> Add this entry to .gitmodules as well
            path = gitsvnextmodules[sec]['path']

            # submodule.<name>.url
            _, svn_url = get_absolute_svn_url(gitsvnextmodules[sec]['url'], svn_root_url)
            git_url = get_git_url(svn_url)
            if git_url is None:
                # Abort, but this will not happen in practice, caught in
                # add_submodule_tree_entry() via get_git_commit_hash() already.
                raise SystemExit("Error: No Git URL found in mapping although a commit hash could be found.")

            # Create the section name string manually, do not rely on
            # .gitsvnextmodules to always use the proper section name.
            sec_name = 'submodule "' + path + '"'
            gitmodules[sec_name] = {'path': path, 'url': git_url}

        # Write blob and adapt tree for .gitmodules
        if gitmodules.sections():
            # Create a blob object from the content and add it to the tree.
            blob = create_blob(gitmodules)
            repo_filter.insert(blob)
            commit.file_changes.append(fr.FileChange(b'M', b'.gitmodules', blob.id, b'100644'))
        else:
            # Delete the file, even if a "git rm" of all submodules keeps it empty.
            commit.file_changes.append(fr.FileChange(b'D', b'.gitmodules'))


def delete_submodules(commit, metadata):
    """
    Delete submodules from a Git commit.

    Used as filter-repo commit callback.

    Delete all submodules (inserted in the previous filter run) without an entry
    in .gitsvnextmodules, these were real deletions of externals, which couldn't
    be detected before.
    Only the tree entries have to be removed because the .gitmodules file is
    already in correct state from previous filter run.
    """
    # Iterate over a snapshot because file changes are appended in the loop
    for change in list(commit.file_changes):
        if change.filename != b'.gitsvnextmodules' or change.type not in (b'M', b'D'):
            continue

        gitsvnextmodules = parse_config(change.blob_id)

        # Search for all submodules in the tree
        output = subprocess.check_output(['git', 'ls-tree', '-d', '-r', '-z', commit.original_id])
        for line in output.split(b'\x00'):
            if not line:
                continue
            mode_objtype_objid, dirname = line.split(b'\t', 1)
            mode, objtype, objid = mode_objtype_objid.split(b' ')
            if mode != b'160000' or objtype != b'commit':
                continue

            # Submodule found
            # -> Delete it if there is no corresponding entry in
            #    .gitsvnextmodules, keep/reinsert it otherwise
            matching_sec = next((sec for sec in gitsvnextmodules.sections()
                                 if gitsvnextmodules[sec]['path'].encode() == dirname),
                                None)
            if matching_sec is None:
                # Delete it
                commit.file_changes.append(fr.FileChange(b'D', dirname))
            elif add_submodule_tree_entry(commit, gitsvnextmodules, matching_sec):
                # Reinserted, might have been deleted in previous commits.
                # Remove the config section because this external has been
                # converted.
                # NOTE(review): a gitlink with a matching entry is kept even if
                # the reinsert fails, following the comment above -- confirm.
                gitsvnextmodules.remove_section(matching_sec)

        # Rewrite .gitsvnextmodules to contain the unhandled externals only,
        # delete it if empty (all externals converted).
        if gitsvnextmodules.sections():
            # Create a blob object from the content and replace the original one.
            blob = create_blob(gitsvnextmodules)
            repo_filter.insert(blob)
            change.blob_id = blob.id
        elif change.type == b'M':
            # File became empty, delete it
            commit.file_changes.append(fr.FileChange(b'D', b'.gitsvnextmodules'))
            break  # nothing else to handle for this commit
        # else:
        #   File was empty already, delete command already present in stream


def main():
    """
    Run the analysis (--analyze) or the two-pass conversion.
    """
    global svn_root_url, svn_git_mappings, cat_file_process, repo_filter

    if fr is None:
        raise SystemExit("Error: Couldn't find git_filter_repo.py. Did you forget to make a symlink to git-filter-repo named git_filter_repo.py or did you forget to put the latter in your PYTHONPATH?")

    my_args = parse_args()

    # Use passed URL without trailing slash(es)
    if my_args.svn_root_url:
        svn_root_url = my_args.svn_root_url.rstrip("/")

    # Arguments forwarded to filter-repo
    extra_args = []
    if my_args.force:
        extra_args = ['--force']
    if my_args.refs:
        extra_args += ['--refs'] + my_args.refs

    cat_file_process = subprocess.Popen(['git', 'cat-file', '--batch'],
                                        stdin=subprocess.PIPE,
                                        stdout=subprocess.PIPE)
    try:
        if my_args.analyze:
            # Analysis
            reportdir = create_report_dir(my_args)

            fr_args = fr.FilteringOptions.parse_args(['--dry-run']
                                                     + extra_args)
            repo_filter = fr.RepoFilter(fr_args, commit_callback=analyze_externals)
            repo_filter.run()

            write_analysis(reportdir)
        else:
            # Conversion
            svn_git_mappings = read_mappings(my_args.svn_git_mapfiles)

            # There are no references to commit hashes in commit messages because this
            # script runs on a Git repository converted from a Subversion repository.
            fr_args = fr.FilteringOptions.parse_args(['--preserve-commit-hashes',
                                                      '--preserve-commit-encoding',
                                                      '--replace-refs', 'update-no-add']
                                                     + extra_args)
            repo_filter = fr.RepoFilter(fr_args, commit_callback=insert_submodules)
            repo_filter.run()

            # Store commit-map after first run
            first_commit_map = parse_commit_map(get_commit_map_path())

            repo_filter = fr.RepoFilter(fr_args, commit_callback=delete_submodules)
            repo_filter.run()

            # Update commit-map after second run, based on original IDs
            second_commit_map = parse_commit_map(get_commit_map_path())
            merged_commit_map = merge_commit_maps(first_commit_map, second_commit_map)
            write_commit_map(merged_commit_map, get_commit_map_path())
    finally:
        # Always terminate the cat-file helper, even if filtering failed
        cat_file_process.stdin.close()
        cat_file_process.wait()


if __name__ == '__main__':
    main()