On 07/14/2011 09:16 AM, Michael Haggerty wrote:
> I have noticed that "git filter-branch" gets pathologically slow when it
> operates on a repository that has many references in a complicated
> directory hierarchy.  The time seems to go like O(N^3), where N is the
> number of references being rewritten.

Correction and new information...

Correction: The test script that I attached to the last email was
configured incorrectly.  [Note to self: Thunderbird reads attached files
at the moment the email is *sent*, not the moment that the attachment is
added in the compose window.]  The corrected script is attached.

New information: Once I have "primed" a git repository using the
attached script, a command like the following takes about 90 ms:

    git rev-parse refs/heads/a4/b1/c0414^0

strace reveals that it is calling stat64() then lstat64() on every
directory and every file under .git/refs.  By contrast,

    git rev-parse refs/heads/a4/b1/c0414

goes more or less straight to the file
.git/refs/tags/refs/heads/a4/b1/c0414, and finishes in a few milliseconds.

Michael

--
Michael Haggerty
mhagger@xxxxxxxxxxxx
http://softwareswirl.blogspot.com/
#! /usr/bin/python

"""Time git operations on a repository that has many references.

The script builds a scratch repository at GIT_REPO containing N commits
and per-commit references, optionally packs the references, and then
times one operation selected by ACTION:

* 'filter'    -- run "git filter-branch" over all references
* 'add_refs'  -- point a branch at every tag via "git update-ref"
* 'move_refs' -- point the branches at the tags in reverse order

Configuration is done by editing the module-level constants below.
"""

import sys
import os
import subprocess
import shutil
import time


# Scratch repository; it is deleted and recreated on every run.
GIT_REPO = '/tmp/git-time-test'

FILENAME1 = os.path.join(GIT_REPO, 'a.txt')
FILENAME2 = os.path.join(GIT_REPO, 'b.txt')

# Number of commits to create (one set of references per commit).
N = 1000

# Layout of the generated branch names; see get_name().
#REF_STYLE = 'flat'
#REF_STYLE = 'subdir'
REF_STYLE = 'shard'

# Operation that main() times.
ACTION = 'filter'
#ACTION = 'add_refs'
#ACTION = 'move_refs'

# Whether to run "git pack-refs" before the timed operation.
PACK_REFS = True

# Whether to echo every command to stderr before running it.
VERBOSE = True


def _trace(cmd):
    """Write a 'Running ...' line for *cmd* to stderr if VERBOSE is set."""
    if VERBOSE:
        sys.stderr.write('Running %s\n' % (repr(cmd),))


def run(*args, **kw):
    """Run a command with GIT_REPO as its working directory.

    All positional and keyword arguments are forwarded unchanged to
    subprocess.check_call(), so a non-zero exit status raises
    subprocess.CalledProcessError.
    """
    _trace(args)
    subprocess.check_call(*args, cwd=GIT_REPO, **kw)


def get_refnames(*patterns):
    """Return the names of the references in GIT_REPO matching *patterns*.

    The names are obtained from "git for-each-ref" and returned as a list
    of text strings with surrounding whitespace stripped.

    Raises subprocess.CalledProcessError if git exits with a non-zero
    status, rather than silently reporting an empty list.
    """
    cmd = ['git', 'for-each-ref', '--format=%(refname)'] + list(patterns)
    _trace(cmd)
    p = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        cwd=GIT_REPO,
        # Ask for text output so that the names are str (not bytes) on
        # Python 3 as well as on Python 2.
        universal_newlines=True,
        )
    out, _ = p.communicate()
    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, cmd)
    return [line.strip() for line in out.splitlines()]


def get_name(i):
    """Return the branch reference name for index *i* under REF_STYLE.

    With name = '%04d' % i the result is:

    * 'flat'   -- refs/heads/b<name>
    * 'subdir' -- refs/heads/a/b/c<name>
    * 'shard'  -- refs/heads/a<last digit>/b<second-to-last digit>/c<name>

    Raises ValueError if REF_STYLE is not one of the styles above.
    """
    name = '%04d' % (i,)
    if REF_STYLE == 'flat':
        return 'refs/heads/b%s' % (name,)
    elif REF_STYLE == 'subdir':
        return 'refs/heads/a/b/c%s' % (name,)
    elif REF_STYLE == 'shard':
        return 'refs/heads/a%s/b%s/c%s' % (name[-1], name[-2], name,)
    else:
        raise ValueError('unknown REF_STYLE: %r' % (REF_STYLE,))


def make_repo():
    """Create a fresh repository at GIT_REPO holding N commits.

    Any existing directory at GIT_REPO is removed first.  Each commit
    rewrites FILENAME1 with the commit index.  After every commit a
    branch named get_name(i) is pointed at HEAD; unless ACTION is
    'filter', a tag refs/tags/<%04d index> is created as well.
    """
    if os.path.exists(GIT_REPO):
        shutil.rmtree(GIT_REPO)

    # GIT_REPO does not exist yet, so run() (which uses it as cwd) cannot
    # be used for the init step.
    subprocess.check_call(['git', 'init', GIT_REPO])
    run(['git', 'config', 'user.name', 'Lou User'])
    run(['git', 'config', 'user.email', 'luser@xxxxxxxxxxx'])

    # Create the file empty so that it can be registered with "add -N".
    with open(FILENAME1, 'w'):
        pass
    run(['git', 'add', '-N', FILENAME1])

    for i in range(N):
        with open(FILENAME1, 'w') as f:
            f.write('%d\n' % (i,))
        run(['git', 'commit', '-a', '-m', 'Commit %d' % (i,)])
        if ACTION != 'filter':
            run(['git', 'update-ref', 'refs/tags/%04d' % (i,), 'HEAD'])
        run(['git', 'update-ref', get_name(i), 'HEAD'])


def pack_refs():
    """Pack all references in GIT_REPO and prune the loose ones."""
    run(['git', 'pack-refs', '--all', '--prune'])


def filter():
    """Run "git filter-branch" over all references in GIT_REPO.

    The tree filter copies a.txt to b.txt in every commit.

    NOTE: this name shadows the builtin filter(); it is kept because
    ACTION refers to the operations by these names.
    """
    run(['git', 'filter-branch', '--tree-filter', 'cp a.txt b.txt', '--', '--all'])


def _tag_index(refname):
    """Return the integer formed by the last path component of *refname*."""
    assert not refname.endswith('master')
    return int(refname.rsplit('/', 1)[-1])


def add_refs():
    """Point the branch get_name(i) at each tag refs/tags/<i>."""
    for refname in get_refnames('refs/tags'):
        new_refname = get_name(_tag_index(refname))
        run(['git', 'update-ref', '-m', 'add ref', new_refname, refname])


def move_refs():
    """Point the branch get_name(N - 1 - i) at each tag refs/tags/<i>."""
    for refname in get_refnames('refs/tags'):
        new_refname = get_name(N - 1 - _tag_index(refname))
        run(['git', 'update-ref', '-m', 'move ref', new_refname, refname])


def main(args):
    """Build the test repository and time the operation named by ACTION.

    *args* is accepted for the command-line interface but is not used.
    The elapsed wall-clock time of the operation alone (excluding
    repository creation and ref packing) is printed to stdout.

    Raises ValueError if ACTION does not name a known operation; this is
    checked before the (slow) repository construction starts.
    """
    actions = {
        'filter': filter,
        'add_refs': add_refs,
        'move_refs': move_refs,
        }
    if ACTION not in actions:
        raise ValueError('unknown ACTION: %r' % (ACTION,))

    make_repo()
    if PACK_REFS:
        pack_refs()

    t = time.time()
    actions[ACTION]()
    # A single parenthesised argument prints identically on Python 2 and 3.
    print('time to process: %.3f s' % (time.time() - t,))


# Only run the benchmark when executed as a script, so that importing the
# module does not wipe and rebuild GIT_REPO.
if __name__ == '__main__':
    main(sys.argv[1:])