[PATCH/RFC] git-p4: stream from perforce to speed up clones

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Change commit() to stream data from Perforce directly into fast-import
rather than reading it all into memory first and then writing it out.
This hugely reduces the memory requirements when cloning non-incrementally.

Signed-off-by: Luke Diamand <luke@xxxxxxxxxxx>
---
I've modified git-p4 so that it streams/pipes data into fast-import rather
than reading everything into memory first. The old scheme meant that
for a large repository (mine is around 2G) my PC just grinds to a
halt and never actually finishes. With this change it takes around ten
minutes.

Obviously there's scope here for breaking git-p4 quite badly. I have a
small test harness that covers regular and incremental imports, file
deletion and keyword expansion, but perhaps there are other aspects
I need to look at as well?

 contrib/fast-import/git-p4 |  164 ++++++++++++++++++++++++++++++++++++--------
 1 files changed, 136 insertions(+), 28 deletions(-)

diff --git a/contrib/fast-import/git-p4 b/contrib/fast-import/git-p4
index 342529d..f415ad0 100755
--- a/contrib/fast-import/git-p4
+++ b/contrib/fast-import/git-p4
@@ -1008,6 +1008,141 @@ class P4Sync(Command):
 
         return filesForCommit
 
+    # output one file from the P4 stream
+    # - helper for streamP4Files
+
+    def streamOneP4File(self, file, contents, branchPrefixes):
+        if verbose:
+            sys.stderr.write("%s\n" %  file["depotFile"])
+
+        relPath = self.stripRepoPath(file['depotFile'], branchPrefixes)
+
+        mode = "644"
+        if isP4Exec(file["type"]):
+            mode = "755"
+        elif file["type"] == "symlink":
+            mode = "120000"
+            # p4 print on a symlink contains "target\n", so strip it off
+            last = contents.pop()
+            last = last[:-1]
+            contents.append(last)
+
+        if self.isWindows and file["type"].endswith("text"):
+            mangled = []
+            for data in contents:
+                data = data.replace("\r\n", "\n")
+                mangled.append(data)
+            contents = mangled
+
+        if file['type'] in ('text+ko', 'unicode+ko', 'binary+ko'):
+            contents = map(lambda text: re.sub(r'(?i)\$(Id|Header):[^$]*\$',r'$\1$', text), contents)
+        elif file['type'] in ('text+k', 'ktext', 'kxtext', 'unicode+k', 'binary+k'):
+            contents = map(lambda text: re.sub(r'\$(Id|Header|Author|Date|DateTime|Change|File|Revision):[^$\n]*\$',r'$\1$', text), contents)
+
+        self.gitStream.write("M %s inline %s\n" % (mode, relPath))
+
+        # total length...
+        length = 0
+        for d in contents:
+            length = length + len(d)
+
+        self.gitStream.write("data %d\n" % length)
+        for d in contents:
+            self.gitStream.write(d)
+        self.gitStream.write("\n")
+
+    def streamOneP4Deletion(self, file, branchPrefixes):
+        if verbose:
+            sys.stderr.write("delete %s\n" %  file["path"])
+
+        relPath = self.stripRepoPath(file['path'], branchPrefixes)
+
+        self.gitStream.write("D %s\n" % relPath)
+
+    # Stream directly from "p4 print" into "git fast-import"
+    def streamP4Files(self, files, branchPrefixes):
+        filesForCommit = []
+        filesToRead = []
+        filesToDelete = []
+
+        for f in files:
+            includeFile = True
+            for val in self.clientSpecDirs:
+                if f['path'].startswith(val[0]):
+                    if val[1] <= 0:
+                        includeFile = False
+                    break
+
+            if includeFile:
+                filesForCommit.append(f)
+                if f['action'] not in ('delete', 'purge'):
+                    filesToRead.append(f)
+                else:
+                    filesToDelete.append(f)
+
+        filedata = []
+
+        # deleted files...
+        for f in filesToDelete:
+            self.streamOneP4Deletion(f, branchPrefixes)
+
+        if len(filesToRead) > 0:
+            stdin_file = tempfile.TemporaryFile(prefix='p4-stdin', mode='w+b')
+            stdin_file.write('\n'.join(['%s#%s' % (f['path'], f['rev'])
+                                                  for f in filesToRead]))
+            stdin_file.flush()
+            stdin_file.seek(0)
+            try:
+                p4 = subprocess.Popen('p4 -G -x - print',
+                                        shell=True,
+                                        stdin=stdin_file,
+                                        stdout=subprocess.PIPE);
+            except OSError,e:
+                print >> sys.stderr, "p4 print failed:", e
+
+            file = {}
+            contents = []
+            have_file_info = False
+
+            try:
+                while True:
+                    marshalled = marshal.load(p4.stdout)
+
+                    if marshalled.has_key('depotFile') and have_file_info:
+                        # start of a new file - output the old one first
+
+                        if file["type"] == "apple":
+                            print "\nfile %s is a strange apple file that forks. Ignoring" % file['path']
+                            continue
+
+
+                        self.streamOneP4File(file,contents,branchPrefixes)
+                        file = {}
+                        contents = []
+                        have_file_info = False
+
+                    # pick up the new file information... for the
+                    # 'data' field we need to append to our array
+                    for k in marshalled.keys():
+                        if k == 'data':
+                            contents.append(marshalled['data'])
+                        else:
+                            file[k] = marshalled[k]
+
+                    have_file_info = True
+            except EOFError:
+                pass
+
+            # do the last chunk
+
+            if file.has_key('depotFile'):
+                self.streamOneP4File(file,contents,branchPrefixes)
+
+            exitCode = p4.wait()
+            if exitCode != 0:
+                sys.stderr.write("p4 subshell failed getting file data\n")
+                sys.exit(1)
+
     def commit(self, details, files, branch, branchPrefixes, parent = ""):
         epoch = details["time"]
         author = details["user"]
@@ -1023,7 +1158,6 @@ class P4Sync(Command):
                 new_files.append (f)
             else:
                 sys.stderr.write("Ignoring file outside of prefix: %s\n" % path)
-        files = self.readP4Files(new_files)
 
         self.gitStream.write("commit %s\n" % branch)
 #        gitStream.write("mark :%s\n" % details["change"])
@@ -1051,33 +1185,7 @@ class P4Sync(Command):
                 print "parent %s" % parent
             self.gitStream.write("from %s\n" % parent)
 
-        for file in files:
-            if file["type"] == "apple":
-                print "\nfile %s is a strange apple file that forks. Ignoring!" % file['path']
-                continue
-
-            relPath = self.stripRepoPath(file['path'], branchPrefixes)
-            if file["action"] in ("delete", "purge"):
-                self.gitStream.write("D %s\n" % relPath)
-            else:
-                data = file['data']
-
-                mode = "644"
-                if isP4Exec(file["type"]):
-                    mode = "755"
-                elif file["type"] == "symlink":
-                    mode = "120000"
-                    # p4 print on a symlink contains "target\n", so strip it off
-                    data = data[:-1]
-
-                if self.isWindows and file["type"].endswith("text"):
-                    data = data.replace("\r\n", "\n")
-
-                self.gitStream.write("M %s inline %s\n" % (mode, relPath))
-                self.gitStream.write("data %s\n" % len(data))
-                self.gitStream.write(data)
-                self.gitStream.write("\n")
-
+        self.streamP4Files(new_files,branchPrefixes)
         self.gitStream.write("\n")
 
         change = int(details["change"])
-- 
1.6.3.GIT

--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]