[PATCH v2] Enhance unpack-objects for live repo and large objects

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Nicolas Pitre wrote:
> I wouldn't mind a _separate_ tool that would load a pack index,
> determine object sizes from it, and then extract big objects to write
> them as loose objects ...

Add two new options to git-unpack-objects:

-f:: Loose objects will be created even if they
already exist in the repository in packed form.

--min-blob-size=<n>::  Unpacking is only done for objects
larger than or equal to n kB (uncompressed size).

Passes the tests in "t" and tested on big objects.
Based on "next" but should apply to "master" as well.

Signed-off-by: Dana L. How <danahow@xxxxxxxxx>
---
 Documentation/git-unpack-objects.txt |   23 +++++++++++++++++++----
 builtin-unpack-objects.c             |   29 +++++++++++++++++++++++++++--
 cache.h                              |    2 ++
 sha1_file.c                          |   16 ++++++++++++----
 4 files changed, 60 insertions(+), 10 deletions(-)

diff --git a/Documentation/git-unpack-objects.txt b/Documentation/git-unpack-objects.txt
index ff6184b..3df2641 100644
--- a/Documentation/git-unpack-objects.txt
+++ b/Documentation/git-unpack-objects.txt
@@ -8,7 +8,7 @@ git-unpack-objects - Unpack objects from a packed archive
 
 SYNOPSIS
 --------
-'git-unpack-objects' [-n] [-q] [-r] <pack-file
+'git-unpack-objects' [-n] [-q] [-r] [-f] [--min-blob-size=N] <pack-file
 
 
 DESCRIPTION
@@ -17,9 +17,12 @@ Read a packed archive (.pack) from the standard input, expanding
 the objects contained within and writing them into the repository in
 "loose" (one object per file) format.
 
-Objects that already exist in the repository will *not* be unpacked
-from the pack-file.  Therefore, nothing will be unpacked if you use
-this command on a pack-file that exists within the target repository.
+By default, objects that already exist in the repository will *not*
+be unpacked from the pack-file.  Therefore, nothing will be unpacked
+if you use this command on a pack-file that exists within the target
+repository, unless you specify -f.  If an object already exists
+unpacked in the repository, it will not be replaced with the copy
+from the pack, with or without -f.
 
 Please see the `git-repack` documentation for options to generate
 new packs and replace existing ones.
@@ -40,6 +43,18 @@ OPTIONS
 	and make the best effort to recover as many objects as
 	possible.
 
+-f::
+	Allow loose objects to be created in the same repository that
+	contains the packfile.
+
+--min-blob-size=<n>::
+	Smallest loose object to create, expressed in kB.
+	Blobs smaller than this will not be unpacked.  Default is 0.
+	If you specify this option with a deltified source packfile,
+	the source packfile should reside in the current repository
+	so delta bases too small to unpack are still accessible, and
+	therefore -f will be needed for anything to be written.
+
 
 Author
 ------
diff --git a/builtin-unpack-objects.c b/builtin-unpack-objects.c
index a6ff62f..b8ee7b5 100644
--- a/builtin-unpack-objects.c
+++ b/builtin-unpack-objects.c
@@ -10,13 +10,16 @@
 #include "progress.h"
 
 static int dry_run, quiet, recover, has_errors;
-static const char unpack_usage[] = "git-unpack-objects [-n] [-q] [-r] < pack-file";
+static const char unpack_usage[] =
+"git-unpack-objects [-n] [-q] [-r] [-f] [--min-blob-size=N] < pack-file";
 
 /* We always read in 4kB chunks. */
 static unsigned char buffer[4096];
 static unsigned int offset, len;
 static off_t consumed_bytes;
 static SHA_CTX ctx;
+static int force = 0;
+uint32_t min_blob_size;
 
 /*
  * Make sure at least "min" bytes are available in the buffer, and
@@ -131,7 +134,18 @@ static void added_object(unsigned nr, enum object_type type,
 static void write_object(unsigned nr, enum object_type type,
 			 void *buf, unsigned long size)
 {
-	if (write_sha1_file(buf, size, typename(type), obj_list[nr].sha1) < 0)
+	/*
+	 * We never need to write it when it's too small.
+	 * Otherwise,  without -f,  we write it only when
+	 * it does not exist in the repository in any form.
+	 * Finally,  with -f,  we write it only when it does
+	 * not exist in the local repository as a loose object.
+	 * In all cases we fill in obj_list[nr].sha1 .
+	 */
+	if (size < min_blob_size)
+		hash_sha1_file(buf, size, typename(type), obj_list[nr].sha1);
+	else if (write_sha1_file_maybe(buf, size, typename(type),
+				       force, obj_list[nr].sha1) < 0)
 		die("failed to write object");
 	added_object(nr, type, buf, size);
 }
@@ -361,6 +375,17 @@ int cmd_unpack_objects(int argc, const char **argv, const char *prefix)
 				recover = 1;
 				continue;
 			}
+			if (!strcmp(arg, "-f")) {
+				force = 1;
+				continue;
+			}
+			if (!prefixcmp(arg, "--min-blob-size=")) {
+				char *end;
+				min_blob_size = strtoul(arg+16, &end, 0) * 1024;
+				if (!arg[16] || *end)
+					usage(unpack_usage);
+				continue;
+			}
 			if (!prefixcmp(arg, "--pack_header=")) {
 				struct pack_header *hdr;
 				char *c;
diff --git a/cache.h b/cache.h
index ec85d93..4994d03 100644
--- a/cache.h
+++ b/cache.h
@@ -343,6 +343,8 @@ extern int sha1_object_info(const unsigned char *, unsigned long *);
 extern void * read_sha1_file(const unsigned char *sha1, enum object_type *type, unsigned long *size);
 extern int hash_sha1_file(const void *buf, unsigned long len, const char *type, unsigned char *sha1);
 extern int write_sha1_file(void *buf, unsigned long len, const char *type, unsigned char *return_sha1);
+extern int write_sha1_file_maybe(void *buf, unsigned long len, const char *type,
+				 int dup_ok, unsigned char *return_sha1);
 extern int pretend_sha1_file(void *, unsigned long, enum object_type, unsigned char *);
 
 extern int check_sha1_signature(const unsigned char *sha1, void *buf, unsigned long size, const char *type);
diff --git a/sha1_file.c b/sha1_file.c
index 12d2ef2..e4c3288 100644
--- a/sha1_file.c
+++ b/sha1_file.c
@@ -1979,7 +1979,8 @@ int hash_sha1_file(const void *buf, unsigned long len, const char *type,
 	return 0;
 }
 
-int write_sha1_file(void *buf, unsigned long len, const char *type, unsigned char *returnsha1)
+int write_sha1_file_maybe(void *buf, unsigned long len, const char *type,
+			  int dup_ok, unsigned char *returnsha1)
 {
 	int size, ret;
 	unsigned char *compressed;
@@ -1990,14 +1991,15 @@ int write_sha1_file(void *buf, unsigned long len, const char *type, unsigned cha
 	char hdr[32];
 	int fd, hdrlen;
 
-	/* Normally if we have it in the pack then we do not bother writing
-	 * it out into .git/objects/??/?{38} file.
+	/* Normally if in a pack (or anywhere else) then we do not write
+	 * it out into .git/objects/??/?{38} file, but with dup_ok != 0
+	 * we only avoid over-writing a loose blob in the local repo.
 	 */
 	write_sha1_file_prepare(buf, len, type, sha1, hdr, &hdrlen);
 	filename = sha1_file_name(sha1);
 	if (returnsha1)
 		hashcpy(returnsha1, sha1);
-	if (has_sha1_file(sha1))
+	if (!dup_ok && has_sha1_file(sha1))
 		return 0;
 	fd = open(filename, O_RDONLY);
 	if (fd >= 0) {
@@ -2062,6 +2064,12 @@ int write_sha1_file(void *buf, unsigned long len, const char *type, unsigned cha
 	return move_temp_to_file(tmpfile, filename);
 }
 
+int write_sha1_file(void *buf, unsigned long len, const char *type,
+		    unsigned char *returnsha1)
+{
+	return write_sha1_file_maybe(buf, len, type, 0, returnsha1);
+}
+
 /*
  * We need to unpack and recompress the object for writing
  * it out to a different file.
-- 
1.5.2.762.gd8c6-dirty

-
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Announce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux