RFC: [PATCH] Support incremental pack files

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



With CVS (or RCS), commiting a version, increases the required storage
only by the size of the diff.

Commiting a new version in GIT increases the storage by the compressed
size of each changed blob. Packing all unpacked objects decreases the
required storage, but does not generate deltas against objects in
packs. You need to repack all objects to get around this.

For normal source code, this is not a problem.  But if you want to use
git for big files, you waste storage (or CPU time for everything
repacking).

The follwing patch (again git-1.5.0-rc3) is a prototyp for supporting
incremental pack files in git. The file structures are not changed.
It only permits, that the base commit of a delta is located in a
different pack or as unpacked object.

Changes: 

* in sha1_file get_delta_base returns offset 0, if the delta base
object is not the same pack. Additionally it stores the sha1 of it in
a parameter. unpack_delta_entry and packed_delta_info are changed, so
that they search such a blob in other packs or as sha1
file. packed_object_info_detail stops searching, if the delta base
object is not in the same pack.

* to builtin-pack-objects.c a loop detector is added (flag
loop_check).  When adding a excluded object (= preferred base), its
position in the pack file is added, if it is in pack.

If a preferred base object is in a pack, it checks, if it is a delta.
If so, it locates the base object and stores it in ->delta. If it was
not in the object list, the base object gets added with a exclude
value of 2 and it is recusivly check the base object in the same way
(check_preferred_object).

try_delta refuses to produce a delta, if a loop would be created or if
the base object was added by check_preferred_object.

* mark_edges_unparsed travers all commit list for not parsed parent
commit objects. They are used as heuristic to create a list of
suiteable base object for building the delta.

With the patch, you can eg. pack all unpacked object with a command link like
git-pack-objects --non-empty --all --reflog --unpacked --incremental --base-parent

I see the following problems:
* fetching an incremental pack file over HTTP
* reusing a incremental pack file via reuse_cached_pack in builtin-pack-objects.c

mfg Martin Kögler

--- builtin-pack-objects.c.orig	2007-02-22 22:11:11.287809817 +0100
+++ builtin-pack-objects.c	2007-02-22 22:11:22.694525976 +0100
@@ -16,7 +16,7 @@
 static const char pack_usage[] = "\
 git-pack-objects [{ -q | --progress | --all-progress }] \n\
 	[--local] [--incremental] [--window=N] [--depth=N] \n\
-	[--no-reuse-delta] [--delta-base-offset] [--non-empty] \n\
+	[--no-reuse-delta] [--delta-base-offset] [--non-empty] [--base-parent] \n\
 	[--revs [--unpacked | --all]*] [--reflog] [--stdout | base-name] \n\
 	[<ref-list | <object-list]";
 
@@ -65,6 +65,8 @@
 static int local;
 static int incremental;
 static int allow_ofs_delta;
+static int base_parent;
+static int loop_check;
 
 static struct object_entry **sorted_by_sha, **sorted_by_type;
 static struct object_entry *objects;
@@ -692,6 +694,19 @@
 			}
 		}
 	}
+
+	if (loop_check && exclude) {
+		for (p = packed_git; p; p = p->next) {
+			unsigned long offset = find_pack_entry_one(sha1, p);
+			if (offset) {
+				if (!found_pack) {
+					found_offset = offset;
+					found_pack = p;
+				}
+			}
+		}
+	}
+
 	if ((entry = locate_object_entry(sha1)) != NULL)
 		goto already_added;
 
@@ -722,13 +737,13 @@
 		progress_update = 0;
 	}
 	if (exclude)
-		entry->preferred_base = 1;
-	else {
-		if (found_pack) {
-			entry->in_pack = found_pack;
-			entry->in_pack_offset = found_offset;
-		}
+	    entry->preferred_base = exclude;
+
+	if (found_pack) {
+	    entry->in_pack = found_pack;
+	    entry->in_pack_offset = found_offset;
 	}
+
 	return status;
 }
 
@@ -976,6 +991,78 @@
 	it->pcache.tree_size = size;
 }
 
+static void check_preferred_object(struct object_entry *entry)
+{
+    struct packed_git *p = entry->in_pack;
+    struct pack_window *w_curs = NULL;
+    unsigned long size, used;
+    unsigned char *buf;
+    unsigned long left = p->pack_size - entry->in_pack_offset;
+    struct object_entry *base_entry = NULL;
+    unsigned hash;
+
+    if (!p || !entry->preferred_base)
+	return;
+
+    buf = use_pack(p, &w_curs, entry->in_pack_offset, NULL);
+    
+    used = unpack_object_header_gently(buf, left,
+				       &entry->in_pack_type, &size);
+
+    unsigned char c, *base_name;
+    unsigned long ofs;
+    unsigned long used_0;
+    /* there is at least 20 bytes left in the pack */
+    switch (entry->in_pack_type) {
+	case OBJ_REF_DELTA:
+	    base_name = use_pack(p, &w_curs,
+				 entry->in_pack_offset + used, NULL);
+	    used += 20;
+	    break;
+	case OBJ_OFS_DELTA:
+	    buf = use_pack(p, &w_curs,
+			   entry->in_pack_offset + used, NULL);
+	    used_0 = 0;
+	    c = buf[used_0++];
+	    ofs = c & 127;
+	    while (c & 128) {
+		ofs += 1;
+		if (!ofs || ofs & ~(~0UL >> 7))
+		    die("delta base offset overflow in pack for %s",
+			sha1_to_hex(entry->sha1));
+		c = buf[used_0++];
+		ofs = (ofs << 7) + (c & 127);
+	    }
+	    if (ofs >= entry->in_pack_offset)
+		die("delta base offset out of bound for %s",
+		    sha1_to_hex(entry->sha1));
+	    ofs = entry->in_pack_offset - ofs;
+	    base_name = find_packed_object_name(p, ofs);
+	    used += used_0;
+	    break;
+	default:
+	    base_name = NULL;
+    }
+
+    unuse_pack(&w_curs);
+
+    if (!base_name)
+	return;
+
+    base_entry = locate_object_entry(base_name);
+    if (!base_entry) {
+	hash = name_hash("");
+	add_object_entry(base_name, hash, 2);
+
+	base_entry = locate_object_entry(base_name);
+	check_preferred_object(base_entry);
+    }
+
+    entry->delta = base_entry;
+    entry->delta_sibling = base_entry->delta_child;
+    base_entry->delta_child = entry;
+}
+
 static void check_object(struct object_entry *entry)
 {
 	char type[20];
@@ -1062,6 +1149,9 @@
 		/* Otherwise we would do the usual */
 	}
 
+	if (entry->in_pack && entry->preferred_base) 
+	    check_preferred_object (entry);
+
 	if (sha1_object_info(entry->sha1, type, &entry->size))
 		die("unable to get type of object %s",
 		    sha1_to_hex(entry->sha1));
@@ -1218,6 +1308,8 @@
 	 */
 	if (trg_entry->preferred_base)
 		return -1;
+	if (src_entry->preferred_base == 2)
+		return -1;
 
 	/*
 	 * We do not bother to try a delta that we discarded
@@ -1242,6 +1334,15 @@
 	if (src_entry->depth >= max_depth)
 		return 0;
 
+	if (loop_check) {
+	    struct object_entry *i = src_entry->delta;
+	    while (i) {
+		if (i == trg_entry)
+		    return 0;
+		i = i->delta;
+	    }
+	}
+
 	/* Now some size filtering heuristics. */
 	trg_size = trg_entry->size;
 	max_size = trg_size/2 - 20;
@@ -1540,6 +1641,8 @@
 
 	prepare_revision_walk(&revs);
 	mark_edges_uninteresting(revs.commits, &revs, show_edge);
+	if (base_parent)
+	    mark_edges_unparsed(revs.commits, &revs, show_edge);
 	traverse_commit_list(&revs, show_commit, show_object);
 }
 
@@ -1609,6 +1712,11 @@
 			no_reuse_delta = 1;
 			continue;
 		}
+		if (!strcmp("--base-parent", arg)) {
+			base_parent = 1;
+			loop_check = 1;
+			continue;
+		}
 		if (!strcmp("--delta-base-offset", arg)) {
 			allow_ofs_delta = 1;
 			continue;
--- list-objects.c.orig	2007-02-22 22:05:13.972754239 +0100
+++ list-objects.c	2007-02-22 22:11:11.254461051 +0100
@@ -66,6 +66,27 @@
 	tree->buffer = NULL;
 }
 
+void mark_edges_unparsed(struct commit_list *list,
+			 struct rev_info *revs,
+			 show_edge_fn show_edge)
+{
+	for ( ; list; list = list->next) {
+		struct commit *commit = list->item;
+		struct commit_list *parents;
+
+		for (parents = commit->parents; parents; parents = parents->next) {
+		    struct commit *parent = parents->item;
+		    if (parent->object.parsed)
+			continue;
+		    if (!(parent->object.flags & SHOWN)) {
+			parent->object.flags |= SHOWN;
+			show_edge(parent);
+		    }
+		}
+	}
+}
+
+
 static void mark_edge_parents_uninteresting(struct commit *commit,
 					    struct rev_info *revs,
 					    show_edge_fn show_edge)
--- list-objects.h.orig	2007-02-22 22:05:10.375119739 +0100
+++ list-objects.h	2007-02-22 22:05:48.095287586 +0100
@@ -8,5 +8,6 @@
 void traverse_commit_list(struct rev_info *revs, show_commit_fn, show_object_fn);
 
 void mark_edges_uninteresting(struct commit_list *, struct rev_info *, show_edge_fn);
+void mark_edges_unparsed(struct commit_list *, struct rev_info *, show_edge_fn);
 
 #endif
--- sha1_file.c.orig	2007-02-22 22:39:46.163797786 +0100
+++ sha1_file.c	2007-02-22 22:52:36.547048788 +0100
@@ -1030,7 +1030,8 @@
 				    unsigned long offset,
 				    enum object_type kind,
 				    unsigned long delta_obj_offset,
-				    unsigned long *base_obj_offset)
+				    unsigned long *base_obj_offset,
+				    unsigned char *base_sha1)
 {
 	unsigned char *base_info = use_pack(p, w_curs, offset, NULL);
 	unsigned long base_offset;
@@ -1059,9 +1060,7 @@
 	} else if (kind == OBJ_REF_DELTA) {
 		/* The base entry _must_ be in the same pack */
 		base_offset = find_pack_entry_one(base_info, p);
-		if (!base_offset)
-			die("failed to find delta-pack base object %s",
-				sha1_to_hex(base_info));
+		hashcpy (base_sha1, base_info);
 		offset += 20;
 	} else
 		die("I am totally screwed");
@@ -1081,18 +1080,25 @@
 			     char *type,
 			     unsigned long *sizep)
 {
+        unsigned char base_sha1[20];
 	unsigned long base_offset;
 
 	offset = get_delta_base(p, w_curs, offset, kind,
-		obj_offset, &base_offset);
+		obj_offset, &base_offset, base_sha1);
 
 	/* We choose to only get the type of the base object and
 	 * ignore potentially corrupt pack file that expects the delta
 	 * based on a base with a wrong size.  This saves tons of
 	 * inflate() calls.
 	 */
-	if (packed_object_info(p, base_offset, type, NULL))
+	if (base_offset) {
+	    if (packed_object_info(p, base_offset, type, NULL))
 		die("cannot get info for delta-pack base");
+	} else {
+	    if (sha1_object_info(base_sha1, type, NULL))
+		die("cannot get info for delta-pack base %s",
+		    sha1_to_hex (base_sha1));
+	}
 
 	if (sizep) {
 		const unsigned char *data;
@@ -1168,6 +1174,7 @@
 	struct pack_window *w_curs = NULL;
 	unsigned long obj_offset, val;
 	unsigned char *next_sha1;
+	unsigned char sha1[20];
 	enum object_type kind;
 
 	*delta_chain_length = 0;
@@ -1189,7 +1196,7 @@
 			return;
 		case OBJ_OFS_DELTA:
 			get_delta_base(p, &w_curs, offset, kind,
-				obj_offset, &offset);
+				obj_offset, &offset, sha1);
 			if (*delta_chain_length == 0) {
 				/* TODO: find base_sha1 as pointed by offset */
 			}
@@ -1199,6 +1206,8 @@
 			if (*delta_chain_length == 0)
 				hashcpy(base_sha1, next_sha1);
 			offset = find_pack_entry_one(next_sha1, p);
+			if (!offset)
+			    return;
 			break;
 		}
 		obj_offset = offset;
@@ -1281,11 +1290,15 @@
 				unsigned long *sizep)
 {
 	void *delta_data, *result, *base;
+	unsigned char base_sha1[20];
 	unsigned long result_size, base_size, base_offset;
 
 	offset = get_delta_base(p, w_curs, offset, kind,
-		obj_offset, &base_offset);
-	base = unpack_entry(p, base_offset, type, &base_size);
+		obj_offset, &base_offset, base_sha1);
+	if (base_offset)
+	    base = unpack_entry(p, base_offset, type, &base_size);
+	else
+	    base = read_sha1_file (base_sha1, type, &base_size);
 	if (!base)
 		die("failed to read delta base object at %lu from %s",
 		    base_offset, p->pack_name);
-
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]