[PATCH v2 1/3] index-pack: add --unpack-limit to unpack objects

If the number of objects in the incoming pack is lower than the given
limit, all objects in the pack are unpacked as loose objects, and the
pack itself is not kept when it is streamed in. The option is intended
to replace unpack-objects.
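
For example (the limit value below is only for illustration):

    git index-pack --stdin --unpack-limit=100 <small.pack

If the incoming pack contains fewer than 100 objects, they end up as
loose objects in the object database and the temporary pack is
removed; with 100 or more objects, index-pack behaves as before and
keeps the pack together with its .idx file.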

Unlike unpack-objects, this code path still writes the stream to a
temporary pack on disk for delta resolution. Base objects are not
appended to the temporary pack, though. Given that the unpack limit is
usually small, the extra I/O overhead should be unnoticeable.

Large blobs in the stream are a different story. Right now, those
blobs are uncompressed in memory for simplicity. In the future,
bulk-checkin should be used to stream large blobs into a new pack (and
they should not be written to the temp pack, to reduce I/O overhead).

Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx>
---
 I had something that could unpack without writing to a temp pack
 file, but I scrapped it and chose this approach because it follows
 closely how index-pack already works. That's a good thing imo because
 .pack v4 is coming and I don't know how v4 may impact this unpack
 code path. Once things are settled, we can revisit and open a
 separate code path if it's still a good idea.
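
 A rough sketch of the kind of test this could gain later (not part of
 this patch; the limit value and file names are illustrative):

	test_expect_success 'pack below --unpack-limit becomes loose objects' '
		test_commit one &&
		commit=$(git rev-parse HEAD) &&
		# pack the three objects (commit, tree, blob) of the test commit
		git pack-objects --all --stdout </dev/null >small.pack &&
		git init --bare dest.git &&
		(
			cd dest.git &&
			# 3 objects < 100, so everything should be exploded loose
			git index-pack --stdin --unpack-limit=100 <../small.pack &&
			test -z "$(ls objects/pack/ 2>/dev/null)" &&
			git cat-file -e $commit
		)
	'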

 builtin/index-pack.c | 136 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 122 insertions(+), 14 deletions(-)

diff --git a/builtin/index-pack.c b/builtin/index-pack.c
index 9c1cfac..a5b69e4 100644
--- a/builtin/index-pack.c
+++ b/builtin/index-pack.c
@@ -81,6 +81,7 @@ static int do_fsck_object;
 static int verbose;
 static int show_stat;
 static int check_self_contained_and_connected;
+static int unpack;
 
 static struct progress *progress;
 
@@ -93,6 +94,21 @@ static git_SHA_CTX input_ctx;
 static uint32_t input_crc32;
 static int input_fd, output_fd, pack_fd;
 
+/*
+ * When unpacking under --strict mode, objects whose reachability are
+ * suspect are kept in core without getting written in the object
+ * store.
+ */
+struct obj_buffer {
+	char *buffer;
+	unsigned long size;
+	enum object_type type;
+	unsigned char sha1[20];
+};
+
+struct obj_buffer *obj_buffers;
+int nr_object_buffers;
+
 #ifndef NO_PTHREADS
 
 static struct thread_local *thread_data;
@@ -209,6 +225,23 @@ static unsigned check_object(struct object *obj)
 	return 0;
 }
 
+static void write_cached_objects(void)
+{
+	unsigned char parano_sha1[20];
+	int i;
+	for (i = 0; i < nr_object_buffers; i++) {
+		struct obj_buffer *obj_buf = obj_buffers + i;
+		if (write_sha1_file(obj_buf->buffer, obj_buf->size,
+				    typename(obj_buf->type), parano_sha1) < 0)
+			die(_("failed to write object %s"),
+			    sha1_to_hex(obj_buf->sha1));
+		if (hashcmp(obj_buf->sha1, parano_sha1))
+			die(_("confused by unstable object source data for %s"),
+			    sha1_to_hex(obj_buf->sha1));
+		free(obj_buf->buffer);
+	}
+}
+
 static unsigned check_objects(void)
 {
 	unsigned i, max, foreign_nr = 0;
@@ -424,7 +457,18 @@ static void *unpack_entry_data(unsigned long offset, unsigned long size,
 		git_SHA1_Update(&c, hdr, hdrlen);
 	} else
 		sha1 = NULL;
-	if (type == OBJ_BLOB && size > big_file_threshold)
+	if (type == OBJ_BLOB && size > big_file_threshold &&
+	    /*
+	     * In future we should use bulk-checkin instead of forcing
+	     * large blobs in core like this.
+	     *
+	     * If we do so, in --stdin mode, we might also want to
+	     * update flush() to do seek() instead of write() on large
+	     * blobs to reduce I/O and disk consumption (because we
+	     * need the temp pack with other objects at correct
+	     * position for delta resolution)
+	     */
+	    !unpack)
 		buf = fixed_buf;
 	else
 		buf = xmalloc(size);
@@ -700,17 +744,18 @@ static int check_collison(struct object_entry *entry)
 	return 0;
 }
 
-static void sha1_object(const void *data, struct object_entry *obj_entry,
-			unsigned long size, enum object_type type,
-			const unsigned char *sha1)
+static void *sha1_object(void *data, struct object_entry *obj_entry,
+			 unsigned long size, enum object_type type,
+			 const unsigned char *sha1)
 {
 	void *new_data = NULL;
 	int collision_test_needed;
+	int queued = 0;
 
 	assert(data || obj_entry);
 
 	read_lock();
-	collision_test_needed = has_sha1_file(sha1);
+	collision_test_needed = unpack ? 0 : has_sha1_file(sha1);
 	read_unlock();
 
 	if (collision_test_needed && !data) {
@@ -776,11 +821,36 @@ static void sha1_object(const void *data, struct object_entry *obj_entry,
 				commit->buffer = NULL;
 			}
 			obj->flags |= FLAG_CHECKED;
+			if (unpack) {
+				obj_buffers[nr_object_buffers].buffer = data;
+				obj_buffers[nr_object_buffers].size = size;
+				obj_buffers[nr_object_buffers].type = type;
+				hashcpy(obj_buffers[nr_object_buffers].sha1, sha1);
+				nr_object_buffers++;
+				data = NULL;
+				queued = 1;
+			}
 		}
 		read_unlock();
 	}
 
+	if (unpack && !queued) {
+		unsigned char parano_sha1[20];
+
+		read_lock(); /* because write_sha1_file calls has_sha1_file */
+		if (write_sha1_file(data, size, typename(type),
+				    parano_sha1))
+			die(_("failed to unpack object at offset %lu"),
+			    obj_entry->idx.offset);
+		read_unlock();
+
+		if (hashcmp(sha1, parano_sha1))
+			die(_("confused by unstable object source data for %s"),
+			    sha1_to_hex(sha1));
+	}
+
 	free(new_data);
+	return data == new_data ? NULL : data;
 }
 
 /*
@@ -1021,7 +1091,8 @@ static void parse_pack_objects(unsigned char *sha1)
 			obj->real_type = OBJ_BAD;
 			nr_delays++;
 		} else
-			sha1_object(data, NULL, obj->size, obj->type, obj->idx.sha1);
+			data = sha1_object(data, NULL, obj->size,
+					   obj->type, obj->idx.sha1);
 		free(data);
 		display_progress(progress, i+1);
 	}
@@ -1188,6 +1259,18 @@ static struct object_entry *append_obj_to_pack(struct sha1file *f,
 	unsigned long s = size;
 	int n = 0;
 	unsigned char c = (type << 4) | (s & 15);
+	if (unpack) {
+		/*
+		 * Just enough info to make find_unresolved_deltas()
+		 * happy. We do not need to actually append the object
+		 * in unpack case.
+		 */
+		obj[0].size = size;
+		obj[0].type = type;
+		obj[0].real_type = type;
+		hashcpy(obj->idx.sha1, sha1);
+		return obj;
+	}
 	s >>= 4;
 	while (s) {
 		header[n++] = c | 0x80;
@@ -1277,7 +1360,11 @@ static void final(const char *final_pack_name, const char *curr_pack_name,
 		err = close(output_fd);
 		if (err)
 			die_errno(_("error while closing pack file"));
+		if (unpack)
+			unlink(curr_pack_name);
 	}
+	if (unpack)
+		return;
 
 	if (keep_msg) {
 		int keep_fd, keep_msg_len = strlen(keep_msg);
@@ -1489,14 +1576,14 @@ static void show_pack_info(int stat_only)
 int cmd_index_pack(int argc, const char **argv, const char *prefix)
 {
 	int i, fix_thin_pack = 0, verify = 0, stat_only = 0;
-	const char *curr_pack, *curr_index;
+	const char *curr_pack, *curr_index = NULL;
 	const char *index_name = NULL, *pack_name = NULL;
 	const char *keep_name = NULL, *keep_msg = NULL;
 	char *index_name_buf = NULL, *keep_name_buf = NULL;
-	struct pack_idx_entry **idx_objects;
 	struct pack_idx_option opts;
 	unsigned char pack_sha1[20];
 	unsigned foreign_nr = 1;	/* zero is a "good" value, assume bad */
+	int unpack_limit = 0;
 
 	if (argc == 2 && !strcmp(argv[1], "-h"))
 		usage(index_pack_usage);
@@ -1574,6 +1661,11 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 					opts.off32_limit = strtoul(c+1, &c, 0);
 				if (*c || opts.off32_limit & 0x80000000)
 					die(_("bad %s"), arg);
+			} else if (!prefixcmp(arg, "--unpack-limit=")) {
+				char *end;
+				unpack_limit = strtoul(arg+15, &end, 0);
+				if (!arg[15] || *end || unpack_limit < 0)
+					usage(index_pack_usage);
 			} else
 				usage(index_pack_usage);
 			continue;
@@ -1628,23 +1720,39 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix)
 
 	curr_pack = open_pack_file(pack_name);
 	parse_pack_header();
+
+	if (nr_objects < unpack_limit) {
+		unpack = 1;
+		if (strict)
+			obj_buffers = xcalloc(nr_objects, sizeof(*obj_buffers));
+	}
+
 	objects = xcalloc(nr_objects + 1, sizeof(struct object_entry));
 	deltas = xcalloc(nr_objects, sizeof(struct delta_entry));
 	parse_pack_objects(pack_sha1);
 	resolve_deltas();
 	conclude_pack(fix_thin_pack, curr_pack, pack_sha1);
 	free(deltas);
-	if (strict)
+	if (strict) {
 		foreign_nr = check_objects();
+		if (unpack) {
+			write_cached_objects();
+			free(obj_buffers);
+		}
+	}
 
 	if (show_stat)
 		show_pack_info(stat_only);
 
-	idx_objects = xmalloc((nr_objects) * sizeof(struct pack_idx_entry *));
-	for (i = 0; i < nr_objects; i++)
-		idx_objects[i] = &objects[i].idx;
-	curr_index = write_idx_file(index_name, idx_objects, nr_objects, &opts, pack_sha1);
-	free(idx_objects);
+	if (!unpack) {
+		struct pack_idx_entry **idx_objects;
+		idx_objects = xmalloc((nr_objects) * sizeof(struct pack_idx_entry *));
+		for (i = 0; i < nr_objects; i++)
+			idx_objects[i] = &objects[i].idx;
+		curr_index = write_idx_file(index_name, idx_objects,
+					    nr_objects, &opts, pack_sha1);
+		free(idx_objects);
+	}
 
 	if (!verify)
 		final(pack_name, curr_pack,
-- 
1.8.2.83.gc99314b
