Re: [PATCH v2] checkout: optimize "git checkout -b <new_branch>"

Duy Nguyen <pclouds@xxxxxxxxx> · Sun, 5 Aug 2018 10:57:18 +0200

On Thu, Aug 02, 2018 at 02:02:00PM -0400, Ben Peart wrote:
> > And if all go according to plan, there will be no changes made in the
> > index (by either 2-way merge or sparse checkout stuff) we should be
> > able to just skip writing down the index, if we haven't done that
> > already.
> > 
> 
> That would be great as writing the index is 8.4% of the time spent.

And that is of course not good. Avoiding writing index at all is nice
and all but we should not take that much time writing it (what about
other commands?)

I have another idea to reduce index size, which means cheaper trailer
hash calculation, but this may also be a direction you could look
into.

I notice that we have a lot of duplicate data in the index. For
example, most files would have the same ctime and even mtime (at least
the seconds part). group id, device id and file mode should also be
the same in normal case. I guess on Windows, inode is always the same
as well. So why store (and hash) them repeatedly?

This patch writes "index version 5" that only writes those fields out
if they are different from the previous entry (we already do this for
path name in v4).

On webkit repo (275k files), writing time can be reduced from 0.314s
(v2) to 0.206s (v5). File size of all versions:

 9.8M Aug  5 10:27 .git/index.gz
  38M Aug  5 09:54 .git/index-v2
  22M Aug  5 10:26 .git/index-v4
  15M Aug  5 10:34 .git/index-v5

As you can see there's still room to compress, but I don't think we
want to go full deflate mode because compression time goes up.

Another option I think you could consider is not storing full hash. As
long as the shortened hash is still unique, we can expand to full hash
at read time. Of course you can't go too short (I'm thinking half the
current hash size is reasonable trade off) and need to take into
account how much time will be lost for looking up and expanding these
short hashes at read time.

-- 8< --

diff --git a/cache.h b/cache.h
index e6f7ee4b64..1d6bc7d122 100644
--- a/cache.h
+++ b/cache.h
@@ -129,7 +129,7 @@ struct cache_header {
 };
 
 #define INDEX_FORMAT_LB 2
-#define INDEX_FORMAT_UB 4
+#define INDEX_FORMAT_UB 5
 
 /*
  * The "cache_time" is just the low 32 bits of the
diff --git a/read-cache.c b/read-cache.c
index b0b5df5de7..9bb095d6b2 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -2002,7 +2002,7 @@ static int ce_write_flush(git_hash_ctx *context, int fd)
 	return 0;
 }
 
-static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
+static int ce_write(git_hash_ctx *context, int fd, const void *data, unsigned int len)
 {
 	while (len) {
 		unsigned int buffered = write_buffer_len;
@@ -2024,6 +2024,18 @@ static int ce_write(git_hash_ctx *context, int fd, void *data, unsigned int len)
 	return 0;
 }
 
+static int ce_write_u32(git_hash_ctx *context, int fd, uint32_t data)
+{
+	data = htonl(data);
+	return ce_write(context, fd, &data, sizeof(data));
+}
+
+static int ce_write_u16(git_hash_ctx *context, int fd, uint16_t data)
+{
+	data = htons(data);
+	return ce_write(context, fd, &data, sizeof(data));
+}
+
 static int write_index_ext_header(git_hash_ctx *context, int fd,
 				  unsigned int ext, unsigned int sz)
 {
@@ -2133,8 +2145,46 @@ static void copy_cache_entry_to_ondisk(struct ondisk_cache_entry *ondisk,
 	}
 }
 
+#define ONDISK_MASK (CE_STAGEMASK | CE_INTENT_TO_ADD | CE_SKIP_WORKTREE)
+
+static int ce_write_entry_v5(git_hash_ctx *c, int fd,
+			     const struct cache_entry *ce,
+			     const struct cache_entry *pce)
+{
+	uint8_t mask = 0;
+	const struct stat_data *st1 = &ce->ce_stat_data;
+	const struct stat_data *st2 = &pce->ce_stat_data;
+	//int ret = 0;
+
+	mask |= (st1->sd_ctime.sec != st2->sd_ctime.sec) << 0;
+	mask |= (st1->sd_mtime.sec != st2->sd_mtime.sec) << 1;
+	mask |= (st1->sd_dev != st2->sd_dev) << 2;
+	mask |= (st1->sd_ino != st2->sd_ino) << 3;
+	mask |= (ce->ce_mode != pce->ce_mode) << 4;
+	mask |= (st1->sd_uid != st2->sd_uid) << 5;
+	mask |= (st1->sd_gid != st2->sd_gid) << 6;
+	mask |= ((ce->ce_flags & ONDISK_MASK) != (pce->ce_flags & ONDISK_MASK)) << 7;
+
+	ce_write(c, fd, &mask, 1);
+	ce_write(c, fd, ce->oid.hash, GIT_SHA1_RAWSZ);
+	ce_write_u32(c, fd, st1->sd_ctime.nsec);
+	ce_write_u32(c, fd, st1->sd_mtime.nsec);
+	ce_write_u32(c, fd, st1->sd_size);
+	if (mask & (1 << 0)) ce_write_u32(c, fd, st1->sd_ctime.sec);
+	if (mask & (1 << 1)) ce_write_u32(c, fd, st1->sd_mtime.sec);
+	if (mask & (1 << 2)) ce_write_u32(c, fd, st1->sd_dev);
+	if (mask & (1 << 3)) ce_write_u32(c, fd, st1->sd_ino);
+	if (mask & (1 << 4)) ce_write_u16(c, fd, ce->ce_mode);
+	if (mask & (1 << 5)) ce_write_u32(c, fd, st1->sd_uid);
+	if (mask & (1 << 6)) ce_write_u32(c, fd, st1->sd_gid);
+	if (mask & (1 << 7)) ce_write_u32(c, fd, ce->ce_flags & ONDISK_MASK);
+	return 0;
+}
+
 static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
-			  struct strbuf *previous_name, struct ondisk_cache_entry *ondisk)
+			  const struct cache_entry *pce,
+			  struct strbuf *previous_name,
+			  struct ondisk_cache_entry *ondisk)
 {
 	int size;
 	int result;
@@ -2173,8 +2223,12 @@ static int ce_write_entry(git_hash_ctx *c, int fd, struct cache_entry *ce,
 		to_remove = previous_name->len - common;
 		prefix_size = encode_varint(to_remove, to_remove_vi);
 
-		copy_cache_entry_to_ondisk(ondisk, ce);
-		result = ce_write(c, fd, ondisk, size);
+		if (!pce) {
+			copy_cache_entry_to_ondisk(ondisk, ce);
+			result = ce_write(c, fd, ondisk, size);
+		} else {
+			result = ce_write_entry_v5(c, fd, ce, pce);
+		}
 		if (!result)
 			result = ce_write(c, fd, to_remove_vi, prefix_size);
 		if (!result)
@@ -2313,7 +2367,7 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 	if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0)
 		return -1;
 
-	previous_name = (hdr_version == 4) ? &previous_name_buf : NULL;
+	previous_name = (hdr_version >= 4) ? &previous_name_buf : NULL;
 
 	for (i = 0; i < entries; i++) {
 		struct cache_entry *ce = cache[i];
@@ -2334,7 +2388,10 @@ static int do_write_index(struct index_state *istate, struct tempfile *tempfile,
 
 			drop_cache_tree = 1;
 		}
-		if (ce_write_entry(&c, newfd, ce, previous_name, (struct ondisk_cache_entry *)&ondisk) < 0)
+		if (ce_write_entry(&c, newfd, ce,
+				   hdr_version >= 5 && i > 0 ? cache[i - 1] : NULL,
+				   previous_name,
+				   (struct ondisk_cache_entry *)&ondisk) < 0)
 			err = -1;
 
 		if (err)
-- 8< --
--
Duy