[PATCH v2 7/8] packfile: add kept-pack cache for find_kept_pack_entry()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Jeff King <peff@xxxxxxxx>

In a recent patch we added a function 'find_kept_pack_entry()' to look
for an object only among kept packs.

While this function avoids doing any lookup work in non-kept packs, it
is still linear in the number of packs, since we have to traverse the
linked list of packs once per object. Let's cache a reduced version of
that list to save us time.

Note that this cache will last the lifetime of the program. We could
invalidate it on reprepare_packed_git(), but there's not much point in
being rigorous here:

  - we might already fail to notice new .keep packs showing up after the
    program starts. We only reprepare_packed_git() when we fail to find
    an object. But adding a new pack won't cause that to happen.
    Somebody repacking could add a new pack and delete an old one, but
    most of the time we'd have a descriptor or mmap open to the old
    pack anyway, so we might not even notice.

  - in pack-objects we already cache the .keep state at startup, since
    56dfeb6263 (pack-objects: compute local/ignore_pack_keep early,
    2016-07-29). So this is just extending that concept further.

  - we don't have to worry about any packed_git being removed; we always
    keep the old structs around, even after reprepare_packed_git()

Here are p5303 results (as always, measured against the kernel):

  Test                                        HEAD^                   HEAD
  ----------------------------------------------------------------------------------------------
  5303.5: repack (1)                          57.44(54.71+10.78)      57.06(54.29+10.96) -0.7%
  5303.6: repack with --stdin-packs (1)       0.01(0.00+0.01)         0.01(0.01+0.00) +0.0%
  5303.10: repack (50)                        71.32(88.38+4.90)       71.47(88.60+5.04) +0.2%
  5303.11: repack with --stdin-packs (50)     3.43(11.81+0.22)        3.49(12.21+0.26) +1.7%
  5303.15: repack (1000)                      215.59(493.75+14.62)    217.41(495.36+14.85) +0.8%
  5303.16: repack with --stdin-packs (1000)   131.44(314.24+8.11)     126.75(309.88+8.09) -3.6%

Signed-off-by: Jeff King <peff@xxxxxxxx>
Signed-off-by: Taylor Blau <me@xxxxxxxxxxxx>
---
 builtin/pack-objects.c |   6 +--
 object-store.h         |  10 ++++
 packfile.c             | 103 +++++++++++++++++++++++------------------
 packfile.h             |   4 --
 revision.c             |   8 ++--
 5 files changed, 76 insertions(+), 55 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index fbd7b54d70..b2ba5aa14f 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -1225,9 +1225,9 @@ static int want_found_object(const struct object_id *oid, int exclude,
 		 */
 		unsigned flags = 0;
 		if (ignore_packed_keep_on_disk)
-			flags |= ON_DISK_KEEP_PACKS;
+			flags |= CACHE_ON_DISK_KEEP_PACKS;
 		if (ignore_packed_keep_in_core)
-			flags |= IN_CORE_KEEP_PACKS;
+			flags |= CACHE_IN_CORE_KEEP_PACKS;
 
 		if (ignore_packed_keep_on_disk && p->pack_keep)
 			return 0;
@@ -3089,7 +3089,7 @@ static void read_packs_list_from_stdin(void)
 	 * an optimization during delta selection.
 	 */
 	revs.no_kept_objects = 1;
-	revs.keep_pack_cache_flags |= IN_CORE_KEEP_PACKS;
+	revs.keep_pack_cache_flags |= CACHE_IN_CORE_KEEP_PACKS;
 	revs.blob_objects = 1;
 	revs.tree_objects = 1;
 	revs.tag_objects = 1;
diff --git a/object-store.h b/object-store.h
index c4fc9dd74e..4cbe8eae3c 100644
--- a/object-store.h
+++ b/object-store.h
@@ -105,6 +105,14 @@ static inline int pack_map_entry_cmp(const void *unused_cmp_data,
 	return strcmp(pg1->pack_name, key ? key : pg2->pack_name);
 }
 
+#define CACHE_ON_DISK_KEEP_PACKS 1
+#define CACHE_IN_CORE_KEEP_PACKS 2
+
+struct kept_pack_cache {
+	struct packed_git **packs;
+	unsigned flags;
+};
+
 struct raw_object_store {
 	/*
 	 * Set of all object directories; the main directory is first (and
@@ -150,6 +158,8 @@ struct raw_object_store {
 	/* A most-recently-used ordered version of the packed_git list. */
 	struct list_head packed_git_mru;
 
+	struct kept_pack_cache *kept_pack_cache;
+
 	/*
 	 * A map of packfiles to packed_git structs for tracking which
 	 * packs have been loaded already.
diff --git a/packfile.c b/packfile.c
index 5f35cfe788..2a139c907b 100644
--- a/packfile.c
+++ b/packfile.c
@@ -2031,10 +2031,7 @@ static int fill_pack_entry(const struct object_id *oid,
 	return 1;
 }
 
-static int find_one_pack_entry(struct repository *r,
-			       const struct object_id *oid,
-			       struct pack_entry *e,
-			       int kept_only)
+int find_pack_entry(struct repository *r, const struct object_id *oid, struct pack_entry *e)
 {
 	struct list_head *pos;
 	struct multi_pack_index *m;
@@ -2044,49 +2041,64 @@ static int find_one_pack_entry(struct repository *r,
 		return 0;
 
 	for (m = r->objects->multi_pack_index; m; m = m->next) {
-		if (!fill_midx_entry(r, oid, e, m))
-			continue;
-
-		if (!kept_only)
-			return 1;
-
-		if (((kept_only & ON_DISK_KEEP_PACKS) && e->p->pack_keep) ||
-		    ((kept_only & IN_CORE_KEEP_PACKS) && e->p->pack_keep_in_core))
+		if (fill_midx_entry(r, oid, e, m))
 			return 1;
 	}
 
 	list_for_each(pos, &r->objects->packed_git_mru) {
 		struct packed_git *p = list_entry(pos, struct packed_git, mru);
-		if (p->multi_pack_index && !kept_only) {
-			/*
-			 * If this pack is covered by the MIDX, we'd have found
-			 * the object already in the loop above if it was here,
-			 * so don't bother looking.
-			 *
-			 * The exception is if we are looking only at kept
-			 * packs. An object can be present in two packs covered
-			 * by the MIDX, one kept and one not-kept. And as the
-			 * MIDX points to only one copy of each object, it might
-			 * have returned only the non-kept version above. We
-			 * have to check again to be thorough.
-			 */
-			continue;
-		}
-		if (!kept_only ||
-		    (((kept_only & ON_DISK_KEEP_PACKS) && p->pack_keep) ||
-		     ((kept_only & IN_CORE_KEEP_PACKS) && p->pack_keep_in_core))) {
-			if (fill_pack_entry(oid, e, p)) {
-				list_move(&p->mru, &r->objects->packed_git_mru);
-				return 1;
-			}
+		if (!p->multi_pack_index && fill_pack_entry(oid, e, p)) {
+			list_move(&p->mru, &r->objects->packed_git_mru);
+			return 1;
 		}
 	}
 	return 0;
 }
 
-int find_pack_entry(struct repository *r, const struct object_id *oid, struct pack_entry *e)
+static void maybe_invalidate_kept_pack_cache(struct repository *r,
+					     unsigned flags)
 {
-	return find_one_pack_entry(r, oid, e, 0);
+	if (!r->objects->kept_pack_cache)
+		return;
+	if (r->objects->kept_pack_cache->flags == flags)
+		return;
+	free(r->objects->kept_pack_cache->packs);
+	FREE_AND_NULL(r->objects->kept_pack_cache);
+}
+
+static struct packed_git **kept_pack_cache(struct repository *r, unsigned flags)
+{
+	maybe_invalidate_kept_pack_cache(r, flags);
+
+	if (!r->objects->kept_pack_cache) {
+		struct packed_git **packs = NULL;
+		size_t nr = 0, alloc = 0;
+		struct packed_git *p;
+
+		/*
+		 * We want "all" packs here, because we need to cover ones that
+		 * are used by a midx, as well. We need to look in every one of
+		 * them (instead of the midx itself) to cover duplicates. It's
+		 * possible that an object is found in two packs that the midx
+		 * covers, one kept and one not kept, but the midx returns only
+		 * the non-kept version.
+		 */
+		for (p = get_all_packs(r); p; p = p->next) {
+			if ((p->pack_keep && (flags & CACHE_ON_DISK_KEEP_PACKS)) ||
+			    (p->pack_keep_in_core && (flags & CACHE_IN_CORE_KEEP_PACKS))) {
+				ALLOC_GROW(packs, nr + 1, alloc);
+				packs[nr++] = p;
+			}
+		}
+		ALLOC_GROW(packs, nr + 1, alloc);
+		packs[nr] = NULL;
+
+		r->objects->kept_pack_cache = xmalloc(sizeof(*r->objects->kept_pack_cache));
+		r->objects->kept_pack_cache->packs = packs;
+		r->objects->kept_pack_cache->flags = flags;
+	}
+
+	return r->objects->kept_pack_cache->packs;
 }
 
 int find_kept_pack_entry(struct repository *r,
@@ -2094,13 +2106,15 @@ int find_kept_pack_entry(struct repository *r,
 			 unsigned flags,
 			 struct pack_entry *e)
 {
-	/*
-	 * Load all packs, including midx packs, since our "kept" strategy
-	 * relies on that. We're relying on the side effect of it setting up
-	 * r->objects->packed_git, which is a little ugly.
-	 */
-	get_all_packs(r);
-	return find_one_pack_entry(r, oid, e, flags);
+	struct packed_git **cache;
+
+	for (cache = kept_pack_cache(r, flags); *cache; cache++) {
+		struct packed_git *p = *cache;
+		if (fill_pack_entry(oid, e, p))
+			return 1;
+	}
+
+	return 0;
 }
 
 int has_object_pack(const struct object_id *oid)
@@ -2109,7 +2123,8 @@ int has_object_pack(const struct object_id *oid)
 	return find_pack_entry(the_repository, oid, &e);
 }
 
-int has_object_kept_pack(const struct object_id *oid, unsigned flags)
+int has_object_kept_pack(const struct object_id *oid,
+			 unsigned flags)
 {
 	struct pack_entry e;
 	return find_kept_pack_entry(the_repository, oid, flags, &e);
diff --git a/packfile.h b/packfile.h
index 624327f64d..eb56db2a7b 100644
--- a/packfile.h
+++ b/packfile.h
@@ -161,10 +161,6 @@ int packed_object_info(struct repository *r,
 void mark_bad_packed_object(struct packed_git *p, const unsigned char *sha1);
 const struct packed_git *has_packed_and_bad(struct repository *r, const unsigned char *sha1);
 
-#define ON_DISK_KEEP_PACKS 1
-#define IN_CORE_KEEP_PACKS 2
-#define ALL_KEEP_PACKS (ON_DISK_KEEP_PACKS | IN_CORE_KEEP_PACKS)
-
 /*
  * Iff a pack file in the given repository contains the object named by sha1,
  * return true and store its location to e.
diff --git a/revision.c b/revision.c
index 4c5adb90b1..41c0478705 100644
--- a/revision.c
+++ b/revision.c
@@ -2338,14 +2338,14 @@ static int handle_revision_opt(struct rev_info *revs, int argc, const char **arg
 		die(_("--unpacked=<packfile> no longer supported"));
 	} else if (!strcmp(arg, "--no-kept-objects")) {
 		revs->no_kept_objects = 1;
-		revs->keep_pack_cache_flags |= IN_CORE_KEEP_PACKS;
-		revs->keep_pack_cache_flags |= ON_DISK_KEEP_PACKS;
+		revs->keep_pack_cache_flags |= CACHE_IN_CORE_KEEP_PACKS;
+		revs->keep_pack_cache_flags |= CACHE_ON_DISK_KEEP_PACKS;
 	} else if (skip_prefix(arg, "--no-kept-objects=", &optarg)) {
 		revs->no_kept_objects = 1;
 		if (!strcmp(optarg, "in-core"))
-			revs->keep_pack_cache_flags |= IN_CORE_KEEP_PACKS;
+			revs->keep_pack_cache_flags |= CACHE_IN_CORE_KEEP_PACKS;
 		if (!strcmp(optarg, "on-disk"))
-			revs->keep_pack_cache_flags |= ON_DISK_KEEP_PACKS;
+			revs->keep_pack_cache_flags |= CACHE_ON_DISK_KEEP_PACKS;
 	} else if (!strcmp(arg, "-r")) {
 		revs->diff = 1;
 		revs->diffopt.flags.recursive = 1;
-- 
2.30.0.533.g2f8b6b552f.dirty




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux