This patch extracts tree, parent sha-1 and committer date of packed commits that have exactly one parent out. So that revision traversal code can avoid inflating commit for these sha-1. The assumption is commits with one parent dominate. Two new files are created per pack: one file contains sha-1 and dates. The other file contains commit sha-1 mapping to the offset in the former file. Signed-off-by: Nguyễn Thái Ngọc Duy <pclouds@xxxxxxxxx> --- Makefile | 3 + builtin/index-pack.c | 113 ++++++++++++++++++++++++++++++++++- cache.h | 9 +++ pack-write.c | 11 +++- pack.h | 1 + sha1_cache.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++ sha1_cache.h | 6 ++ sha1_file.c | 12 ++++- test-sha1-cache.c | 19 ++++++ 9 files changed, 331 insertions(+), 4 deletions(-) create mode 100644 sha1_cache.c create mode 100644 sha1_cache.h create mode 100644 test-sha1-cache.c diff --git a/Makefile b/Makefile index 87fb30a..fced758 100644 --- a/Makefile +++ b/Makefile @@ -481,6 +481,7 @@ TEST_PROGRAMS_NEED_X += test-sha1 TEST_PROGRAMS_NEED_X += test-sigchain TEST_PROGRAMS_NEED_X += test-subprocess TEST_PROGRAMS_NEED_X += test-svn-fe +TEST_PROGRAMS_NEED_X += test-sha1-cache TEST_PROGRAMS = $(patsubst %,%$X,$(TEST_PROGRAMS_NEED_X)) @@ -606,6 +607,7 @@ LIB_H += run-command.h LIB_H += sequencer.h LIB_H += sha1-array.h LIB_H += sha1-lookup.h +LIB_H += sha1_cache.h LIB_H += sideband.h LIB_H += sigchain.h LIB_H += strbuf.h @@ -722,6 +724,7 @@ LIB_OBJS += setup.o LIB_OBJS += sequencer.o LIB_OBJS += sha1-array.o LIB_OBJS += sha1-lookup.o +LIB_OBJS += sha1_cache.o LIB_OBJS += sha1_file.o LIB_OBJS += sha1_name.o LIB_OBJS += shallow.o diff --git a/builtin/index-pack.c b/builtin/index-pack.c index dd1c5c9..67673ef 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -15,6 +15,7 @@ static const char index_pack_usage[] = struct object_entry { struct pack_idx_entry idx; + off_t commit_offset; unsigned long size; unsigned int hdr_size; enum object_type type; @@ -63,6 +64,7 @@ static int nr_resolved_deltas; static int from_stdin; static int strict; static int verbose; +static int write_cache = 1; static struct progress *progress; @@ -75,6 +77,13 @@ static git_SHA_CTX input_ctx; static uint32_t input_crc32; static int input_fd, output_fd, pack_fd; +static int cache_fd; +static int cache_written; +static uint32_t cache_nr; +static char cache_filename[PATH_MAX]; +static const char *cache_idxname; +static unsigned char cache_sha1[20]; + static int mark_link(struct object *obj, int type, void *data) { if (!obj) @@ -682,6 +691,58 @@ static int compare_delta_entry(const void *a, const void *b) objects[delta_b->obj_no].type); } +static unsigned long parse_commit_date(const char *buf, const char *tail) +{ + const char *dateptr; + + if (buf + 6 >= tail) + return 0; + if (memcmp(buf, "author", 6)) + return 0; + while (buf < tail && *buf++ != '\n') + /* nada */; + if (buf + 9 >= tail) + return 0; + if (memcmp(buf, "committer", 9)) + return 0; + while (buf < tail && *buf++ != '>') + /* nada */; + if (buf >= tail) + return 0; + dateptr = buf; + while (buf < tail && *buf++ != '\n') + /* nada */; + if (buf >= tail) + return 0; + /* dateptr < buf && buf[-1] == '\n', so strtoul will stop at buf-1 */ + return strtoul(dateptr, NULL, 10); +} + +static void write_commit_sha1(struct object_entry *obj, const char *data) +{ + unsigned char sha1[20]; + if (!obj->commit_offset && + !memcmp(data, "tree ", 5) && + !memcmp(data + 45, "\nparent ", 8) && + memcmp(data + 45 + 48, "\nparent ", 8)) { + uint32_t date; + obj->commit_offset = cache_written; + cache_nr++; + + get_sha1_hex(data + 5, sha1); + cache_written += xwrite(cache_fd, sha1, 20); + get_sha1_hex(data + 5 + 48, sha1); + cache_written += xwrite(cache_fd, sha1, 20); + date = parse_commit_date(data + 5 + 48 + 41, data + obj->size); + date = htonl(date); + cache_written += xwrite(cache_fd, &date, 4); + } +#if 0 + hashclr(sha1); + cache_written += xwrite(cache_fd, sha1, 20); +#endif +} + /* Parse all objects and return the pack content SHA1 hash */ static void parse_pack_objects(unsigned char *sha1) { @@ -707,8 +768,13 @@ static void parse_pack_objects(unsigned char *sha1) nr_deltas++; delta->obj_no = i; delta++; - } else + } else { sha1_object(data, obj->size, obj->type, obj->idx.sha1); + + /* We assume that commits are never deltified */ + if (write_cache && obj->real_type == OBJ_COMMIT) + write_commit_sha1(obj, data); + } free(data); display_progress(progress, i+1); } @@ -919,6 +985,18 @@ static void final(const char *final_pack_name, const char *curr_pack_name, } else if (from_stdin) chmod(final_pack_name, 0444); + if (write_cache) { + snprintf(name, sizeof(name), "%s/pack/pack-%s.sha1", + get_object_directory(), sha1_to_hex(sha1)); + if (move_temp_to_file(cache_filename, name)) + die("cannot store commit cache file"); + + snprintf(name, sizeof(name), "%s/pack/pack-%s.sidx", + get_object_directory(), sha1_to_hex(sha1)); + if (move_temp_to_file(cache_idxname, name)) + die("cannot store commit cache file"); + } + if (final_index_name != curr_index_name) { if (!final_index_name) { snprintf(name, sizeof(name), "%s/pack/pack-%s.idx", @@ -1120,6 +1198,8 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) keep_msg = ""; } else if (!prefixcmp(arg, "--keep=")) { keep_msg = arg + 7; + } else if (!strcmp(arg, "--no-cache")) { + write_cache = 0; } else if (!prefixcmp(arg, "--pack_header=")) { struct pack_header *hdr; char *c; @@ -1191,6 +1271,17 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) if (strict) opts.flags |= WRITE_IDX_STRICT; + if (verify) + write_cache = 0; + if (write_cache) { + unsigned char sha1[20]; + cache_fd = odb_mkstemp(cache_filename, + sizeof(cache_filename), + "pack/tmp_sha1_XXXXXX"); + hashclr(sha1); + cache_written = xwrite(cache_fd, sha1, 20); + } + curr_pack = open_pack_file(pack_name); parse_pack_header(); objects = xcalloc(nr_objects + 1, sizeof(struct object_entry)); @@ -1243,6 +1334,26 @@ int cmd_index_pack(int argc, const char **argv, const char *prefix) curr_index = write_idx_file(index_name, idx_objects, nr_objects, &opts, pack_sha1); free(idx_objects); + if (write_cache) { + int nr; + struct pack_idx_entry **idx; + + close(cache_fd); + + idx = idx_objects = xmalloc((cache_nr) * sizeof(struct pack_idx_entry *)); + for (nr = i = 0; i < nr_objects; i++) { + if (objects[i].commit_offset) { + objects[i].idx.offset = objects[i].commit_offset; + *idx++ = &objects[i].idx; + nr++; + } + } + assert(nr == cache_nr); + opts.flags |= WRITE_IDX_SHA1_CACHE; + cache_idxname = write_idx_file(NULL, idx_objects, cache_nr, &opts, cache_sha1); + free(idx_objects); + } + if (!verify) final(pack_name, curr_pack, index_name, curr_index, diff --git a/cache.h b/cache.h index 9bd8c2d..fd3953f 100644 --- a/cache.h +++ b/cache.h @@ -972,6 +972,14 @@ struct pack_window { unsigned int inuse_cnt; }; +struct sha1_cache { + const void *idx; + const void *data; + size_t idx_size; + size_t data_size; + uint32_t nr; +}; + extern struct packed_git { struct packed_git *next; struct pack_window *windows; @@ -984,6 +992,7 @@ extern struct packed_git { int index_version; time_t mtime; int pack_fd; + struct sha1_cache commit_cache; unsigned pack_local:1, pack_keep:1, do_not_close:1; diff --git a/pack-write.c b/pack-write.c index ca9e63b..6da977a 100644 --- a/pack-write.c +++ b/pack-write.c @@ -85,8 +85,11 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec f = sha1fd(fd, index_name); } - /* if last object's offset is >= 2^31 we should use index V2 */ - index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version; + if (opts->flags & WRITE_IDX_SHA1_CACHE) + index_version = 2; + else + /* if last object's offset is >= 2^31 we should use index V2 */ + index_version = need_large_offset(last_obj_offset, opts) ? 2 : opts->version; /* index versions 2 and above need a header */ if (index_version >= 2) { @@ -138,6 +141,9 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec if (index_version >= 2) { unsigned int nr_large_offset = 0; + if (opts->flags & WRITE_IDX_SHA1_CACHE) + goto skip_crc32; + /* write the crc32 table */ list = sorted_by_sha; for (i = 0; i < nr_objects; i++) { @@ -146,6 +152,7 @@ const char *write_idx_file(const char *index_name, struct pack_idx_entry **objec sha1write(f, &crc32_val, 4); } +skip_crc32: /* write the 32-bit offset table */ list = sorted_by_sha; for (i = 0; i < nr_objects; i++) { diff --git a/pack.h b/pack.h index aa6ee7d..f002b6a 100644 --- a/pack.h +++ b/pack.h @@ -40,6 +40,7 @@ struct pack_idx_option { /* flag bits */ #define WRITE_IDX_VERIFY 01 /* verify only, do not write the idx file */ #define WRITE_IDX_STRICT 02 +#define WRITE_IDX_SHA1_CACHE 04 uint32_t version; uint32_t off32_limit; diff --git a/sha1_cache.c b/sha1_cache.c new file mode 100644 index 0000000..3f244bb --- /dev/null +++ b/sha1_cache.c @@ -0,0 +1,161 @@ +#include "cache.h" +#include "pack.h" +#include "sha1_cache.h" + +static int git_open_noatime(const char *name) +{ + static int sha1_file_open_flag = O_NOATIME; + + for (;;) { + int fd = open(name, O_RDONLY | sha1_file_open_flag); + if (fd >= 0) + return fd; + + /* Might the failure be due to O_NOATIME? */ + if (errno != ENOENT && sha1_file_open_flag) { + sha1_file_open_flag = 0; + continue; + } + + return -1; + } +} + +/* + * Commit cache format is basically pack index v2 except that: + * - no crc32 table + * - no large offset support + * - offset table contains offset in _this_ file, in the commit + * cache section + * - the commit cache section follows offset table, each entry + * consists of tree sha1, parent sha1 if any, and terminated + * by null sha-1. + */ + +int open_sha1_cache(struct sha1_cache *cache, + const char *data_path, const char *idx_path) +{ + void *idx_map, *data; + struct pack_idx_header *hdr; + size_t idx_size, data_size; + uint32_t nr, i, *index; + int fd = git_open_noatime(idx_path); + struct stat st; + + if (fd < 0) + return -1; + if (fstat(fd, &st)) { + close(fd); + return -1; + } + idx_size = xsize_t(st.st_size); + if (idx_size < 4 * 256 + 20 + 20) { + close(fd); + return error("index file %s is too small", idx_path); + } + idx_map = xmmap(NULL, idx_size, PROT_READ, MAP_PRIVATE, fd, 0); + close(fd); + + fd = git_open_noatime(data_path); + if (fd < 0) + return -1; + if (fstat(fd, &st)) { + close(fd); + return -1; + } + data_size = xsize_t(st.st_size); + data = xmmap(NULL, data_size, PROT_READ, MAP_PRIVATE, fd, 0); + close(fd); + + hdr = idx_map; + if (hdr->idx_signature != htonl(PACK_IDX_SIGNATURE)) + return error("not a commit cache file %s", idx_path); + if (ntohl(hdr->idx_version) != 2) + return error("wrong version %s %d", idx_path, ntohl(hdr->idx_version)); + + nr = 0; + index = idx_map; + index += 2; /* skip index header */ + for (i = 0; i < 256; i++) { + uint32_t n = ntohl(index[i]); + if (n < nr) { + munmap(idx_map, idx_size); + return error("non-monotonic index %s", idx_path); + } + nr = n; + } + + cache->idx = idx_map; + cache->idx_size = idx_size; + cache->nr = nr; + cache->data = data; + cache->data_size = data_size; + return 0; +} + +static const void *object_offset(const struct sha1_cache *cache, uint32_t n) +{ + const unsigned char *index = cache->idx; + uint32_t off; + index += 4 * 256; + index += 8 + cache->nr * 20; + off = ntohl(*((uint32_t *)(index + 4 * n))); + return (const char *)cache->data + off; +} + +const void *find_sha1_in_cache(const unsigned char *sha1) +{ + const uint32_t *level1_ofs; + const unsigned char *index; + unsigned hi, lo, stride; + + struct packed_git *p = find_sha1_pack(sha1, packed_git); + if (!p) + return 0; + + open_pack_index(p); + if (!p->commit_cache.nr) + return 0; + + level1_ofs = p->commit_cache.idx; + index = p->commit_cache.idx; + + /* skip header */ + level1_ofs += 2; + index += 8; + + /* fanout table */ + index += 4 * 256; + hi = ntohl(level1_ofs[*sha1]); + lo = ((*sha1 == 0x0) ? 0 : ntohl(level1_ofs[*sha1 - 1])); + stride = 20; + + do { + unsigned mi = (lo + hi) / 2; + int cmp = hashcmp(index + mi * stride, sha1); + + if (!cmp) + return object_offset(&p->commit_cache, mi); + if (cmp > 0) + hi = mi; + else + lo = mi+1; + } while (lo < hi); + return NULL; +} + +int has_commit_cache(const unsigned char *sha1, + unsigned char *tree, + unsigned char *parent, + unsigned long *date) +{ + const unsigned char *data; + data = find_sha1_in_cache(sha1); + if (!data) + return 0; + + hashcpy(tree, data); + hashcpy(parent, data + 20); + *date = ntohl(*((uint32_t*)(data + 40))); + return 1; +} diff --git a/sha1_cache.h b/sha1_cache.h new file mode 100644 index 0000000..59823cf --- /dev/null +++ b/sha1_cache.h @@ -0,0 +1,6 @@ +extern int open_sha1_cache(struct sha1_cache *cache, + const char *data_path, const char *idx_path); +extern const void *find_sha1_in_cache(const unsigned char *sha1); +extern int has_commit_cache(const unsigned char *sha1, + unsigned char *tree, unsigned char *parent, + unsigned long *date); diff --git a/sha1_file.c b/sha1_file.c index 88f2151..fc2f460 100644 --- a/sha1_file.c +++ b/sha1_file.c @@ -19,6 +19,7 @@ #include "pack-revindex.h" #include "sha1-lookup.h" #include "bulk-checkin.h" +#include "sha1_cache.h" #ifndef O_NOATIME #if defined(__linux__) && (defined(__i386__) || defined(__PPC__)) @@ -578,14 +579,23 @@ static int check_packed_git_idx(const char *path, struct packed_git *p) int open_pack_index(struct packed_git *p) { char *idx_name; + int baselen = strlen(p->pack_name) - strlen(".pack"); int ret; if (p->index_data) return 0; idx_name = xstrdup(p->pack_name); - strcpy(idx_name + strlen(idx_name) - strlen(".pack"), ".idx"); + strcpy(idx_name + baselen, ".idx"); ret = check_packed_git_idx(idx_name, p); + if (!ret) { + char *cache_name; + cache_name = xstrdup(p->pack_name); + strcpy(idx_name + baselen, ".sidx"); + strcpy(cache_name + baselen, ".sha1"); + open_sha1_cache(&p->commit_cache, cache_name, idx_name); + free(cache_name); + } free(idx_name); return ret; } diff --git a/test-sha1-cache.c b/test-sha1-cache.c new file mode 100644 index 0000000..7b7f32c --- /dev/null +++ b/test-sha1-cache.c @@ -0,0 +1,19 @@ +#include "cache.h" +#include "sha1_cache.h" + +int main(int argc, char **argv) +{ + unsigned char sha1[20]; + unsigned char tree[20]; + unsigned char parent[20]; + unsigned long date; + + setup_git_directory(); + prepare_packed_git(); + get_sha1_hex(argv[1], sha1); + if (!has_commit_cache(sha1, tree, parent, &date)) + return 1; + printf("tree %s\nparent %s\ndate %ld\n", + sha1_to_hex(tree), sha1_to_hex(parent), date); + return 0; +} -- 1.7.3.1.256.g2539c.dirty -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html