From: Derrick Stolee <dstolee@xxxxxxxxxxxxx> The parent and recursive patterns allowed by the "cone mode" option in sparse-checkout are restrictive enough that we can avoid using the regex parsing. Everything is based on prefix matches, so we can use hashsets to store the prefixes from the sparse-checkout file. When checking a path, we can strip path entries from the path and check the hashset for an exact match. As a test, I created a cone-mode sparse-checkout file for the Linux repository that actually includes every file. This was constructed by taking every folder in the Linux repo and creating the pattern pairs here: /$folder/* !/$folder/*/* This resulted in a sparse-checkout file sith 8,296 patterns. Running 'git read-tree -mu HEAD' on this file had the following performance: core.sparseCheckout=false: 0.21 s (0.00 s) core.sparseCheckout=true: 3.75 s (3.50 s) core.sparseCheckout=cone: 0.23 s (0.01 s) The times in parentheses above correspond to the time spent in the first clear_ce_flags() call, according to the trace2 performance traces. While this example is contrived, it demonstrates how these patterns can slow the sparse-checkout feature. Signed-off-by: Derrick Stolee <dstolee@xxxxxxxxxxxxx> --- Documentation/git-sparse-checkout.txt | 1 - dir.c | 154 +++++++++++++++++++++++++- dir.h | 27 +++++ t/t1091-sparse-checkout-builtin.sh | 8 ++ 4 files changed, 183 insertions(+), 7 deletions(-) diff --git a/Documentation/git-sparse-checkout.txt b/Documentation/git-sparse-checkout.txt index 463319055b..7ade827370 100644 --- a/Documentation/git-sparse-checkout.txt +++ b/Documentation/git-sparse-checkout.txt @@ -85,7 +85,6 @@ negate patterns. For example, to remove the file `unwanted`: !unwanted ---------------- - ## CONE PATTERN SET The full pattern set allows for arbitrary pattern matches and complicated diff --git a/dir.c b/dir.c index d021c908e5..2c5ff89a72 100644 --- a/dir.c +++ b/dir.c @@ -599,6 +599,99 @@ void parse_exclude_pattern(const char **pattern, *patternlen = len; } +static int el_hashmap_cmp(const void *unused_cmp_data, + const void *a, const void *b, const void *key) +{ + const struct exclude_entry *ee1 = a; + const struct exclude_entry *ee2 = b; + + return strncmp(ee1->pattern, ee2->pattern, ee1->patternlen); +} + +static void add_exclude_to_hashsets(struct exclude_list *el, struct exclude *x) +{ + struct exclude_entry *e; + char *truncated; + char *data = NULL; + + if (!el->use_cone_patterns) + return; + + if (x->patternlen >= 4 && + !strcmp(x->pattern + x->patternlen - 4, "/*/*")) { + if (!(x->flags & EXC_FLAG_NEGATIVE)) { + /* Not a cone pattern. */ + el->use_cone_patterns = 0; + warning(_("unrecognized pattern: '%s'"), x->pattern); + goto clear_hashmaps; + } + + truncated = xstrdup(x->pattern); + truncated[x->patternlen - 4] = 0; + + e = xmalloc(sizeof(struct exclude_entry)); + e->pattern = truncated; + e->patternlen = x->patternlen - 4; + hashmap_entry_init(e, memhash(e->pattern, e->patternlen)); + + if (!hashmap_get(&el->recursive_hashmap, e, NULL)) { + /* We did not see the "parent" included */ + warning(_("unrecognized negative pattern: '%s'"), x->pattern); + free(truncated); + goto clear_hashmaps; + } + + hashmap_add(&el->parent_hashmap, e); + hashmap_remove(&el->recursive_hashmap, e, &data); + free(data); + return; + } + + if (x->patternlen >= 2 && + !strcmp(x->pattern + x->patternlen - 2, "/*")) { + if (x->flags & EXC_FLAG_NEGATIVE) { + warning(_("unrecognized negative pattern: '%s'"), x->pattern); + goto clear_hashmaps; + } + + e = xmalloc(sizeof(struct exclude_entry)); + + truncated = xstrdup(x->pattern); + truncated[x->patternlen - 2] = 0; + e->pattern = truncated; + e->patternlen = x->patternlen - 2; + hashmap_entry_init(e, memhash(e->pattern, e->patternlen)); + + hashmap_add(&el->recursive_hashmap, e); + + if (hashmap_get(&el->parent_hashmap, e, NULL)) { + /* we already included this at the parent level */ + warning(_("your sparse-checkout file may have issues: pattern '%s' is repeated"), + x->pattern); + hashmap_remove(&el->parent_hashmap, e, &data); + free(data); + } + return; + } + +clear_hashmaps: + hashmap_free(&el->parent_hashmap, 1); + hashmap_free(&el->recursive_hashmap, 1); + el->use_cone_patterns = 0; +} + +static int hashmap_contains_path(struct hashmap *map, + struct strbuf *pattern) +{ + struct exclude_entry e; + + /* Check straight mapping */ + e.pattern = pattern->buf; + e.patternlen = pattern->len; + hashmap_entry_init(&e, memhash(e.pattern, e.patternlen)); + return !!hashmap_get(map, &e, NULL); +} + void add_exclude(const char *string, const char *base, int baselen, struct exclude_list *el, int srcpos) { @@ -623,6 +716,8 @@ void add_exclude(const char *string, const char *base, ALLOC_GROW(el->excludes, el->nr + 1, el->alloc); el->excludes[el->nr++] = x; x->el = el; + + add_exclude_to_hashsets(el, x); } static int read_skip_worktree_file_from_index(const struct index_state *istate, @@ -848,6 +943,10 @@ static int add_excludes_from_buffer(char *buf, size_t size, int i, lineno = 1; char *entry; + el->use_cone_patterns = core_sparse_checkout == SPARSE_CHECKOUT_CONE ? 1 : 0; + hashmap_init(&el->recursive_hashmap, el_hashmap_cmp, NULL, 0); + hashmap_init(&el->parent_hashmap, el_hashmap_cmp, NULL, 0); + el->filebuf = buf; if (skip_utf8_bom(&buf, size)) @@ -1070,18 +1169,61 @@ static struct exclude *last_exclude_matching_from_list(const char *pathname, /* * Scan the list and let the last match determine the fate. - * Return 1 for exclude, 0 for include and -1 for undecided. + * Return 0 for exclude, 1 for include and -1 for undecided. */ int is_excluded_from_list(const char *pathname, int pathlen, const char *basename, int *dtype, struct exclude_list *el, struct index_state *istate) { struct exclude *exclude; - exclude = last_exclude_matching_from_list(pathname, pathlen, basename, - dtype, el, istate); - if (exclude) - return exclude->flags & EXC_FLAG_NEGATIVE ? 0 : 1; - return -1; /* undecided */ + struct strbuf parent_pathname = STRBUF_INIT; + int result = 0; + const char *slash_pos; + + if (!el->use_cone_patterns) { + exclude = last_exclude_matching_from_list(pathname, pathlen, basename, + dtype, el, istate); + + if (exclude) + return exclude->flags & EXC_FLAG_NEGATIVE ? 0 : 1; + + return -1; /* undecided */ + } + + strbuf_addch(&parent_pathname, '/'); + strbuf_add(&parent_pathname, pathname, pathlen); + slash_pos = strrchr(parent_pathname.buf, '/'); + + if (slash_pos == parent_pathname.buf) { + /* include every file in root */ + result = 1; + goto done; + } + + strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf); + + if (hashmap_contains_path(&el->parent_hashmap, &parent_pathname)) { + result = 1; + goto done; + } + + while (parent_pathname.len) { + if (hashmap_contains_path(&el->recursive_hashmap, + &parent_pathname)) { + result = -1; + goto done; + } + + slash_pos = strrchr(parent_pathname.buf, '/'); + if (slash_pos == parent_pathname.buf) + break; + + strbuf_setlen(&parent_pathname, slash_pos - parent_pathname.buf); + } + +done: + strbuf_release(&parent_pathname); + return result; } static struct exclude *last_exclude_matching_from_lists(struct dir_struct *dir, diff --git a/dir.h b/dir.h index 680079bbe3..2d3356d1c0 100644 --- a/dir.h +++ b/dir.h @@ -4,6 +4,7 @@ /* See Documentation/technical/api-directory-listing.txt */ #include "cache.h" +#include "hashmap.h" #include "strbuf.h" struct dir_entry { @@ -37,6 +38,13 @@ struct exclude { int srcpos; }; +/* used for hashmaps for cone patterns */ +struct exclude_entry { + struct hashmap_entry ent; + char *pattern; + size_t patternlen; +}; + /* * Each excludes file will be parsed into a fresh exclude_list which * is appended to the relevant exclude_list_group (either EXC_DIRS or @@ -55,6 +63,25 @@ struct exclude_list { const char *src; struct exclude **excludes; + + /* + * While scanning the excludes, we attempt to match the patterns + * with a more restricted set that allows us to use hashsets for + * matching logic, which is faster than the linear lookup in the + * excludes array above. If non-zero, that check succeeded. + */ + unsigned use_cone_patterns; + + /* + * Stores paths where everything starting with those paths + * is included. + */ + struct hashmap recursive_hashmap; + + /* + * Used to check single-level parents of blobs. + */ + struct hashmap parent_hashmap; }; /* diff --git a/t/t1091-sparse-checkout-builtin.sh b/t/t1091-sparse-checkout-builtin.sh index 8cc377b839..60f10864a1 100755 --- a/t/t1091-sparse-checkout-builtin.sh +++ b/t/t1091-sparse-checkout-builtin.sh @@ -134,6 +134,14 @@ test_expect_success 'cone mode: match patterns' ' test_cmp expect dir ' +test_expect_success 'cone mode: warn on bad pattern' ' + test_when_finished mv sparse-checkout repo/.git/info && + cp repo/.git/info/sparse-checkout . && + echo "!/deep/deeper/*" >>repo/.git/info/sparse-checkout && + git -C repo read-tree -mu HEAD 2>err && + test_i18ngrep "unrecognized negative pattern" err +' + test_expect_success 'sparse-checkout disable' ' git -C repo sparse-checkout disable && test_path_is_missing repo/.git/info/sparse-checkout && -- gitgitgadget