Am 10.05.20 um 11:07 schrieb René Scharfe: > Would a stack work? When we see a candidate non-directory, we put > it on the stack. When we see a candidate directory, we compare it > to the entry at the top of the stack using strcmp(). Equality > indicates a duplicate and we are done. If the directory name is > less then we can pop the entry from the stack and check again, as > we're past the point where a duplicate would be. Makes sense? > > The candidate stack solution would have linear complexity and > require less memory than a full list of candidates. > > It would rely on the entries to be in the correct order (same as > the patch below, come to think of it), but that's probably OK. > We may miss DUPLICATE_ENTRIES (just like today :), but > TREE_NOT_SORTED would still be reported. Here's what I mean. It seems to work, but could use some critical thought and testing. -- >8 -- Subject: [PATCH] fsck: report non-consecutive duplicate names in trees Tree entries are sorted in path order, meaning that directory names get a slash ('/') appended implicitly. Git fsck checks if trees contains consecutive duplicates, but due to that ordering there can be non-consecutive duplicates as well if one of them is a directory and the other one isn't. Such a tree cannot be fully checked out. Find these duplicates by recording candidate file names on a stack and check candidate directory names against that stack to find matches. Suggested-by: Brandon Williams <bwilliamseng@xxxxxxxxx> Original-test-by: Brandon Williams <bwilliamseng@xxxxxxxxx> Signed-off-by: René Scharfe <l.s.r@xxxxxx> --- fsck.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++-- t/t1450-fsck.sh | 16 +++++++++++ 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/fsck.c b/fsck.c index 087a7f1ffc..8bb3ecf282 100644 --- a/fsck.c +++ b/fsck.c @@ -523,6 +523,28 @@ int fsck_walk(struct object *obj, void *data, struct fsck_options *options) } } +struct name_stack { + const char **names; + size_t nr, alloc; +}; + +static void name_stack_push(struct name_stack *stack, const char *name) +{ + ALLOC_GROW(stack->names, stack->nr + 1, stack->alloc); + stack->names[stack->nr++] = name; +} + +static const char *name_stack_pop(struct name_stack *stack) +{ + return stack->nr ? stack->names[--stack->nr] : NULL; +} + +static void name_stack_clear(struct name_stack *stack) +{ + FREE_AND_NULL(stack->names); + stack->nr = stack->alloc = 0; +} + /* * The entries in a tree are ordered in the _path_ order, * which means that a directory entry is ordered by adding @@ -534,7 +556,14 @@ int fsck_walk(struct object *obj, void *data, struct fsck_options *options) #define TREE_UNORDERED (-1) #define TREE_HAS_DUPS (-2) -static int verify_ordered(unsigned mode1, const char *name1, unsigned mode2, const char *name2) +static int is_less_than_slash(unsigned char c) +{ + return '\0' < c && c < '/'; +} + +static int verify_ordered(unsigned mode1, const char *name1, + unsigned mode2, const char *name2, + struct name_stack *candidates) { int len1 = strlen(name1); int len2 = strlen(name2); @@ -566,6 +595,41 @@ static int verify_ordered(unsigned mode1, const char *name1, unsigned mode2, con c1 = '/'; if (!c2 && S_ISDIR(mode2)) c2 = '/'; + + /* + * There can be non-consecutive duplicates due to the implicitly + * add slash, e.g.: + * + * foo + * foo.bar + * foo.bar.baz + * foo.bar/ + * foo/ + * + * Record non-directory candidates (like "foo" and "foo.bar" in + * the example) on a stack and check directory candidates (like + * foo/" and "foo.bar/") against that stack. + */ + if (!c1 && is_less_than_slash(c2)) { + name_stack_push(candidates, name1); + } else if (c2 == '/' && is_less_than_slash(c1)) { + for (;;) { + const char *p; + const char *f_name = name_stack_pop(candidates); + + if (!f_name) + break; + if (!skip_prefix(name2, f_name, &p)) + break; + if (!*p) + return TREE_HAS_DUPS; + if (is_less_than_slash(*p)) { + name_stack_push(candidates, f_name); + break; + } + } + } + return c1 < c2 ? 0 : TREE_UNORDERED; } @@ -587,6 +651,7 @@ static int fsck_tree(const struct object_id *oid, struct tree_desc desc; unsigned o_mode; const char *o_name; + struct name_stack df_dup_candidates = { NULL }; if (init_tree_desc_gently(&desc, buffer, size)) { retval += report(options, oid, OBJ_TREE, FSCK_MSG_BAD_TREE, "cannot be parsed as a tree"); @@ -666,7 +731,8 @@ static int fsck_tree(const struct object_id *oid, } if (o_name) { - switch (verify_ordered(o_mode, o_name, mode, name)) { + switch (verify_ordered(o_mode, o_name, mode, name, + &df_dup_candidates)) { case TREE_UNORDERED: not_properly_sorted = 1; break; @@ -682,6 +748,8 @@ static int fsck_tree(const struct object_id *oid, o_name = name; } + name_stack_clear(&df_dup_candidates); + if (has_null_sha1) retval += report(options, oid, OBJ_TREE, FSCK_MSG_NULL_SHA1, "contains entries pointing to null sha1"); if (has_full_path) diff --git a/t/t1450-fsck.sh b/t/t1450-fsck.sh index 449ebc5657..91a6e34f38 100755 --- a/t/t1450-fsck.sh +++ b/t/t1450-fsck.sh @@ -257,6 +257,22 @@ test_expect_success 'tree object with duplicate entries' ' test_i18ngrep "error in tree .*contains duplicate file entries" out ' +test_expect_success 'tree object with dublicate names' ' + test_when_finished "remove_object \$blob" && + test_when_finished "remove_object \$tree" && + test_when_finished "remove_object \$badtree" && + blob=$(echo blob | git hash-object -w --stdin) && + printf "100644 blob %s\t%s\n" $blob x.2 >tree && + tree=$(git mktree <tree) && + printf "100644 blob %s\t%s\n" $blob x.1 >badtree && + printf "100644 blob %s\t%s\n" $blob x >>badtree && + printf "040000 tree %s\t%s\n" $tree x >>badtree && + badtree=$(git mktree <badtree) && + test_must_fail git fsck 2>out && + test_i18ngrep "$badtree" out && + test_i18ngrep "error in tree .*contains duplicate file entries" out +' + test_expect_success 'unparseable tree object' ' test_oid_cache <<-\EOF && junk sha1:twenty-bytes-of-junk -- 2.26.2