From: Torsten Bögershausen <tboegi@xxxxxx> When statistics are gathered for the autocrlf handling, the search through the content can be stopped early, for example when: - a search for binary is done and a NUL character is found; - a search for CRLF is done and the first CRLF is found. Similarly, when statistics for binary vs. non-binary are gathered, the search can be aborted as soon as a lone CR or a NUL is found. When checking out files in "auto" mode, any file that has a "lone CR" or a CRLF will not be converted, so the search can be aborted early. Add the new bit, CONVERT_STAT_BITS_ANY_CR, which is set for either a lone CR or a CRLF. Many binary files have a NUL very early (within the first few bytes, at the latest within the first 1..2K). It is often not necessary to load the whole content of a file or blob into memory. Use streaming to handle blobs and files in the worktree. Signed-off-by: Torsten Bögershausen <tboegi@xxxxxx> --- convert.c | 162 ++++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 106 insertions(+), 56 deletions(-) diff --git a/convert.c b/convert.c index f524b8d..b6da114 100644 --- a/convert.c +++ b/convert.c @@ -3,6 +3,7 @@ #include "run-command.h" #include "quote.h" #include "sigchain.h" +#include "streaming.h" /* * convert.c - convert a file when checking it out and checking it in. @@ -13,10 +14,10 @@ * translation when the "text" attribute or "auto_crlf" option is set. */ -/* Stat bits: When BIN is set, the txt bits are unset */ #define CONVERT_STAT_BITS_TXT_LF 0x1 #define CONVERT_STAT_BITS_TXT_CRLF 0x2 #define CONVERT_STAT_BITS_BIN 0x4 +#define CONVERT_STAT_BITS_ANY_CR 0x8 enum crlf_action { CRLF_UNDEFINED, @@ -31,30 +32,36 @@ enum crlf_action { struct text_stat { /* NUL, CR, LF and CRLF counts */ - unsigned nul, lonecr, lonelf, crlf; + unsigned stat_bits, lonecr, lonelf, crlf; /* These are just approximations! 
*/ unsigned printable, nonprintable; }; -static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats) +static void do_gather_stats(const char *buf, unsigned long size, + struct text_stat *stats, unsigned earlyout) { unsigned long i; - memset(stats, 0, sizeof(*stats)); - + if (!buf || !size) + return; for (i = 0; i < size; i++) { unsigned char c = buf[i]; if (c == '\r') { + stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR; if (i+1 < size && buf[i+1] == '\n') { stats->crlf++; i++; - } else + stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF; + } else { stats->lonecr++; + stats->stat_bits |= CONVERT_STAT_BITS_BIN; + } continue; } if (c == '\n') { stats->lonelf++; + stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF; continue; } if (c == 127) @@ -67,7 +74,7 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat * stats->printable++; break; case 0: - stats->nul++; + stats->stat_bits |= CONVERT_STAT_BITS_BIN; /* fall through */ default: stats->nonprintable++; @@ -75,6 +82,8 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat * } else stats->printable++; + if (stats->stat_bits & earlyout) + break; /* We found what we have been searching for */ } /* If file ends with EOF then don't count this EOF as non-printable. 
*/ @@ -86,41 +95,63 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat * * The same heuristics as diff.c::mmfile_is_binary() * We treat files with bare CR as binary */ -static int convert_is_binary(unsigned long size, const struct text_stat *stats) +static void convert_nonprintable(struct text_stat *stats) { - if (stats->lonecr) - return 1; - if (stats->nul) - return 1; if ((stats->printable >> 7) < stats->nonprintable) - return 1; - return 0; + stats->stat_bits |= CONVERT_STAT_BITS_BIN; +} + +static void gather_stats(const char *buf, unsigned long size, + struct text_stat *stats, unsigned earlyout) +{ + memset(stats, 0, sizeof(*stats)); + do_gather_stats(buf, size, stats, earlyout); + convert_nonprintable(stats); } -static unsigned int gather_convert_stats(const char *data, unsigned long size) + +static unsigned get_convert_stats_sha1(const char *path, + unsigned const char *sha1, + unsigned earlyout) { + struct git_istream *st; struct text_stat stats; - int ret = 0; - if (!data || !size) - return 0; - gather_stats(data, size, &stats); - if (convert_is_binary(size, &stats)) - ret |= CONVERT_STAT_BITS_BIN; - if (stats.crlf) - ret |= CONVERT_STAT_BITS_TXT_CRLF; - if (stats.lonelf) - ret |= CONVERT_STAT_BITS_TXT_LF; + enum object_type type; + unsigned long sz; - return ret; + if (!sha1) + return 0; + memset(&stats, 0, sizeof(stats)); + st = open_istream(sha1, &type, &sz, NULL); + if (!st) { + return 0; + } + if (type != OBJ_BLOB) + goto close_and_exit_i; + for (;;) { + char buf[1024]; + ssize_t readlen = read_istream(st, buf, sizeof(buf)); + if (readlen < 0) + break; + if (!readlen) + break; + do_gather_stats(buf, (unsigned long)readlen, &stats, earlyout); + if (stats.stat_bits & earlyout) + break; /* We found what we have been searching for */ + } +close_and_exit_i: + close_istream(st); + convert_nonprintable(&stats); + return stats.stat_bits; } -static const char *gather_convert_stats_ascii(const char *data, unsigned long size) 
+static const char *convert_stats_ascii(unsigned convert_stats) { - unsigned int convert_stats = gather_convert_stats(data, size); - + unsigned mask = CONVERT_STAT_BITS_TXT_LF | + CONVERT_STAT_BITS_TXT_CRLF; if (convert_stats & CONVERT_STAT_BITS_BIN) return "-text"; - switch (convert_stats) { + switch (convert_stats & mask) { case CONVERT_STAT_BITS_TXT_LF: return "lf"; case CONVERT_STAT_BITS_TXT_CRLF: @@ -132,24 +163,46 @@ static const char *gather_convert_stats_ascii(const char *data, unsigned long si } } +static unsigned get_convert_stats_wt(const char *path) +{ + struct text_stat stats; + unsigned earlyout = CONVERT_STAT_BITS_BIN; + int fd; + memset(&stats, 0, sizeof(stats)); + fd = open(path, O_RDONLY); + if (fd < 0) + return 0; + for (;;) { + char buf[1024]; + ssize_t readlen = read(fd, buf, sizeof(buf)); + if (readlen < 0) + break; + if (!readlen) + break; + do_gather_stats(buf, (unsigned long)readlen, &stats, earlyout); + if (stats.stat_bits & earlyout) + break; /* We found what we have been searching for */ + } + close(fd); + convert_nonprintable(&stats); + return stats.stat_bits; +} + const char *get_cached_convert_stats_ascii(const char *path) { - const char *ret; - unsigned long sz; - void *data = read_blob_data_from_cache(path, &sz); - ret = gather_convert_stats_ascii(data, sz); - free(data); - return ret; + unsigned convert_stats; + unsigned earlyout = CONVERT_STAT_BITS_BIN; + convert_stats = get_convert_stats_sha1(path, + get_sha1_from_cache(path), + earlyout); + return convert_stats_ascii(convert_stats); } const char *get_wt_convert_stats_ascii(const char *path) { - const char *ret = ""; - struct strbuf sb = STRBUF_INIT; - if (strbuf_read_file(&sb, path, 0) >= 0) - ret = gather_convert_stats_ascii(sb.buf, sb.len); - strbuf_release(&sb); - return ret; + unsigned convert_stats; + convert_stats = get_convert_stats_wt(path); + return convert_stats_ascii(convert_stats); } static int text_eol_is_crlf(void) @@ -219,16 +272,11 @@ static void 
check_safe_crlf(const char *path, enum crlf_action crlf_action, static int has_cr_in_index(const char *path) { - unsigned long sz; - void *data; - int has_cr; - - data = read_blob_data_from_cache(path, &sz); - if (!data) - return 0; - has_cr = memchr(data, '\r', sz) != NULL; - free(data); - return has_cr; + unsigned convert_stats; + convert_stats = get_convert_stats_sha1(path, + get_sha1_from_cache(path), + CONVERT_STAT_BITS_ANY_CR); + return convert_stats & CONVERT_STAT_BITS_ANY_CR; } static int crlf_to_git(const char *path, const char *src, size_t len, @@ -249,10 +297,10 @@ static int crlf_to_git(const char *path, const char *src, size_t len, if (!buf && !src) return 1; - gather_stats(src, len, &stats); + gather_stats(src, len, &stats, CONVERT_STAT_BITS_BIN); if (crlf_action == CRLF_AUTO || crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) { - if (convert_is_binary(len, &stats)) + if (stats.stat_bits & CONVERT_STAT_BITS_BIN) return 0; if (crlf_action == CRLF_AUTO_INPUT || crlf_action == CRLF_AUTO_CRLF) { @@ -309,11 +357,13 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len, { char *to_free = NULL; struct text_stat stats; + unsigned earlyout = CONVERT_STAT_BITS_TXT_CRLF | CONVERT_STAT_BITS_BIN; + if (!len || output_eol(crlf_action) != EOL_CRLF) return 0; - gather_stats(src, len, &stats); + gather_stats(src, len, &stats, earlyout); /* No "naked" LF? Nothing to convert, regardless. */ if (!stats.lonelf) @@ -327,7 +377,7 @@ static int crlf_to_worktree(const char *path, const char *src, size_t len, return 0; } - if (convert_is_binary(len, &stats)) + if (stats.stat_bits & CONVERT_STAT_BITS_BIN) return 0; } -- 2.8.0.rc2.6.g3847ccb -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html