tboegi@xxxxxx writes: > -static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats) > +static void gather_stats_partly(const char *buf, unsigned long len, > + struct text_stat *stats, unsigned earlyout) > { I think it is OK not to rename the function (you'd be passing earlyout=0 for callers that want exact stat, right?). > unsigned long i; > > - memset(stats, 0, sizeof(*stats)); > - > - for (i = 0; i < size; i++) { > + if (!buf || !len) > + return; > + for (i = 0; i < len; i++) { > unsigned char c = buf[i]; > if (c == '\r') { > - if (i+1 < size && buf[i+1] == '\n') { > + stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR; > + if (i+1 < len && buf[i+1] == '\n') { > stats->crlf++; > i++; > - } else > + stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF; > + } else { > stats->lonecr++; > + stats->stat_bits |= CONVERT_STAT_BITS_BIN; > + } > continue; > } > if (c == '\n') { > stats->lonelf++; > + stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF; > continue; > } > if (c == 127) > @@ -67,7 +74,7 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat * > stats->printable++; > break; > case 0: > - stats->nul++; > + stats->stat_bits |= CONVERT_STAT_BITS_BIN; > /* fall through */ > default: > stats->nonprintable++; So depending on the distribution of the bytes in the file, the bitfields in stats->stat_bits will be filled one bit at a time in random order. > @@ -75,10 +82,12 @@ static void gather_stats(const char *buf, unsigned long size, struct text_stat * > } > else > stats->printable++; > + if (stats->stat_bits & earlyout) > + break; /* We found what we have been searching for */ But an "earlyout" says that if "any" of the earlyout bit is seen, we can return. It somehow felt a bit too limited to me in my initial reading, but I guess I shouldn't be surprised to see that such a limited interface is sufficient for a file-local helper function ;-). The only caller that the semantics of this exit condition matters is the one that wants to know "do we have NUL or CR anywhere?", so I guess this should be sufficient. > } > > /* If file ends with EOF then don't count this EOF as non-printable. */ > - if (size >= 1 && buf[size-1] == '\032') > + if (len >= 1 && buf[len-1] == '\032') > stats->nonprintable--; This noise is somewhat irritating. Was there a reason why size was a bad name for the variable? > +static const char *convert_stats_ascii(unsigned convert_stats) > { > - unsigned int convert_stats = gather_convert_stats(data, size); > - > + const unsigned mask = CONVERT_STAT_BITS_TXT_LF | > + CONVERT_STAT_BITS_TXT_CRLF; > if (convert_stats & CONVERT_STAT_BITS_BIN) > return "-text"; > - switch (convert_stats) { > + switch (convert_stats & mask) { > case CONVERT_STAT_BITS_TXT_LF: > return "lf"; > case CONVERT_STAT_BITS_TXT_CRLF: Subtle. The caller runs the stat colllection with early-out set to BITS_BIN, so that this can set "-text" early. It knows that without BITS_BIN, the stat was taken for the whole contents and the check lf or crlf can be reliable. I wonder if we can/need to do something to remove this subtleness out of this callchain, which could be a source of confusion. > @@ -132,24 +162,45 @@ static const char *gather_convert_stats_ascii(const char *data, unsigned long si > } > } > > +static unsigned get_convert_stats_wt(const char *path) > +{ > + struct text_stat stats; > + unsigned earlyout = CONVERT_STAT_BITS_BIN; > + int fd; > + memset(&stats, 0, sizeof(stats)); > + fd = open(path, O_RDONLY); > + if (fd < 0) > + return 0; > + for (;;) { > + char buf[2*1024]; Where is this 2kB come from? Out of thin air?