On Wed, Jun 24, 2015 at 11:10 PM, Johannes Schindelin <johannes.schindelin@xxxxxx> wrote: > Hi Paul, > > On 2015-06-18 13:25, Paul Tan wrote: > >> diff --git a/builtin/am.c b/builtin/am.c >> index e9a3687..7b97ea8 100644 >> --- a/builtin/am.c >> +++ b/builtin/am.c >> @@ -121,6 +121,96 @@ static void am_destroy(const struct am_state *state) >> strbuf_release(&sb); >> } >> >> +/* >> + * Returns 1 if the file looks like a piece of email a-la RFC2822, 0 otherwise. >> + * We check this by grabbing all the non-indented lines and seeing if they look >> + * like they begin with valid header field names. >> + */ >> +static int is_email(const char *filename) >> +{ >> + struct strbuf sb = STRBUF_INIT; >> + FILE *fp = xfopen(filename, "r"); >> + int ret = 1; >> + >> + while (!strbuf_getline(&sb, fp, '\n')) { >> + const char *x; >> + >> + strbuf_rtrim(&sb); >> + >> + if (!sb.len) >> + break; /* End of header */ >> + >> + /* Ignore indented folded lines */ >> + if (*sb.buf == '\t' || *sb.buf == ' ') >> + continue; >> + >> + /* It's a header if it matches the regexp "^[!-9;-~]+:" */ > > Why not just compile a regex and use it here? We use regexes elsewhere anyway... Ah, you're right. A regular expression would definitely be clearer. I've fixed it on my end. >> +/** >> + * Attempts to detect the patch_format of the patches contained in `paths`, >> + * returning the PATCH_FORMAT_* enum value. Returns PATCH_FORMAT_UNKNOWN if >> + * detection fails. >> + */ >> +static int detect_patch_format(struct string_list *paths) >> +{ >> + enum patch_format ret = PATCH_FORMAT_UNKNOWN; >> + struct strbuf l1 = STRBUF_INIT; >> + struct strbuf l2 = STRBUF_INIT; >> + struct strbuf l3 = STRBUF_INIT; >> + FILE *fp; >> + >> + /* >> + * We default to mbox format if input is from stdin and for directories >> + */ >> + if (!paths->nr || !strcmp(paths->items->string, "-") || >> + is_directory(paths->items->string)) { >> + ret = PATCH_FORMAT_MBOX; >> + goto done; >> + } >> + >> + /* >> + * Otherwise, check the first 3 lines of the first patch, starting >> + * from the first non-blank line, to try to detect its format. >> + */ >> + fp = xfopen(paths->items->string, "r"); >> + while (!strbuf_getline(&l1, fp, '\n')) { >> + strbuf_trim(&l1); >> + if (l1.len) >> + break; >> + } >> + strbuf_getline(&l2, fp, '\n'); > > We should test the return value of `strbuf_getline()`; if EOF was reached already, `strbuf_getwholeline()` does not touch the strbuf. I know, the strbuf is still initialized empty here, but it is too easy to forget when changing this code. Ah OK. I'll vote for doing a strbuf_reset() just before the strbuf_getline() though. >> + strbuf_trim(&l2); >> + strbuf_getline(&l3, fp, '\n'); >> + strbuf_trim(&l3); >> + fclose(fp); >> + >> + if (starts_with(l1.buf, "From ") || starts_with(l1.buf, "From: ")) >> + ret = PATCH_FORMAT_MBOX; > > Hmm. We can test that earlier and return without reading from the file any further, I think. The "reading 3 lines at the beginning" logic is meant to support a later patch where support for StGit and mercurial patches is added. That said, it's true that we don't need to read 3 lines in this patch, so I think I will remove it in this patch. >> + else if (l1.len && l2.len && l3.len && is_email(paths->items->string)) >> + ret = PATCH_FORMAT_MBOX; > > Maybe we can do better than this by folding the `is_email() function into this here function, reusing the same strbuf to read the lines and keeping track of the email header lines we saw... I would really like to avoid opening the same file twice just to figure out whether it is in email format. Okay, how about every time we call a strbuf_getline(), we save the line to a string_list as well? Like string_list_getline_crlf() below: /** * Like strbuf_getline(), but supports both '\n' and "\r\n" as line * terminators. */ static int strbuf_getline_crlf(struct strbuf *sb, FILE *fp) { if (strbuf_getwholeline(sb, fp, '\n')) return EOF; if (sb->buf[sb->len - 1] == '\n') { strbuf_setlen(sb, sb->len - 1); if (sb->len > 0 && sb->buf[sb->len - 1] == '\r') strbuf_setlen(sb, sb->len - 1); } return 0; } /** * Like strbuf_getline_crlf(), but appends the line to a string_list and * returns it as a string. Returns NULL on EOF. */ static const char *string_list_getline_crlf(struct string_list *list, FILE *fp) { struct strbuf sb = STRBUF_INIT; struct string_list_item *item; if (strbuf_getline_crlf(&sb, fp)) return NULL; item = string_list_append_nodup(list, strbuf_detach(&sb, NULL)); return item->string; } So now, is_email() can have access to previously-read lines, and if it needs some more, it can read more as well: static int is_email(struct string_list *lines, FILE *fp) { const char *header_regex = "^[!-9;-~]+:"; regex_t regex; int ret = 1; size_t i; if (regcomp(®ex, header_regex, REG_NOSUB | REG_EXTENDED)) die("Invalid search pattern: %s", header_regex); for (i = 0; i < lines->nr || string_list_getline_crlf(lines, fp); i++) { const char *line = lines->items[i].string; if (!*line) break; /* End of header */ /* Ignore indented folded lines */ if (*line == '\t' || *line == ' ') continue; /* It's a header if it matches header_regex */ if (regexec(®ex, line, 0, NULL, 0)) { ret = 0; goto done; } } done: regfree(®ex); return ret; } Which solves the problem of opening the file 2 times. What do you think? Regards, Paul -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html