On Mon, Jan 9, 2023 at 4:29 AM Đoàn Trần Công Danh <congdanhqx@xxxxxxxxx> wrote: > > The newly added heuristic to parse less precision ISO-8601 conflicts > with other heuristics to parse datetime-strings. E.g.: > > Thu, 7 Apr 2005 15:14:13 -0700 > > Let's limit the new heuristic to only datetime string with a 'T' > followed immediately by some digits, and if we failed to parse the > upcoming string, rollback the change. > > Signed-off-by: Đoàn Trần Công Danh <congdanhqx@xxxxxxxxx> > --- > > Here is a better thought out change, which tried to minimize the impact of > new heuristics. > > While I think it's a fixup, it still needs explanation, so I think I may > reword it as a full patch instead. > Range-diff: > 1: 4036e5a944 ! 1: b703425a57 fixup! date.c: allow ISO 8601 reduced precision times > @@ Metadata > Author: Đoàn Trần Công Danh <congdanhqx@xxxxxxxxx> > > ## Commit message ## > - fixup! date.c: allow ISO 8601 reduced precision times > + date.c: limit less precision ISO-8601 with its marker > + > + The newly added heuristic to parse less precision ISO-8601 conflicts > + with other heuristics to parse datetime-strings. E.g.: > + > + Thu, 7 Apr 2005 15:14:13 -0700 > + > + Let's limit the new heuristic to only datetime string with a 'T' > + followed immediately by some digits, and if we failed to parse the > + upcoming string, rollback the change. > > Signed-off-by: Đoàn Trần Công Danh <congdanhqx@xxxxxxxxx> > > @@ date.c: static int match_alpha(const char *date, struct tm *tm, int *offset) > } > > + /* ISO-8601 allows yyyymmDD'T'HHMMSS, with less precision */ > -+ if (*date == 'T' && isdigit(date[1])) { > -+ tm->tm_hour = tm->tm_min = tm->tm_sec = 0; > -+ return strlen("T"); > ++ if (*date == 'T' && isdigit(date[1]) && tm->tm_hour == -1) { > ++ tm->tm_min = tm->tm_sec = 0; > ++ return 1; > + } > + > /* BAD CRAP */ > @@ date.c: static inline int nodate(struct tm *tm) > - * We just do a binary 'and' to see if the sign bit > - * is set in all the values. 
> + * Have we seen an ISO-8601-alike date, i.e. 20220101T0, > -+ * In those special case, those fields have been set to 0 > ++ * In which, hour is still unset, > ++ * and minutes and second has been set to 0. > */ > -static inline int notime(struct tm *tm) > +static inline int maybeiso8601(struct tm *tm) > @@ date.c: static inline int nodate(struct tm *tm) > - return (tm->tm_hour & > - tm->tm_min & > - tm->tm_sec) < 0; > -+ return tm->tm_hour == 0 && > ++ return tm->tm_hour == -1 && > + tm->tm_min == 0 && > + tm->tm_sec == 0; > } > > /* > @@ date.c: static int match_digit(const char *date, struct tm *tm, int *offset, int *tm_gmt > - /* 4 digits, compact style of ISO-8601's time: HHMM */ > - /* 2 digits, compact style of ISO-8601's time: HH */ > - if (n == 8 || n == 6 || > + > + /* 8 digits, compact style of ISO-8601's date: YYYYmmDD */ > + /* 6 digits, compact style of ISO-8601's time: HHMMSS */ > +- /* 4 digits, compact style of ISO-8601's time: HHMM */ > +- /* 2 digits, compact style of ISO-8601's time: HH */ > +- if (n == 8 || n == 6 || > - (!nodate(tm) && notime(tm) && > -+ (!nodate(tm) && maybeiso8601(tm) && > - (n == 4 || n == 2))) { > +- (n == 4 || n == 2))) { > ++ if (n == 8 || n == 6) { > unsigned int num1 = num / 10000; > unsigned int num2 = (num % 10000) / 100; > + unsigned int num3 = num % 100; > +@@ date.c: static int match_digit(const char *date, struct tm *tm, int *offset, int *tm_gmt > + else if (n == 6 && set_time(num1, num2, num3, tm) == 0 && > + *end == '.' 
&& isdigit(end[1])) > + strtoul(end + 1, &end, 10); > +- else if (n == 4) > +- set_time(num2, num3, 0, tm); > +- else if (n == 2) > +- set_time(num3, 0, 0, tm); > + return end - date; > + } > + > ++ /* reduced precision of ISO-8601's time: HHMM or HH */ > ++ if (maybeiso8601(tm)) { > ++ unsigned int num1 = num; > ++ unsigned int num2 = 0; > ++ if (n == 4) { > ++ num1 = num / 100; > ++ num2 = num % 100; > ++ } > ++ if ((n == 4 || n == 2) && !nodate(tm) && > ++ set_time(num1, num2, 0, tm) == 0) > ++ return n; > ++ /* > ++ * We thought this is an ISO-8601 time string, > ++ * we set minutes and seconds to 0, > ++ * turn out it isn't, rollback the change. > ++ */ > ++ tm->tm_min = tm->tm_sec = -1; > ++ } > ++ > + /* Four-digit year or a timezone? */ > + if (n == 4) { > + if (num <= 1400 && *offset == -1) { > > ## t/t0006-date.sh ## > @@ t/t0006-date.sh: check_parse '20080214T20:30' '2008-02-14 20:30:00 +0000' > > date.c | 49 +++++++++++++++++++++++++++++++++---------------- > t/t0006-date.sh | 3 ++- > 2 files changed, 35 insertions(+), 17 deletions(-) > > diff --git a/date.c b/date.c > index b011b9d6b3..6f45eeb356 100644 > --- a/date.c > +++ b/date.c > @@ -493,6 +493,12 @@ static int match_alpha(const char *date, struct tm *tm, int *offset) > return 2; > } > > + /* ISO-8601 allows yyyymmDD'T'HHMMSS, with less precision */ > + if (*date == 'T' && isdigit(date[1]) && tm->tm_hour == -1) { > + tm->tm_min = tm->tm_sec = 0; > + return 1; > + } > + > /* BAD CRAP */ > return skip_alpha(date); > } > @@ -639,15 +645,15 @@ static inline int nodate(struct tm *tm) > } > > /* > - * Have we filled in any part of the time yet? > - * We just do a binary 'and' to see if the sign bit > - * is set in all the values. > + * Have we seen an ISO-8601-alike date, i.e. 20220101T0, > + * In which, hour is still unset, > + * and minutes and second has been set to 0. 
> */ > -static inline int notime(struct tm *tm) > +static inline int maybeiso8601(struct tm *tm) > { > - return (tm->tm_hour & > - tm->tm_min & > - tm->tm_sec) < 0; > + return tm->tm_hour == -1 && > + tm->tm_min == 0 && > + tm->tm_sec == 0; > } > > /* > @@ -701,11 +707,7 @@ static int match_digit(const char *date, struct tm *tm, int *offset, int *tm_gmt > > /* 8 digits, compact style of ISO-8601's date: YYYYmmDD */ > /* 6 digits, compact style of ISO-8601's time: HHMMSS */ > - /* 4 digits, compact style of ISO-8601's time: HHMM */ > - /* 2 digits, compact style of ISO-8601's time: HH */ > - if (n == 8 || n == 6 || > - (!nodate(tm) && notime(tm) && > - (n == 4 || n == 2))) { > + if (n == 8 || n == 6) { > unsigned int num1 = num / 10000; > unsigned int num2 = (num % 10000) / 100; > unsigned int num3 = num % 100; > @@ -714,13 +716,28 @@ static int match_digit(const char *date, struct tm *tm, int *offset, int *tm_gmt > else if (n == 6 && set_time(num1, num2, num3, tm) == 0 && > *end == '.' && isdigit(end[1])) > strtoul(end + 1, &end, 10); > - else if (n == 4) > - set_time(num2, num3, 0, tm); > - else if (n == 2) > - set_time(num3, 0, 0, tm); > return end - date; > } > > + /* reduced precision of ISO-8601's time: HHMM or HH */ > + if (maybeiso8601(tm)) { > + unsigned int num1 = num; > + unsigned int num2 = 0; > + if (n == 4) { > + num1 = num / 100; > + num2 = num % 100; > + } > + if ((n == 4 || n == 2) && !nodate(tm) && > + set_time(num1, num2, 0, tm) == 0) > + return n; > + /* > + * We thought this is an ISO-8601 time string, > + * we set minutes and seconds to 0, > + * turn out it isn't, rollback the change. > + */ > + tm->tm_min = tm->tm_sec = -1; > + } > + > /* Four-digit year or a timezone? 
*/ > if (n == 4) { > if (num <= 1400 && *offset == -1) { > diff --git a/t/t0006-date.sh b/t/t0006-date.sh > index 16fb0bf4bd..130207fc04 100755 > --- a/t/t0006-date.sh > +++ b/t/t0006-date.sh > @@ -93,7 +93,8 @@ check_parse '20080214T20:30' '2008-02-14 20:30:00 +0000' > check_parse '20080214T20' '2008-02-14 20:00:00 +0000' > check_parse '20080214T203045' '2008-02-14 20:30:45 +0000' > check_parse '20080214T2030' '2008-02-14 20:30:00 +0000' > -check_parse '20080214T20' '2008-02-14 20:00:00 +0000' > +check_parse '20080214T000000.20' '2008-02-14 00:00:00 +0000' > +check_parse '20080214T00:00:00.20' '2008-02-14 00:00:00 +0000' > check_parse '20080214T203045-04:00' '2008-02-14 20:30:45 -0400' > check_parse '20080214T203045 -04:00' '2008-02-14 20:30:45 -0400' > check_parse '20080214T203045.019-04:00' '2008-02-14 20:30:45 -0400' > -- > 2.39.0.287.g690a66fa66 > Thanks, Đoàn. LGTM, and much safer.