Am 26.02.23 um 12:50 schrieb Masahiro Yamada: > git-compat-util.h implements most of is*() macros. > > Add isblank() and isgraph(), which are useful to clean up wildmatch.c > in a consistent way (in this and later commits). > > In the previous submission, I just moved isblank() and isgraph() as > implemented in wildmatch.c. I knew they were not robust against the > pointer increment like isblank(*s++), but I thought it was the same > pattern as isprint(), which has the same issue. Unfortunately, it was > more controversial than I had expected... Not sure we need that story in the commit message, but it gave me an idea: To go back to the isprint() version from 1c149ab2dd (ctype: support iscntrl, ispunct, isxdigit and isprint, 2012-10-15), which evaluates its argument only once: #define isprint(x) (sane_istest(x, GIT_ALPHA | GIT_DIGIT | GIT_SPACE | \ GIT_PUNCT | GIT_REGEX_SPECIAL | GIT_GLOB_SPECIAL | \ GIT_PATHSPEC_MAGIC)) But then I realized that it wrongly classifies \t, \r and \n as being printing characters; 567342fc77 (test-ctype: test iscntrl, ispunct, isxdigit and isprint, 2023-02-13) shows it. So it's not so easy, however, ... > This version implements them as inline functions because we ran out > all bits in the sane_ctype[] table. This is the same pattern as > islower() and isupper(). ... if you remove GIT_SPACE from the definition above you get a macro version of isgraph() that uses a single table lookup. > Once we refactor ctype.c to create more room in sane_ctype[], isblank() > and isgraph() will be able to use sane_istest(). Probably so will > islower() and isupper(). The ctype in Linux kernel (lib/ctype.c) has > the LOWER and UPPER bits separately. If we're out of bits then isblank() is a good choice to implement without a table lookup, as this class only contains two characters and two comparisons should be quite fast. Stepping back a bit: Is using the unlocalized is* macros in wildmatch() safe, i.e. do we get the same results as before regardless of locale? Junio's remark in https://lore.kernel.org/git/xmqq3579crsd.fsf@gitster.g/ sounds convincing to me if we don't care about single-byte code pages and require plain ASCII or UTF-8. I think it's a good idea to address that point in the commit message. And adding tests to t/helper/test-ctype.c would be nice. > Signed-off-by: Masahiro Yamada <masahiroy@xxxxxxxxxx> > --- > > Changes in v2: > - Use inline functions > > git-compat-util.h | 14 ++++++++++++++ > wildmatch.c | 14 ++------------ > 2 files changed, 16 insertions(+), 12 deletions(-) > > diff --git a/git-compat-util.h b/git-compat-util.h > index 4f0028ce60..b29c238f02 100644 > --- a/git-compat-util.h > +++ b/git-compat-util.h > @@ -1212,10 +1212,12 @@ extern const unsigned char tolower_trans_tbl[256]; > /* Sane ctype - no locale, and works with signed chars */ > #undef isascii > #undef isspace > +#undef isblank > #undef isdigit > #undef isalpha > #undef isalnum > #undef isprint > +#undef isgraph > #undef islower > #undef isupper > #undef tolower > @@ -1236,10 +1238,12 @@ extern const unsigned char sane_ctype[256]; > #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0) > #define isascii(x) (((x) & ~0x7f) == 0) > #define isspace(x) sane_istest(x,GIT_SPACE) > +#define isblank(x) sane_isblank(x) > #define isdigit(x) sane_istest(x,GIT_DIGIT) > #define isalpha(x) sane_istest(x,GIT_ALPHA) > #define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT) > #define isprint(x) ((x) >= 0x20 && (x) <= 0x7e) > +#define isgraph(x) sane_isgraph(x) > #define islower(x) sane_iscase(x, 1) > #define isupper(x) sane_iscase(x, 0) > #define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL) > @@ -1270,6 +1274,16 @@ static inline int sane_iscase(int x, int is_lower) > return (x & 0x20) == 0; > } > > +static inline int sane_isblank(int c) > +{ > + return c == ' ' || c == '\t'; > +} > + > +static inline int sane_isgraph(int c) > +{ > + return isprint(c) && !isspace(c); > +} > + > /* > * Like skip_prefix, but compare case-insensitively. Note that the comparison > * is done via tolower(), so it is strictly ASCII (no multi-byte characters or > diff --git a/wildmatch.c b/wildmatch.c > index 7e5a7ea1ea..85c4c7f8a7 100644 > --- a/wildmatch.c > +++ b/wildmatch.c > @@ -28,18 +28,8 @@ typedef unsigned char uchar; > # define ISASCII(c) isascii(c) > #endif > > -#ifdef isblank > -# define ISBLANK(c) (ISASCII(c) && isblank(c)) > -#else > -# define ISBLANK(c) ((c) == ' ' || (c) == '\t') > -#endif > - > -#ifdef isgraph > -# define ISGRAPH(c) (ISASCII(c) && isgraph(c)) > -#else > -# define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c)) > -#endif > - > +#define ISBLANK(c) (ISASCII(c) && isblank(c)) > +#define ISGRAPH(c) (ISASCII(c) && isgraph(c)) > #define ISPRINT(c) (ISASCII(c) && isprint(c)) > #define ISDIGIT(c) (ISASCII(c) && isdigit(c)) > #define ISALNUM(c) (ISASCII(c) && isalnum(c))