Re: [PATCH 1/5] git-compat-util: add isblank() and isgraph()

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Am 11.02.23 um 08:01 schrieb Masahiro Yamada:
> On Sat, Feb 11, 2023 at 7:03 AM René Scharfe <l.s.r@xxxxxx> wrote:
>>
>> We already use eight bits for the lookup table values in sane_ctype.
>> Perhaps it's time to move GIT_GLOB_SPECIAL, GIT_REGEX_SPECIAL,
>> GIT_PATHSPEC_MAGIC and GIT_PUNCT to their own table to make room for
>> flags for isprint(), isgraph() and isblank().
>
>
>
> I think that is a good idea.
>
> If we can have more room for isupper() and islower(),
> we will be able to delete sane_iscase().
>
>
> (I was thinking 'unsigned short', but having two tables is OK.)
>
>
>
> So, how can I proceed with this patchset?
>
>
>
> [A] After somebody refactors ctype.c table,
>     I will rebase this series on top of that.
>
> [B] keep isblank() and isgraph() local to wildmatch.c
>     until that happens, so I can proceed without
>     depending on the ctype.c refactoring.
>
>     Apparently, wildmatch.c is not using a pointer
>     increment with isblank() or isgraph().
>
> [C] If 'somebody' in [A] is supposed to me,
>     my v2 will include ctype refactoring.

[D] We need more tests first. :)  I sent patches to test the current
classifiers separately.  A similar test for isblank would have helped
you detect the mismatch quickly.  Full test coverage also gives
confidence when tinkering with the existing classifiers.

1c149ab2dd (ctype: support iscntrl, ispunct, isxdigit and isprint,
2012-10-15) added an implementation of isprint that evaluated its
argument only once, by the way, but the one from 0fcec2ce54
(format-patch: make rfc2047 encoding more strict, 2012-10-18)
replaced it.

Widening the element type of the lookup table would work, but might
impact performance.  I guess it would be slower, but perhaps we'd
have to measure it to be sure.  Splitting the table into unrelated
subsets would avoid that.

Deciding which flags to move requires knowing the full target set,
I think.  The punctuation-related flags looked to me like good
candidates until I saw the original isprint implementation which
uses them and several of the others.  So I'm not so sure anymore,
but here's a patch that moves them out:

---
 ctype.c           | 51 +++++++++++++++++++++++++++++++++++++----------
 git-compat-util.h | 21 ++++++++++---------
 2 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/ctype.c b/ctype.c
index fc0225cebd..c7a0dcba13 100644
--- a/ctype.c
+++ b/ctype.c
@@ -9,26 +9,57 @@ enum {
 	S = GIT_SPACE,
 	A = GIT_ALPHA,
 	D = GIT_DIGIT,
-	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
-	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | */
-	P = GIT_PATHSPEC_MAGIC, /* other non-alnum, except for ] and } */
 	X = GIT_CNTRL,
-	U = GIT_PUNCT,
 	Z = GIT_CNTRL | GIT_SPACE
 };

 const unsigned char sane_ctype[256] = {
 	X, X, X, X, X, X, X, X, X, Z, Z, X, X, Z, X, X,		/*   0.. 15 */
 	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,		/*  16.. 31 */
-	S, P, P, P, R, P, P, P, R, R, G, R, P, P, R, P,		/*  32.. 47 */
-	D, D, D, D, D, D, D, D, D, D, P, P, P, P, P, G,		/*  48.. 63 */
-	P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
-	A, A, A, A, A, A, A, A, A, A, A, G, G, U, R, P,		/*  80.. 95 */
-	P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
-	A, A, A, A, A, A, A, A, A, A, A, R, R, U, P, X,		/* 112..127 */
+	S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  32.. 47 */
+	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, 0,		/*  48.. 63 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
+	A, A, A, A, A, A, A, A, A, A, A, 0, 0, 0, 0, 0,		/*  80.. 95 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
+	A, A, A, A, A, A, A, A, A, A, A, 0, 0, 0, 0, X,		/* 112..127 */
 	/* Nothing in the 128.. range */
 };

+const unsigned char sane_ctype2[256] = {
+	['*'] = GIT_GLOB_SPECIAL,
+	['?'] = GIT_GLOB_SPECIAL,
+	['['] = GIT_GLOB_SPECIAL,
+	['\\'] = GIT_GLOB_SPECIAL,
+	['$'] = GIT_REGEX_SPECIAL,
+	['('] = GIT_REGEX_SPECIAL,
+	[')'] = GIT_REGEX_SPECIAL,
+	['+'] = GIT_REGEX_SPECIAL,
+	['.'] = GIT_REGEX_SPECIAL,
+	['^'] = GIT_REGEX_SPECIAL,
+	['{'] = GIT_REGEX_SPECIAL,
+	['|'] = GIT_REGEX_SPECIAL,
+	['!'] = GIT_PATHSPEC_MAGIC,
+	['"'] = GIT_PATHSPEC_MAGIC,
+	['#'] = GIT_PATHSPEC_MAGIC,
+	['%'] = GIT_PATHSPEC_MAGIC,
+	['&'] = GIT_PATHSPEC_MAGIC,
+	['\''] = GIT_PATHSPEC_MAGIC,
+	[','] = GIT_PATHSPEC_MAGIC,
+	['-'] = GIT_PATHSPEC_MAGIC,
+	['/'] = GIT_PATHSPEC_MAGIC,
+	[':'] = GIT_PATHSPEC_MAGIC,
+	[';'] = GIT_PATHSPEC_MAGIC,
+	['<'] = GIT_PATHSPEC_MAGIC,
+	['='] = GIT_PATHSPEC_MAGIC,
+	['>'] = GIT_PATHSPEC_MAGIC,
+	['@'] = GIT_PATHSPEC_MAGIC,
+	['_'] = GIT_PATHSPEC_MAGIC,
+	['`'] = GIT_PATHSPEC_MAGIC,
+	['~'] = GIT_PATHSPEC_MAGIC,
+	[']'] = GIT_PUNCT,
+	['}'] = GIT_PUNCT,
+};
+
 /* For case-insensitive kwset */
 const unsigned char tolower_trans_tbl[256] = {
 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
diff --git a/git-compat-util.h b/git-compat-util.h
index 4f0028ce60..61e71fe702 100644
--- a/git-compat-util.h
+++ b/git-compat-util.h
@@ -1228,11 +1228,7 @@ extern const unsigned char sane_ctype[256];
 #define GIT_SPACE 0x01
 #define GIT_DIGIT 0x02
 #define GIT_ALPHA 0x04
-#define GIT_GLOB_SPECIAL 0x08
-#define GIT_REGEX_SPECIAL 0x10
-#define GIT_PATHSPEC_MAGIC 0x20
 #define GIT_CNTRL 0x40
-#define GIT_PUNCT 0x80
 #define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
 #define isascii(x) (((x) & ~0x7f) == 0)
 #define isspace(x) sane_istest(x,GIT_SPACE)
@@ -1242,15 +1238,22 @@ extern const unsigned char sane_ctype[256];
 #define isprint(x) ((x) >= 0x20 && (x) <= 0x7e)
 #define islower(x) sane_iscase(x, 1)
 #define isupper(x) sane_iscase(x, 0)
-#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
-#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
 #define iscntrl(x) (sane_istest(x,GIT_CNTRL))
-#define ispunct(x) sane_istest(x, GIT_PUNCT | GIT_REGEX_SPECIAL | \
-		GIT_GLOB_SPECIAL | GIT_PATHSPEC_MAGIC)
 #define isxdigit(x) (hexval_table[(unsigned char)(x)] != -1)
 #define tolower(x) sane_case((unsigned char)(x), 0x20)
 #define toupper(x) sane_case((unsigned char)(x), 0)
-#define is_pathspec_magic(x) sane_istest(x,GIT_PATHSPEC_MAGIC)
+
+extern const unsigned char sane_ctype2[256];
+#define GIT_GLOB_SPECIAL 0x01
+#define GIT_REGEX_SPECIAL 0x02
+#define GIT_PATHSPEC_MAGIC 0x04
+#define GIT_PUNCT 0x08
+#define sane_istest2(x,mask) ((sane_ctype2[(unsigned char)(x)] & (mask)) != 0)
+#define is_glob_special(x) sane_istest2(x,GIT_GLOB_SPECIAL)
+#define is_regex_special(x) sane_istest2(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
+#define ispunct(x) sane_istest2(x, GIT_PUNCT | GIT_REGEX_SPECIAL | \
+		GIT_GLOB_SPECIAL | GIT_PATHSPEC_MAGIC)
+#define is_pathspec_magic(x) sane_istest2(x,GIT_PATHSPEC_MAGIC)

 static inline int sane_case(int x, int high)
 {
--
2.39.1




[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux