Add the requisite markers for multi-byte characters so that the expansion code can recognise them. Also allow wide blank characters to terminate words. Signed-off-by: Herbert Xu <herbert@xxxxxxxxxxxxxxxxxxx> --- src/expand.c | 19 +++++++ src/mktokens | 1 + src/parser.c | 136 +++++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 129 insertions(+), 27 deletions(-) diff --git a/src/expand.c b/src/expand.c index a3b81d5..eedd69d 100644 --- a/src/expand.c +++ b/src/expand.c @@ -275,6 +275,7 @@ static char *argstr(char *p, int flag) CTLESC, CTLVAR, CTLBACKQ, + CTLMBCHAR, CTLARI, CTLENDARI, 0 @@ -299,6 +300,8 @@ tilde: start: startloc = expdest - (char *)stackblock(); for (;;) { + unsigned ml; + unsigned mb; int end; length += strcspn(p + length, reject); @@ -361,6 +364,22 @@ addquote: startloc++; } break; + case CTLMBCHAR: + c = (signed char)*p--; + mb = mbnext(p); + ml = (mb >> 8) - 2; + if (flag & QUOTES_ESC) { + length = (mb >> 8) + (mb & 0xff); + if (c == (char)CTLESC) + startloc += length; + break; + } + if (c == CTLESC) + startloc += ml; + p += mb & 0xff; + expdest = stnputs(p, ml, expdest); + p += mb >> 8; + break; case CTLESC: startloc++; length++; diff --git a/src/mktokens b/src/mktokens index 78055be..dcef676 100644 --- a/src/mktokens +++ b/src/mktokens @@ -41,6 +41,7 @@ cat > "${TMPDIR}"/ka$$ <<\! TEOF 1 end of file +TBLANK 0 blank TNL 0 newline TSEMI 0 ";" TBACKGND 0 "&" diff --git a/src/parser.c b/src/parser.c index 27611f0..71d61f3 100644 --- a/src/parser.c +++ b/src/parser.c @@ -36,7 +36,11 @@ #include <alloca.h> #endif +#include <limits.h> +#include <stdbool.h> #include <stdlib.h> +#include <wchar.h> +#include <wctype.h> #include "shell.h" #include "parser.h" @@ -801,6 +805,8 @@ xxreadtoken(void) setprompt(2); } for (;;) { /* until token or start of word found */ + int tok; + c = pgetc_eatbnl(); switch (c) { case ' ': case '\t': @@ -834,9 +840,10 @@ xxreadtoken(void) case ')': RETURN(TRP); } - break; + tok = readtoken1(c, BASESYNTAX, (char *)NULL, 0); + if (tok != TBLANK) + return tok; } - return readtoken1(c, BASESYNTAX, (char *)NULL, 0); #undef RETURN } @@ -876,7 +883,53 @@ static void synstack_pop(struct synstack **stack) *stack = (*stack)->next; } +static unsigned getmbc(int c, char *out, int mode) +{ + char *const start = out; + mbstate_t mbst = {}; + unsigned ml = 0; + size_t ml2; + wchar_t wc; + char *mbc; + if (likely(c >= 0)) + return 0; + + mbc = (mode & 3) < 2 ? out + 2 + (mode == 1) : out; + mbc[ml] = c; + while ((ml2 = mbrtowc(&wc, mbc + ml++, 1, &mbst)) == -2) { + if (ml >= MB_LEN_MAX) + break; + c = pgetc_eoa(); + if (c == PEOA || c == PEOF) + break; + mbc[ml] = c; + } + + if (ml2 == 1 && ml > 1) { + if (mode == 4 && iswblank(wc)) + return 1; + + if ((mode & 3) < 2) { + USTPUTC(CTLMBCHAR, out); + if (mode == 1) + USTPUTC(CTLESC, out); + USTPUTC(ml, out); + } + STADJUST(ml, out); + if ((mode & 3) < 2) { + USTPUTC(ml, out); + USTPUTC(CTLMBCHAR, out); + } + + return out - start; + } + + if (ml > 1) + pungetn(ml - 1); + + return 0; +} /* * If eofmark is NULL, read a word or a redirection symbol. If eofmark @@ -929,12 +982,29 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) } #endif CHECKEND(); /* set c to PEOF if at end of here document */ - for (;;) { /* until end of line or end of word */ - CHECKSTRSPACE(4, out); /* permit 4 calls to USTPUTC */ + /* Until end of line or end of word */ + for (;; c = pgetc_top(synstack)) { + int fieldsplitting; + unsigned ml; + + /* Permit max(MB_LEN_MAX, 23) calls to USTPUTC. */ + CHECKSTRSPACE((MB_LEN_MAX > 16 ? MB_LEN_MAX : 16) + 7, + out); + fieldsplitting = synstack->syntax == BASESYNTAX && + !synstack->varnest ? 4 : 0; + ml = getmbc(c, out, fieldsplitting); + if (ml == 1) { + if (out == stackblock()) + return TBLANK; + c = pgetc(); + break; + } + out += ml; + if (ml) + continue; switch(synstack->syntax[c]) { case CNL: /* '\n' */ - if (synstack->syntax == BASESYNTAX && - !synstack->varnest) + if (fieldsplitting) goto endword; /* exit outer loop */ USTPUTC(c, out); nlprompt(); @@ -956,26 +1026,33 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs) USTPUTC(CTLESC, out); USTPUTC('\\', out); pungetc(); - } else { - if ( - synstack->dblquote && - c != '\\' && c != '`' && - c != '$' && ( - c != '"' || - (eofmark != NULL && - !synstack->varnest) - ) && ( - c != '}' || - !synstack->varnest - ) - ) { - USTPUTC(CTLESC, out); - USTPUTC('\\', out); - } - USTPUTC(CTLESC, out); - USTPUTC(c, out); - quotef++; + break; } + + if ( + synstack->dblquote && + c != '\\' && c != '`' && + c != '$' && ( + c != '"' || + (eofmark != NULL && + !synstack->varnest) + ) && ( + c != '}' || + !synstack->varnest + ) + ) { + USTPUTC(CTLESC, out); + USTPUTC('\\', out); + } + quotef++; + + ml = getmbc(c, out, 1); + out += ml; + if (ml) + break; + + USTPUTC(CTLESC, out); + USTPUTC(c, out); break; case CSQUOTE: synstack->syntax = SQSYNTAX; @@ -1053,11 +1130,10 @@ toggledq: case CEOF: goto endword; /* exit outer loop */ default: - if (synstack->varnest == 0) + if (fieldsplitting) goto endword; /* exit outer loop */ USTPUTC(c, out); } - c = pgetc_top(synstack); } } endword: @@ -1384,6 +1460,7 @@ parsebackq: { size_t psavelen; size_t savelen; union node *n; + unsigned ml; char *pstr; char *str; @@ -1415,6 +1492,11 @@ parsebackq: { if (pc != '\\' && pc != '`' && pc != '$' && (!synstack->dblquote || pc != '"')) STPUTC('\\', pout); + CHECKSTRSPACE(MB_LEN_MAX, pout); + ml = getmbc(pc, pout, 2); + pout += ml; + if (ml) + continue; break; case PEOF: -- 2.39.2